In [None]:
# Run this cell first and upload your kaggle.json file
from google.colab import files
import os

print("Please upload your kaggle.json file:")
uploaded = files.upload()  # dict: uploaded file name -> file content (bytes)

# Do not assume the upload ended up on disk as "kaggle.json": the name may differ
# (e.g. "kaggle (1).json" after a re-upload -- TODO confirm Colab's renaming rule)
# and the upload may have been cancelled. Prefer an exact match, otherwise accept
# a single uploaded file, otherwise stop with a clear error.
if "kaggle.json" in uploaded:
    token_name = "kaggle.json"
elif len(uploaded) == 1:
    token_name = next(iter(uploaded))
else:
    raise FileNotFoundError(
        "Expected a kaggle.json upload, got: " + (", ".join(uploaded) or "nothing")
    )

# Write the token to ~/.kaggle/kaggle.json straight from the uploaded bytes
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
token_path = os.path.join(kaggle_dir, "kaggle.json")
with open(token_path, "wb") as token_file:
    token_file.write(uploaded[token_name])

# Owner read/write only (same permissions as the previous `chmod 600`)
os.chmod(token_path, 0o600)

print("Kaggle API token configured successfully.")

In [None]:
import pandas as pd
import zipfile
import glob
import os

# 1. Download the dataset
dataset_slug = "asaniczka/tmdb-movies-dataset-2023-930k-movies"
print(f"Downloading dataset: {dataset_slug}...")
!kaggle datasets download -d {dataset_slug}

# 2. Unzip the file
zip_name = dataset_slug.split('/')[-1] + ".zip"
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall("tmdb_data")

# 3. Find the CSV file (automatically detects the name)
csv_files = glob.glob("tmdb_data/*.csv")
if not csv_files:
    raise FileNotFoundError("No CSV file found in the downloaded dataset.")
csv_path = csv_files[0]
print(f"Reading file: {csv_path}")

# 4. Load data
# specifying low_memory=False to suppress mixed type warnings on large files
df = pd.read_csv(csv_path, low_memory=False)

# 5. Preprocessing & Filtering

# Filter thresholds (dates are inclusive on both ends; counts are strict lower bounds)
START_DATE = '2015-12-28'
END_DATE = '2025-12-28'
MIN_VOTES = 100
MIN_REVENUE = 50_000_000  # 50 Million

# Parse release dates; values that cannot be parsed become NaT and are dropped
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df = df.dropna(subset=['release_date'])

print(f"Filtering for release date between {START_DATE} and {END_DATE}...")
print(f"Filtering for vote_count > {MIN_VOTES}...")
print(f"Filtering for revenue > {MIN_REVENUE}...")

# One boolean mask per criterion, combined into a single row selector
released_after_start = df['release_date'] >= START_DATE
released_before_end = df['release_date'] <= END_DATE
has_enough_votes = df['vote_count'] > MIN_VOTES
has_enough_revenue = df['revenue'] > MIN_REVENUE
keep_row = released_after_start & released_before_end & has_enough_votes & has_enough_revenue

# Copy so later edits to filtered_df never touch df
filtered_df = df[keep_row].copy()

print(f"Movies remaining after filtering: {len(filtered_df)}")

# 6. Random Sampling
SAMPLE_SIZE = 200

matched_count = len(filtered_df)
if matched_count <= SAMPLE_SIZE:
    # Nothing to draw from beyond the matches themselves: keep every row
    sampled_df = filtered_df
    print(f"Warning: Only {matched_count} movies matched criteria. Returning all of them.")
else:
    # Fixed random_state so the same rows are drawn on every run
    sampled_df = filtered_df.sample(n=SAMPLE_SIZE, random_state=42)
    print(f"Successfully sampled {SAMPLE_SIZE} movies.")

# 7. Save to CSV (without the DataFrame index column)
output_filename = "filtered_tmdb_movies_sample.csv"
sampled_df.to_csv(output_filename, index=False)

print(f"\nDone! Saved to {output_filename}")

# Trigger download of the result
# (`files` is google.colab.files, imported in the first cell)
files.download(output_filename)

In [None]:
import requests
import pandas as pd
import numpy as np

# 1. Configuration
# City name -> {"lat", "lon"}; these are sent as the API's latitude/longitude below
cities = {
    name: {"lat": lat, "lon": lon}
    for name, lat, lon in (
        ("Tunis", 36.8065, 10.1815),
        ("Sousse", 35.8256, 10.6084),
        ("Sfax", 34.7406, 10.7603),
    )
}

# Date range requested from the API, as YYYY-MM-DD strings
START_DATE, END_DATE = "2023-12-20", "2025-12-20"

# 2. Helper Functions for Logic

def get_weather_state(code):
    """
    Maps WMO Weather Codes to General States.
    Source: Open-Meteo WMO docs

    Args:
        code: WMO weather interpretation code. Floats holding an integer
            value (e.g. 61.0) match the same way as ints.

    Returns:
        One of "Clear", "Cloudy", "Foggy", "Rainy", "Snow/Hail", "Stormy",
        or "Unknown" for any code not listed below (including None / NaN).
    """
    if code in (0, 1):
        return "Clear"
    elif code in (2, 3):
        return "Cloudy"
    elif code in (45, 48):
        return "Foggy"
    elif code in (
        51, 53, 55,   # drizzle
        56, 57,       # freezing drizzle (previously fell through to "Unknown")
        61, 63, 65,   # rain
        66, 67,       # freezing rain (previously fell through to "Unknown")
        80, 81, 82,   # rain showers
    ):
        return "Rainy"
    elif code in (71, 73, 75, 77, 85, 86):
        return "Snow/Hail" # Rare in Tunisia, but possible
    elif code in (95, 96, 99):
        return "Stormy"
    else:
        return "Unknown"

def get_temp_category(temp):
    """
    Categorizes temperature based on Tunisian averages.

    Args:
        temp: Temperature as a number (compared against 13, 25 and 32),
            or a missing value (None / NaN).

    Returns:
        "Cold" (< 13), "Mild" (13 to < 25), "Warm" (25 to < 32), "Hot" (>= 32),
        or "Unknown" when the temperature is missing.
    """
    # A NaN fails every comparison below and would otherwise land in the final
    # branch and be labelled "Hot"; None would raise a TypeError.
    if pd.isna(temp):
        return "Unknown"
    if temp < 13:
        return "Cold"       # Good for cinema (Indoor)
    elif temp < 25:
        return "Mild"       # Bad for cinema (People go outside)
    elif temp < 32:
        return "Warm"       # Neutral
    else:
        return "Hot"        # Good for cinema (AC seeking)

# 3. Main Loop to Fetch Data
all_data = []  # one DataFrame per city that was fetched successfully

# Open-Meteo Archive API (same endpoint for every city, so defined once)
url = "https://archive-api.open-meteo.com/v1/archive"

print(f"Fetching data from {START_DATE} to {END_DATE}...\n")

for city_name, coords in cities.items():
    print(f"Processing {city_name}...")

    params = {
        "latitude": coords["lat"],
        "longitude": coords["lon"],
        "start_date": START_DATE,
        "end_date": END_DATE,
        "daily": ["weather_code", "temperature_2m_max"],
        "timezone": "auto"
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        # Treat network failures like HTTP errors: report and move to the next city
        print(f"Error fetching {city_name}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Error fetching {city_name}: {response.status_code}")
        continue

    data = response.json()
    daily = data['daily']

    # Create a temporary dataframe for this city
    df_city = pd.DataFrame({
        'date': daily['time'],
        'max_temp': daily['temperature_2m_max'],
        'wmo_code': daily['weather_code']
    })

    df_city['city'] = city_name

    # Derive the categorical columns from the raw code and temperature
    df_city['weather_state'] = df_city['wmo_code'].apply(get_weather_state)
    df_city['temp_category'] = df_city['max_temp'].apply(get_temp_category)

    all_data.append(df_city)

# 4. Combine and Clean
if not all_data:
    # pd.concat on an empty list fails with an unhelpful message; say what happened
    raise ValueError("No weather data was fetched for any city; nothing to combine or save.")
final_df = pd.concat(all_data, ignore_index=True)

# Select only the specific columns you requested
output_df = final_df[['city', 'date', 'weather_state', 'temp_category']]

# 5. Review and Save
print("\nSample Data:")
# sample() raises when asked for more rows than exist, so cap at the frame size
print(output_df.sample(min(10, len(output_df))))

# Save to CSV
output_df.to_csv("tunisia_historical_weather.csv", index=False)
print("\nSaved to 'tunisia_historical_weather.csv'")