## Script de collecte TMDB – Jeux de données par décennies

Ce script permet de récupérer automatiquement, via l’API TMDB, des films année par année, puis de les enrichir avec des informations détaillées (cast, crew, keywords, etc.), et de sauvegarder le tout dans Google Drive, organisé par décennies.

Il sert de base à la constitution du dataset nécessaire au projet de recommandation de contenus culturels pour l’apprentissage des langues.



### Objectif du script

- Récupérer un **grand volume de films** sur une période donnée (`start_year` → `end_year`).
- Gérer les **limitations de TMDB** (pagination, limite à 500 pages, popularité, etc.).
- Compléter chaque film avec :
  - casting principal,
  - réalisateurs,
  - scénaristes,
  - genres,
  - pays et sociétés de production,
  - langues parlées,
  - mots-clés,
  - notes et popularité.
- Sauvegarder les résultats :
  - par tranche de **10 ans** (`years_per_file`),
  - dans des fichiers CSV séparés :
    - un fichier `base` (résultats bruts de discover),
    - un fichier `full` (avec tous les détails).

### Prérequis

- Environnement :
  - Le script est prévu pour **Google Colab** avec montage Google Drive.
- Un compte TMDB + une **clé API valide**.

### Organisation des fichiers

Les fichiers sont enregistrés dans :

```text
/content/drive/MyDrive/tmdb_data_decennies_full_details
```


### 1. Librairies

In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import os
from google.colab import drive

### 2. Récupération dataset

### 2.1. Connexion à Google Drive

In [None]:
# === 1. Connect to Google Drive ===
# Mount Google Drive at /content/drive; the save folder defined below lives
# under this mount point.
drive.mount('/content/drive')

# === 2. Create visible folder in Drive ===
def ensure_drive_folder(path, note="TMDb full dataset folder created automatically by Colab"):
    """Make sure ``path`` exists and holds a ``README.txt`` marker file.

    The directory (and any missing parents) is created if needed. When no
    ``README.txt`` is present yet, one is written with ``note`` as its content;
    an existing ``README.txt`` is left untouched.

    Args:
        path: Directory to create.
        note: Text written into ``README.txt`` the first time only.
    """
    os.makedirs(path, exist_ok=True)
    marker = os.path.join(path, "README.txt")

    # Guard clause: a marker that is already there means nothing is written.
    if os.path.exists(marker):
        print(f"‚úÖ Folder already exists: {path}")
        return

    with open(marker, "w", encoding="utf-8") as handle:
        handle.write(note)
    print(f"üìÅ Folder created and visible in Drive: {path}")

# Folder under the Drive mount point where the base and full CSV files are written.
save_dir = "/content/drive/MyDrive/tmdb_data_decennies_full_details"
# Create the folder if needed (a README.txt marker is written on first creation).
ensure_drive_folder(save_dir)

### 2.2. Requ√™te API

In [None]:
# === 3. Configuration ===
# NOTE(security): a TMDB API key is hard-coded below. A key stored in a notebook
# or repository should be treated as leaked: rotate it and provide the new one
# through the TMDB_API_KEY environment variable. The literal is kept only as a
# fallback so existing runs keep working unchanged.
api_key = os.environ.get("TMDB_API_KEY") or "6fb1e958076fb0c90fdc8286a488f89f"
base_url = "https://api.themoviedb.org/3/discover/movie"  # discover endpoint used for the base listing
start_year = 1950  # first release year collected (inclusive)
end_year = 2025  # last release year collected (inclusive)
years_per_file = 10  # number of years grouped into one pair of CSV files
language = "en-US"  # language parameter sent with every TMDB request

# === 4. Basic discover function ===
def get_movies_by_period(start_date, end_date):
    """Collect every discover result released between two dates.

    Pages through the TMDB discover endpoint (``base_url``), sorted by
    descending popularity, and accumulates the raw result dictionaries.

    Args:
        start_date: Lower bound sent as ``primary_release_date.gte``
            (``YYYY-MM-DD``).
        end_date: Upper bound sent as ``primary_release_date.lte``
            (``YYYY-MM-DD``).

    Returns:
        A ``pandas.DataFrame`` with one row per result; empty when the period
        yields no results.

    Raises:
        RuntimeError: If the same page fails ``max_attempts`` times in a row.
            Without this bound a permanent error (invalid key, rejected page
            number, ...) would retry forever.
    """
    max_pages = 500  # never request more than 500 pages for one query
    max_attempts = 5  # consecutive failures tolerated for a single page
    retry_delay = 5  # seconds to wait before retrying a failed page
    pages_per_pause = 40  # pause periodically to stay under the rate limit
    pause_seconds = 10

    all_movies = []
    page = 1
    total_pages = 1
    failed_attempts = 0

    while page <= total_pages:
        params = {
            "api_key": api_key,
            "language": language,
            "primary_release_date.gte": start_date,
            "primary_release_date.lte": end_date,
            "sort_by": "popularity.desc",
            "page": page,
        }
        try:
            r = requests.get(base_url, params=params, timeout=30)
            status = r.status_code
        except requests.RequestException as exc:
            # Network-level failure: treat it like a failed page and retry.
            r = None
            status = type(exc).__name__

        if status != 200:
            failed_attempts += 1
            print(f"‚ö†Ô∏è Error {status} between {start_date} and {end_date} (page {page})")
            if failed_attempts >= max_attempts:
                raise RuntimeError(
                    f"TMDB discover failed {failed_attempts} times in a row "
                    f"({status}) between {start_date} and {end_date} (page {page})"
                )
            time.sleep(retry_delay)
            continue

        failed_attempts = 0
        data = r.json()
        total_pages = min(data.get("total_pages", 0), max_pages)
        all_movies.extend(data.get("results", []))
        page += 1

        # Only pause when another request is actually going to follow.
        if page % pages_per_pause == 0 and page <= total_pages:
            time.sleep(pause_seconds)
    return pd.DataFrame(all_movies)

# === 5. Full details (cast, crew, etc.) ===
def get_movie_details(movie_id):
    """Fetch one movie from TMDB and flatten it into a single record.

    The movie endpoint is queried together with its ``credits`` and
    ``keywords`` sub-resources; nested objects are reduced to lists of names.

    Args:
        movie_id: TMDB identifier of the movie.

    Returns:
        A dict with the scalar movie fields plus the lists ``genres``,
        ``production_companies``, ``production_countries``,
        ``spoken_languages``, ``cast`` (first five entries), ``directors``,
        ``writers`` and ``keywords``. ``None`` when the request fails or the
        response status is not 200.
    """
    # Only credits and keywords are read below, so only those are appended;
    # requesting recommendations and reviews would just enlarge every response.
    url = (
        f"https://api.themoviedb.org/3/movie/{movie_id}"
        f"?api_key={api_key}&language={language}"
        "&append_to_response=credits,keywords"
    )
    try:
        r = requests.get(url, timeout=30)
    except requests.RequestException:
        # Same contract as a non-200 answer: the caller skips this movie.
        return None
    if r.status_code != 200:
        return None

    data = r.json()

    def names(items, key="name"):
        """Return item[key] for each item that has a non-empty value for key."""
        return [item[key] for item in (items or []) if item.get(key)]

    # "or {}" / "or []" also cover keys that are present but set to null.
    credits = data.get("credits") or {}
    crew = credits.get("crew") or []

    # Extract main cast & crew
    cast = names((credits.get("cast") or [])[:5])
    directors = names([member for member in crew if member.get("job") == "Director"])
    writers = names([member for member in crew if member.get("department") == "Writing"])

    # Keywords
    keywords = names((data.get("keywords") or {}).get("keywords"))

    return {
        "id": data.get("id"),
        "title": data.get("title"),
        "original_title": data.get("original_title"),
        "overview": data.get("overview"),
        "release_date": data.get("release_date"),
        "runtime": data.get("runtime"),
        "budget": data.get("budget"),
        "revenue": data.get("revenue"),
        "genres": names(data.get("genres")),
        "production_companies": names(data.get("production_companies")),
        "production_countries": names(data.get("production_countries")),
        "spoken_languages": names(data.get("spoken_languages"), key="english_name"),
        "popularity": data.get("popularity"),
        "vote_average": data.get("vote_average"),
        "vote_count": data.get("vote_count"),
        "cast": cast,
        "directors": directors,
        "writers": writers,
        "keywords": keywords,
    }

# === 6. Download one year (with fallback to months if needed) ===
def get_movies_by_year(year):
    """Download the discover results for one calendar year.

    The whole year is requested first. When that query returns 9500 rows or
    more (close to what the capped pagination can deliver), the year is
    downloaded again month by month and the monthly results are concatenated.

    Args:
        year: Calendar year to download.

    Returns:
        A ``pandas.DataFrame`` of discover results with at most one row per
        movie ``id``.
    """
    import calendar  # local import: only needed for the monthly fallback

    print(f"üé¨ Downloading movies from {year} ...")
    df_year = get_movies_by_period(f"{year}-01-01", f"{year}-12-31")

    if len(df_year) >= 9500:
        print(f"‚ö†Ô∏è {year}: limit reached ({len(df_year)} movies). Splitting by month ...")
        dfs_months = []
        for month in range(1, 13):
            # Both bounds are inclusive, so each window must end on the last
            # day of its own month. Ending on the 1st of the next month would
            # fetch those releases twice and leak January 1st of year + 1.
            last_day = calendar.monthrange(year, month)[1]
            start = f"{year}-{month:02d}-01"
            end = f"{year}-{month:02d}-{last_day:02d}"
            dfs_months.append(get_movies_by_period(start, end))
        df_year = pd.concat(dfs_months, ignore_index=True)

    # Keep one row per movie: paging through a popularity-sorted listing can
    # return the same movie on two pages when the ranking shifts mid-download.
    if "id" in df_year.columns:
        df_year = df_year.drop_duplicates(subset="id").reset_index(drop=True)

    print(f"‚úÖ {len(df_year)} movies collected for {year}")
    return df_year

# === 7. Enrich each movie with full details ===
def enrich_with_details(df, decade_path):
    """Fetch full details for every movie in ``df`` and append them to a CSV.

    The function is resumable: ids already present in ``decade_path`` are
    skipped, and new records are appended in batches so an interrupted run
    loses at most one batch.

    Args:
        df: DataFrame with an ``id`` column listing the movies to enrich.
        decade_path: Path of the CSV file that receives the detailed records.
    """
    batch_size = 500  # records buffered before each write to disk
    requests_per_pause = 40  # pause periodically to stay under the rate limit
    pause_seconds = 10

    existing_ids = set()
    if os.path.exists(decade_path):
        # Only the id column is needed to know what has already been saved.
        existing_ids = set(pd.read_csv(decade_path, usecols=["id"])["id"])
        print(f"‚ôªÔ∏è {len(existing_ids)} movies already saved, resuming...")

    def append_rows(rows):
        """Append rows to the CSV, writing the header only for a new file."""
        pd.DataFrame(rows).to_csv(
            decade_path,
            mode="a",
            index=False,
            header=not os.path.exists(decade_path),
        )

    details_list = []
    request_count = 0

    for movie_id in tqdm(df["id"], desc="Fetching details"):
        if movie_id in existing_ids:
            continue
        details = get_movie_details(movie_id)
        request_count += 1
        if details:
            details_list.append(details)
            # Remember the id so a duplicate row in df is not fetched twice.
            existing_ids.add(movie_id)

        # Rate limit: count requests actually sent. Counting buffered records
        # would pause after every failed fetch while the buffer is empty.
        if request_count % requests_per_pause == 0:
            time.sleep(pause_seconds)

        # Save every 500 movies. The buffer must be non-empty: writing an
        # empty DataFrame creates the file without the column header, and
        # later batches would then be appended header-less.
        if len(details_list) >= batch_size:
            append_rows(details_list)
            details_list = []

    # Final save of whatever is left in the buffer
    if details_list:
        append_rows(details_list)

# === 8. Download each decade ===
for decade_start in tqdm(range(start_year, end_year + 1, years_per_file)):
    # The last slice may be shorter than years_per_file: clamp it to end_year.
    decade_end = min(decade_start + years_per_file - 1, end_year)
    base_file = f"{save_dir}/films_{decade_start}_{decade_end}_base.csv"
    full_file = f"{save_dir}/films_{decade_start}_{decade_end}_full.csv"

    # Step 1: build the base file, unless a previous run already produced it.
    if os.path.exists(base_file):
        print(f"‚è© Base file already exists for {decade_start}-{decade_end}")
    else:
        print(f"\nüìÖ Downloading base movies for {decade_start}-{decade_end} ...")
        dfs = []
        for year in range(decade_start, decade_end + 1):
            df_y = get_movies_by_year(year)
            dfs.append(df_y)

            # Per-year confirmation, then a short pause before the next year.
            print(f"üìÖ ‚úÖ YEAR {year} DONE ‚Äî {len(df_y)} movies collected so far.\n")
            time.sleep(2)

        df_decade = pd.concat(dfs, ignore_index=True)
        df_decade.to_csv(base_file, index=False, encoding="utf-8")
        print(f"üíæ {len(df_decade)} base movies saved in {base_file}")

    # Step 2: enrich with details. The base file is always re-read from disk,
    # whether it was just written or comes from an earlier run.
    df_base = pd.read_csv(base_file)
    print(f"üîç Enriching {len(df_base)} movies with full details ...")
    enrich_with_details(df_base, full_file)
    print(f"‚úÖ Full dataset saved: {full_file}\n")

print("\nüéâ COMPLETE DOWNLOAD FINISHED! All full datasets are in your Drive (tmdb_data_decennies_full_details)")
