<a href="https://colab.research.google.com/github/KatsuhitoArasaka/BabyLM-Tiny/blob/main/sampling_pipelines/sampling_subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tmdbsimple

import tmdbsimple as tmdb

# 👇 Put your personal TMDb API key here before running this cell
tmdb.API_KEY = "PASTE_YOUR_TMDB_API_KEY_HERE"

# Sanity check: request the movie genre list and show each ID next to its name
genre_listing = tmdb.Genres().movie_list()
genres = genre_listing['genres']
print("✅ Available TMDb genres:")
for g in genres:
    print(f"{g['id']:>3} → {g['name']}")

In [None]:
from tqdm import tqdm

def get_movies_by_genre_popularity(genre_id, max_pages):
    """
    Fetch the most popular movies of one TMDb genre via the /discover endpoint.

    Walks discover pages 1..max_pages (sorted by descending popularity) and
    looks up the details of every listed movie to obtain its IMDb ID.
    A failing page or a failing movie lookup is reported with a printed
    warning and skipped, so the function never raises for those errors.

    Args:
        genre_id: TMDb genre ID, passed as ``with_genres``.
        max_pages: Number of discover pages to walk, starting at page 1.

    Returns:
        list of dicts with the keys "title", "year" and "imdb_id", unique by
        "imdb_id". Movies lacking an IMDb ID or a title are left out.
        "year" is the first four characters of the release date, or "" when
        no release date is available.
    """
    all_movies = []

    for page in tqdm(range(1, max_pages + 1)):
        try:
            response = tmdb.Discover().movie(
                with_genres=genre_id,
                sort_by='popularity.desc',
                page=page,
                language='en-US'
            )
            for movie in response['results']:
                tmdb_id = movie['id']
                try:
                    movie_data = tmdb.Movies(tmdb_id).info()
                    imdb_id = movie_data.get("imdb_id")
                    title = movie_data.get("title")
                    # `or ""` also covers a release_date that is present but
                    # null; a plain .get() default only applies when the key
                    # is missing, and slicing None would drop the movie.
                    year = (movie_data.get("release_date") or "")[:4]
                    if imdb_id and title:
                        all_movies.append({
                            "title": title,
                            "year": year,
                            "imdb_id": imdb_id
                        })
                except Exception as e:
                    print(f"⚠️ Failed to fetch IMDb ID for TMDb ID {tmdb_id}: {e}")
        except Exception as e:
            print(f"⚠️ Error on page {page}: {e}")

    # Ensure uniqueness by imdb_id (a later duplicate replaces the earlier
    # entry but keeps its position in the list)
    unique_movies = {m['imdb_id']: m for m in all_movies}.values()
    return list(unique_movies)


import random

def get_movies_by_genre_true_random(genre_id, sample_size=160, pages_to_scan=40):
    """
    Sample movies of one TMDb genre from randomly chosen /discover pages.

    Picks ``pages_to_scan`` distinct page numbers at random from 1..500,
    fetches each page (sorted by descending popularity), looks up the
    details of every listed movie to obtain its IMDb ID, and finally draws
    a random sample from the de-duplicated pool.
    A failing page or a failing movie lookup is reported with a printed
    warning and skipped.

    Args:
        genre_id: TMDb genre ID, passed as ``with_genres``.
        sample_size: Maximum number of movies to return.
        pages_to_scan: Number of random pages to fetch; values above 500
            are capped at 500 because only pages 1..500 are drawn from.

    Returns:
        list of at most ``sample_size`` dicts with the keys "title", "year"
        and "imdb_id", unique by "imdb_id", in random order. Fewer are
        returned when the scanned pages yield fewer usable movies.
        "year" is the first four characters of the release date, or "" when
        no release date is available.
    """
    # Highest page number drawn from.
    # NOTE(review): presumably TMDb's documented page limit — confirm.
    max_page = 500
    # random.sample raises ValueError when asked for more pages than exist.
    page_count = min(pages_to_scan, max_page)

    all_movies = []

    for page in tqdm(random.sample(range(1, max_page + 1), page_count)):
        try:
            response = tmdb.Discover().movie(
                with_genres=genre_id,
                sort_by='popularity.desc',
                page=page,
                language='en-US'
            )
            for movie in response['results']:
                tmdb_id = movie['id']
                try:
                    movie_data = tmdb.Movies(tmdb_id).info()
                    imdb_id = movie_data.get("imdb_id")
                    title = movie_data.get("title")
                    # `or ""` also covers a release_date that is present but
                    # null; slicing None would drop the movie.
                    year = (movie_data.get("release_date") or "")[:4]
                    if imdb_id and title:
                        all_movies.append({
                            "title": title,
                            "year": year,
                            "imdb_id": imdb_id
                        })
                # Catch Exception rather than everything, so Ctrl-C /
                # kernel interrupt can still stop a long scan.
                except Exception as e:
                    print(f"⚠️ Failed to fetch IMDb ID for TMDb ID {tmdb_id}: {e}")
        except Exception as e:
            print(f"⚠️ Error on page {page}: {e}")

    # Ensure uniqueness by imdb_id
    unique_movies = list({m['imdb_id']: m for m in all_movies}.values())
    return random.sample(unique_movies, min(sample_size, len(unique_movies)))



# TMDb genre IDs, keyed by the display name of each genre we sample from
GENRE_IDS = {
    "Action": 28,
    "Comedy": 35,
    "Documentary": 99,
    "History": 36,
    "Romance": 10749,
    "Science Fiction": 878,
}

# genre name -> list of sampled movies (the target is roughly 160 per genre)
genre_to_movies = {}

for genre_name in GENRE_IDS:
    genre_id = GENRE_IDS[genre_name]
    print(f"\n🔍 Fetching movies for genre: {genre_name}")

    # Random-page sampling is the active strategy. To take the most popular
    # titles instead (~20 movies per page), swap in:
    # movies = get_movies_by_genre_popularity(genre_id, max_pages=8)
    movies = get_movies_by_genre_true_random(
        genre_id,
        sample_size=160,
        pages_to_scan=40,
    )
    genre_to_movies[genre_name] = movies

    print()
    print(f"✅ {len(movies)} movies found for {genre_name}")

In [None]:
# Report how many movies were collected for each genre
for genre in genre_to_movies:
    movies = genre_to_movies[genre]
    print(f"{genre}: {len(movies)}")

In [None]:
import json
import os

# Make sure the output folder exists before writing into it
os.makedirs("movie_lists", exist_ok=True)

# Write the mapping as pretty-printed JSON, keeping non-ASCII titles readable
with open("movie_lists/genre_to_movies.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(genre_to_movies, ensure_ascii=False, indent=2))

print("✅ Saved genre_to_movies.json")

In [None]:
import json

# Read the saved genre -> movies mapping back from disk
with open("movie_lists/genre_to_movies.json", "r", encoding="utf-8") as f:
    genre_to_movies = json.loads(f.read())

# Preview: show the first five entries stored under "Action"
action_preview = genre_to_movies["Action"][:5]
for movie in action_preview:
    print(movie)