<a href="https://colab.research.google.com/github/KatsuhitoArasaka/BabyLM-Tiny/blob/main/sampling_pipelines/sampling_subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tmdbsimple

import tmdbsimple as tmdb

# Replace with your own TMDB_API_KEY key
tmdb.API_KEY = "c8d81be5e9a484344fa47c71601ca9e1"

# Example call: get genres
genres = tmdb.Genres().movie_list()['genres']
print("✅ Available TMDb genres:")
for g in genres:
    print(f"{g['id']:>3} → {g['name']}")

In [None]:
from tqdm import tqdm

def get_movies_by_genre_popularity(genre_id, max_pages):
    """
    Collects the most popular movies of one TMDb genre.

    Requests pages 1..max_pages of the /discover endpoint (sorted by
    descending popularity) and then looks up every listed movie individually
    to obtain its IMDb ID. Failures are printed as warnings and skipped.

    Returns: list of dicts with title, year, and IMDb ID (one per IMDb ID)
    """
    collected = []

    for page in tqdm(range(1, max_pages + 1)):
        try:
            response = tmdb.Discover().movie(
                with_genres=genre_id,
                sort_by='popularity.desc',
                page=page,
                language='en-US'
            )
            for movie in response['results']:
                tmdb_id = movie['id']
                try:
                    # The discover listing is not used for the IMDb ID; it is
                    # read from the per-movie details call instead.
                    details = tmdb.Movies(tmdb_id).info()
                    imdb_id = details.get("imdb_id")
                    title = details.get("title")
                    year = details.get("release_date", "")[:4]
                except Exception as e:
                    print(f"⚠️ Failed to fetch IMDb ID for TMDb ID {tmdb_id}: {e}")
                else:
                    # Keep only movies that have both an IMDb ID and a title.
                    if not (imdb_id and title):
                        continue
                    collected.append({
                        "title": title,
                        "year": year,
                        "imdb_id": imdb_id
                    })
        except Exception as e:
            print(f"⚠️ Error on page {page}: {e}")

    # De-duplicate by IMDb ID: first occurrence fixes the position,
    # the last occurrence supplies the stored record.
    by_imdb_id = {}
    for entry in collected:
        by_imdb_id[entry['imdb_id']] = entry
    return list(by_imdb_id.values())


import random

def get_movies_by_genre_true_random(genre_id, sample_size=160, pages_to_scan=40):
    """
    Fetches a random sample of movies for a TMDb genre ID.

    Scans `pages_to_scan` distinct, randomly chosen result pages (page numbers
    1-500) of the /discover endpoint, looks up each listed movie to obtain its
    IMDb ID, de-duplicates by IMDb ID and returns a random subset.

    Args:
        genre_id: TMDb genre ID passed to /discover as `with_genres`.
        sample_size: maximum number of movies to return.
        pages_to_scan: number of random pages to request. Values above 500
            are capped at 500, since only pages 1-500 are sampled.

    Returns: list of dicts with title, year, and IMDb ID; at most
        `sample_size` entries, fewer if fewer unique movies were found.
    """
    # NOTE(review): 500 is presumably the highest page /discover will serve —
    # confirm against the TMDb API documentation.
    max_page = 500
    all_movies = []

    # Cap the request so random.sample() cannot fail with
    # "sample larger than population" for pages_to_scan > 500.
    pages = random.sample(range(1, max_page + 1), min(pages_to_scan, max_page))

    for page in tqdm(pages):
        try:
            response = tmdb.Discover().movie(
                with_genres=genre_id,
                sort_by='popularity.desc',
                page=page,
                language='en-US'
            )
            for movie in response['results']:
                tmdb_id = movie['id']
                try:
                    movie_data = tmdb.Movies(tmdb_id).info()
                    imdb_id = movie_data.get("imdb_id")
                    title = movie_data.get("title")
                    # `or ""` also covers a release_date that is present but None
                    year = (movie_data.get("release_date") or "")[:4]
                    if imdb_id and title:
                        all_movies.append({
                            "title": title,
                            "year": year,
                            "imdb_id": imdb_id
                        })
                except Exception as e:
                    # Best effort: report and skip this movie. Catching
                    # Exception (not a bare except) keeps Ctrl-C / kernel
                    # interrupt working during the long scan.
                    print(f"⚠️ Failed to fetch IMDb ID for TMDb ID {tmdb_id}: {e}")
        except Exception as e:
            # Best effort: report and skip this page.
            print(f"⚠️ Error on page {page}: {e}")

    # Ensure uniqueness by imdb_id
    unique_movies = list({m['imdb_id']: m for m in all_movies}.values())
    return random.sample(unique_movies, min(sample_size, len(unique_movies)))



# TMDb genre IDs of the genres we want to sample, keyed by display name
GENRE_IDS = {
    "Action": 28,
    "Comedy": 35,
    "Documentary": 99,
    "History": 36,
    "Romance": 10749,
    "Science Fiction": 878,
}

# genre name -> list of movie dicts (aiming for ~160 movies per genre)
genre_to_movies = {}

for genre_name in GENRE_IDS:
    genre_id = GENRE_IDS[genre_name]
    print(f"\n🔍 Fetching movies for genre: {genre_name}")

    # Alternative: take the most popular titles instead of a random sample
    # movies = get_movies_by_genre_popularity(genre_id, max_pages=8)  # ~20 movies per page
    movies = get_movies_by_genre_true_random(genre_id, sample_size=160, pages_to_scan=40)

    genre_to_movies[genre_name] = movies
    # Leading newline gives the blank separator line before the summary
    print(f"\n✅ {len(movies)} movies found for {genre_name}")

In [None]:
# Sanity check: per genre, print the movie count and the first five entries.
for genre in genre_to_movies:
    movies = genre_to_movies[genre]
    print(f"Genre: {genre} — {len(movies)} movies")
    for movie in movies[:5]:
        # "N/A" is shown only when the key is absent from the record
        imdb_id = movie.get("imdb_id", "N/A")
        title = movie.get("title", "N/A")
        print(f"   • {title} — IMDb ID: {imdb_id}")

In [None]:
import json
import os

# Persist the collected movie lists so later cells can reload them from disk.
os.makedirs("movie_lists", exist_ok=True)

with open("movie_lists/genre_to_movies.json", mode="w", encoding="utf-8") as out:
    # ensure_ascii=False keeps non-ASCII titles readable in the file
    json.dump(genre_to_movies, out, indent=2, ensure_ascii=False)

print("✅ Saved genre_to_movies.json")

In [None]:
import json

# Reload the saved mapping from disk; this replaces any in-memory copy.
with open("movie_lists/genre_to_movies.json", "r", encoding="utf-8") as f:
    genre_to_movies = json.loads(f.read())

# Per genre, print the movie count and the first five entries.
for genre in genre_to_movies:
    movies = genre_to_movies[genre]
    print(f"\n🎬 Genre: {genre} — {len(movies)} movies")
    for movie in movies[:5]:
        imdb_id = movie.get("imdb_id")
        title = movie.get("title")
        print(f"   • {title} — IMDb ID: {imdb_id}")

In [None]:
# STEP 1: Install gdown to download from Google Drive
!pip install -q gdown

# STEP 2: Download the subtitles archive (from Sublens-20M Dataset)
# Google Drive file ID from: https://drive.google.com/file/d/1Xmty1wID7RjZLBXIv09hUctq8OMmhOW5/view
file_id = "1Xmty1wID7RjZLBXIv09hUctq8OMmhOW5"
output = "Sublens_20M_subtitles.zip"

# Download the ZIP file
!gdown --id {file_id} -O {output}

# STEP 3: Extract the archive
!unzip -q Sublens_20M_subtitles.zip -d subtitles_cf

# STEP 4: Print full directory tree under 'subtitles_cf'
import os
for root, dirs, files in os.walk("subtitles_cf"):
    print(root)
    for fname in files[:5]:  # print up to 5 files per folder
        print("   └─", fname)


# STEP 5: Check sample files
# sample_files = os.listdir("subtitles_cf/subtitles")[:5]
# print("📂 Sample subtitle files:", sample_files)

In [None]:
import pysrt
from collections import defaultdict

# Make sure the destination folders exist before anything is written
for folder in ("output", "logs"):
    os.makedirs(folder, exist_ok=True)

# Per-genre word budgets: the training quota, and a dev quota of 20% of it
TARGET_WORDS = 1_000_000
DEV_TARGET_WORDS = int(0.2 * TARGET_WORDS)

# Running per-genre word totals (an unseen genre starts at 0)
word_counts, dev_word_counts = defaultdict(int), defaultdict(int)

# genre -> list of movies for which no usable subtitle was found
not_found = defaultdict(list)

def safe_genre_name(g):
    """Return genre name `g` with every space replaced by an underscore.

    Example: "Science Fiction" -> "Science_Fiction".
    """
    return "_".join(g.split(" "))

# Output file handles, one train and one dev file per genre.
train_files = {}
dev_files = {}

# Everything from the first open() onward runs inside try/finally so that
# every handle opened so far is closed (and its buffered text flushed) even
# if opening a later file or processing a subtitle raises.
try:
    for genre in genre_to_movies:
        safe_name = safe_genre_name(genre)
        train_files[genre] = open(f"output/{safe_name}.train", "w", encoding="utf-8")
        dev_files[genre] = open(f"output/{safe_name}_dev.train", "w", encoding="utf-8")

    # Process each genre
    for genre, movies in genre_to_movies.items():
        print(f"\n📂 Genre: {genre}")

        for movie in tqdm(movies):
            imdb_id = movie["imdb_id"]
            srt_path = f"subtitles_cf/subtitles/{imdb_id}.srt"

            # Missing files are recorded for every movie, quota or not, so
            # the per-genre missing log stays complete.
            if not os.path.exists(srt_path):
                not_found[genre].append((movie["title"], imdb_id))
                continue

            # Both quotas already full: nothing from this movie would be
            # written, so skip the (comparatively expensive) subtitle parse.
            # Consequence: unreadable files are only detected and logged
            # while a quota is still open.
            if word_counts[genre] >= TARGET_WORDS and dev_word_counts[genre] >= DEV_TARGET_WORDS:
                continue

            try:
                subs = pysrt.open(srt_path)
                stripped = (sub.text.strip() for sub in subs)
                lines = [text for text in stripped if text]
            except Exception as e:
                # Unreadable subtitles are logged together with missing ones
                print(f"⚠️ Error reading {imdb_id}: {e}")
                not_found[genre].append((movie["title"], imdb_id))
                continue

            for line in lines:
                words = line.split()
                num_words = len(words)
                if num_words == 0:
                    continue

                # Fill the train quota first; only then does text go to dev.
                # A quota may be overshot by at most one subtitle line.
                if word_counts[genre] < TARGET_WORDS:
                    train_files[genre].write(line + "\n")
                    word_counts[genre] += num_words
                elif dev_word_counts[genre] < DEV_TARGET_WORDS:
                    dev_files[genre].write(line + "\n")
                    dev_word_counts[genre] += num_words

                if word_counts[genre] >= TARGET_WORDS and dev_word_counts[genre] >= DEV_TARGET_WORDS:
                    break
finally:
    # Close all files
    for f in train_files.values():
        f.close()
    for f in dev_files.values():
        f.close()

# Write one log file per genre listing the movies recorded in not_found,
# one "title (imdb_id)" entry per line.
for genre, missing in not_found.items():
    log_path = f"logs/{safe_genre_name(genre)}_missing.txt"
    with open(log_path, "w", encoding="utf-8") as f:
        f.writelines(f"{title} ({imdb})\n" for title, imdb in missing)

print("\n✅ Done. Missing titles saved in logs/*.txt")