<a href="https://colab.research.google.com/github/KatsuhitoArasaka/BabyLM-Tiny/blob/main/sampling_pipelines/sampling_subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sources:

https://www.themoviedb.org/settings/api -- TMDb API key page; the TMDb API is used to look up the IMDb ID of each movie

https://github.com/sagiede/SubtitleCF/tree/main?tab=readme-ov-file -- source of the Sublens-20M subtitle archive

In [None]:
!pip install tmdbsimple

import tmdbsimple as tmdb

# Replace with your own TMDB_API_KEY key
tmdb.API_KEY = "c8d81be5e9a484344fa47c71601ca9e1"

# Example call: get genres
genres = tmdb.Genres().movie_list()['genres']
print("✅ Available TMDb genres:")
for g in genres:
    print(f"{g['id']:>3} → {g['name']}")

In [None]:
from tqdm import tqdm

def get_movies_by_genre_popularity(genre_id, max_pages):
    """
    Fetch the most popular movies of one TMDb genre via the /discover endpoint.

    Result pages 1..max_pages are requested in order, sorted by descending
    popularity. Every listed movie is then looked up individually, because
    the IMDb ID is read from the per-movie details.

    Args:
        genre_id: TMDb genre ID, passed to the API as `with_genres`.
        max_pages: number of result pages to request, starting at page 1.

    Returns:
        list of dicts with keys "title", "year" and "imdb_id", at most one
        per IMDb ID. "year" is the first four characters of the release
        date, or "" when the movie has no release date. Movies without an
        IMDb ID or without a title are left out.

    Failures are handled best-effort: an error on a page or on a single
    movie is printed and that page/movie is skipped.
    """
    all_movies = []

    for page in tqdm(range(1, max_pages + 1)):
        try:
            response = tmdb.Discover().movie(
                with_genres=genre_id,
                sort_by='popularity.desc',
                page=page,
                language='en-US'
            )
            for movie in response['results']:
                tmdb_id = movie['id']
                try:
                    movie_data = tmdb.Movies(tmdb_id).info()
                    imdb_id = movie_data.get("imdb_id")
                    title = movie_data.get("title")
                    # `or ""` also covers a release_date that is present but
                    # None; slicing None would raise and drop the movie.
                    year = (movie_data.get("release_date") or "")[:4]
                    if imdb_id and title:
                        all_movies.append({
                            "title": title,
                            "year": year,
                            "imdb_id": imdb_id
                        })
                except Exception as e:
                    print(f"⚠️ Failed to fetch IMDb ID for TMDb ID {tmdb_id}: {e}")
        except Exception as e:
            print(f"⚠️ Error on page {page}: {e}")

    # Ensure uniqueness by imdb_id
    unique_movies = {m['imdb_id']: m for m in all_movies}.values()
    return list(unique_movies)


import random

def get_movies_by_genre_true_random(genre_id, sample_size=160, pages_to_scan=40, max_page=500):
    """
    Draw a random sample of movies of one TMDb genre.

    Distinct discover result pages are picked at random from 1..max_page,
    every movie listed on them is looked up for its IMDb ID, and a random
    subset of the distinct movies found is returned.

    Args:
        genre_id: TMDb genre ID, passed to the API as `with_genres`.
        sample_size: maximum number of movies to return.
        pages_to_scan: number of distinct result pages to request; capped
            at max_page.
        max_page: highest page number that may be requested. The default
            of 500 is the page limit of the TMDb discover endpoint;
            requests above it fail and only waste API calls.

    Returns:
        list of at most sample_size dicts with keys "title", "year" and
        "imdb_id", at most one per IMDb ID. "year" is the first four
        characters of the release date, or "" when there is none.

    Failures are handled best-effort: an error on a page or on a single
    movie is printed and that page/movie is skipped.
    """
    all_movies = []

    # random.sample raises ValueError when asked for more items than the
    # population holds, so the request is capped at the size of the pool.
    page_pool = range(1, max_page + 1)
    pages = random.sample(page_pool, max(0, min(pages_to_scan, len(page_pool))))

    for page in tqdm(pages):
        try:
            response = tmdb.Discover().movie(
                with_genres=genre_id,
                sort_by='popularity.desc',
                page=page,
                language='en-US'
            )
            for movie in response['results']:
                tmdb_id = movie['id']
                try:
                    movie_data = tmdb.Movies(tmdb_id).info()
                    imdb_id = movie_data.get("imdb_id")
                    title = movie_data.get("title")
                    # `or ""` also covers a release_date that is present but None.
                    year = (movie_data.get("release_date") or "")[:4]
                    if imdb_id and title:
                        all_movies.append({
                            "title": title,
                            "year": year,
                            "imdb_id": imdb_id
                        })
                # `Exception` rather than a bare except, so Ctrl-C / kernel
                # interrupt still stops the scan.
                except Exception as e:
                    print(f"⚠️ Failed to fetch IMDb ID for TMDb ID {tmdb_id}: {e}")
        except Exception as e:
            print(f"⚠️ Error on page {page}: {e}")

    # Ensure uniqueness by imdb_id
    unique_movies = list({m['imdb_id']: m for m in all_movies}.values())
    return random.sample(unique_movies, min(sample_size, len(unique_movies)))



# Target genres, keyed by display name, with their TMDb genre IDs.
GENRE_IDS = dict([
    ("Action", 28),
    ("Comedy", 35),
    ("Documentary", 99),
    ("History", 36),
    ("Romance", 10749),
    ("Science Fiction", 878),
])

# genre name -> list of movie dicts fetched for that genre
genre_to_movies = {}

for genre_name in GENRE_IDS:
    genre_id = GENRE_IDS[genre_name]
    print(f"\n🔍 Fetching movies for genre: {genre_name}")

    # Alternative source, popularity-ordered (~20 movies per page):
    # movies = get_movies_by_genre_popularity(genre_id, max_pages=50)
    movies = get_movies_by_genre_true_random(genre_id, sample_size=300, pages_to_scan=120)  # adjust
    genre_to_movies[genre_name] = movies

    # The leading newline leaves one blank line after the progress bar.
    print(f"\n✅ {len(movies)} movies found for {genre_name}")

In [None]:
# Preview: one header line per genre, then its first five movies.
for genre in genre_to_movies:
    movies = genre_to_movies[genre]
    print(f"Genre: {genre} — {len(movies)} movies")
    for movie in movies[:5]:
        # "N/A" stands in for a key that is absent from the movie dict.
        title, imdb_id = (movie.get(key, "N/A") for key in ("title", "imdb_id"))
        print(f"   • {title} — IMDb ID: {imdb_id}")

In [None]:
import json
import os

# Persist the fetched movie lists so later cells can reload them from disk.
os.makedirs("movie_lists", exist_ok=True)

with open("movie_lists/genre_to_movies.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII titles readable in the file.
    f.write(json.dumps(genre_to_movies, ensure_ascii=False, indent=2))

print("✅ Saved genre_to_movies.json")

In [None]:
import json

# Reload the saved movie lists (replaces any in-memory genre_to_movies).
with open("movie_lists/genre_to_movies.json", "r", encoding="utf-8") as f:
    genre_to_movies = json.loads(f.read())

# Preview: one header line per genre, then its first five movies.
for genre in genre_to_movies:
    movies = genre_to_movies[genre]
    print(f"\n🎬 Genre: {genre} — {len(movies)} movies")
    for movie in movies[:5]:
        title, imdb_id = movie.get("title"), movie.get("imdb_id")
        print(f"   • {title} — IMDb ID: {imdb_id}")

In [None]:
# STEP 1: Install gdown to download from Google Drive
!pip install -q gdown

# STEP 2: Download the subtitles archive (from Sublens-20M Dataset)
# Google Drive file ID from: https://drive.google.com/file/d/1Xmty1wID7RjZLBXIv09hUctq8OMmhOW5/view
file_id = "1Xmty1wID7RjZLBXIv09hUctq8OMmhOW5"
output = "Sublens_20M_subtitles.zip"

# Download the ZIP file
!gdown --id {file_id} -O {output}

# STEP 3: Extract the archive
!unzip -q Sublens_20M_subtitles.zip -d subtitles_cf

# STEP 4: Print full directory tree under 'subtitles_cf'
import os
for root, dirs, files in os.walk("subtitles_cf"):
    print(root)
    for fname in files[:5]:  # print up to 5 files per folder
        print("   └─", fname)

In [None]:
import os

# Example IMDb ID: check that a subtitle file can be located for one movie.

imdb_id_raw = "tt0281176"
# The folder name is built from the ID without its "tt" prefix.
imdb_id = imdb_id_raw.replace("tt", "")
folder_path = f"subtitles_cf/Sublens_20M/subtitles/{imdb_id}"

# Collect the .srt files in the folder. A missing folder is treated like an
# empty one so the check below reports "not found" instead of raising.
# Sorted because os.listdir returns entries in arbitrary order.
if os.path.isdir(folder_path):
    srt_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".srt"))
else:
    srt_files = []

# First .srt file, or None when there is none.
srt_path = os.path.join(folder_path, srt_files[0]) if srt_files else None

if srt_path is not None and os.path.exists(srt_path):
    print(f"✅ File exists: {srt_path}")
else:
    print("❌ File NOT found")

In [None]:
# pysrt is the .srt parser used by the extraction cell below (pysrt.open).
!pip install pysrt

In [None]:
import pysrt
from collections import defaultdict
import json

# Make sure the output and log folders exist.
for _folder in ("output", "logs"):
    os.makedirs(_folder, exist_ok=True)

# Word budgets per genre: train target, and a dev target of 20% of it.
TARGET_WORDS = 1_000_000
DEV_TARGET_WORDS = int(0.2 * TARGET_WORDS)

# Running word totals per genre (a genre not seen yet counts as 0).
word_counts, dev_word_counts = defaultdict(int), defaultdict(int)

# genre -> list of (title, imdb_id) entries for which no subtitles were usable
not_found = defaultdict(list)

# genre -> set of IMDb IDs already consumed, so that a later run skips them
used_imdb_ids = defaultdict(set)

# Restore the IDs recorded by a previous run, if its log is present.
_used_ids_log = "logs/used_imdb_ids.json"
if os.path.exists(_used_ids_log):
    with open(_used_ids_log, "r", encoding="utf-8") as f:
        raw_used = json.load(f)
    # JSON stores the IDs as lists; turn them back into sets.
    used_imdb_ids.update((genre, set(ids)) for genre, ids in raw_used.items())

def safe_genre_name(g):
    """Return the genre name with every space replaced by an underscore
    (e.g. Science Fiction → Science_Fiction), for use in file names."""
    return "_".join(g.split(" "))

def count_words_in_file(path):
    """Count the whitespace-separated words in a UTF-8 text file.

    Returns 0 when `path` does not exist, so a file that has not been
    written yet counts as empty.
    """
    if os.path.exists(path):
        total = 0
        with open(path, encoding="utf-8") as f:
            for line in f:
                total += len(line.split())
        return total
    return 0

# Per-genre output handles: genre -> open file, for the train and dev splits.
train_files, dev_files = {}, {}

for genre in genre_to_movies:
    safe_name = safe_genre_name(genre)
    train_path = f"output/{safe_name}.train"
    dev_path = f"output/{safe_name}_dev.train"

    # For each split: seed the counter with the words already on disk, then
    # open the file in append mode so earlier content is kept.
    for _path, _counters, _handles in (
        (train_path, word_counts, train_files),
        (dev_path, dev_word_counts, dev_files),
    ):
        _counters[genre] = count_words_in_file(_path)
        _handles[genre] = open(_path, "a", encoding="utf-8")

# Process: fill each genre's train file up to TARGET_WORDS, then its dev file
# up to DEV_TARGET_WORDS, from the first .srt file of every listed movie.

# Encodings are tried in this order. latin1 assigns a character to every
# byte, so it never raises UnicodeDecodeError and has to be the last resort:
# anything listed after it could never be reached.
encodings_to_try = ['utf-8', 'cp1252', 'latin1']

for genre, movies in genre_to_movies.items():
    print(f"\n📂 Genre: {genre}")

    for movie in tqdm(movies):
        # Both quotas already met: stop here so the remaining movies are
        # neither parsed for nothing nor recorded as used.
        if word_counts[genre] >= TARGET_WORDS and dev_word_counts[genre] >= DEV_TARGET_WORDS:
            break

        imdb_id_raw = movie["imdb_id"]        # e.g. tt0281176
        imdb_id = imdb_id_raw.replace("tt", "")  # e.g. 0281176
        # Skip if already processed
        if imdb_id_raw in used_imdb_ids[genre]:
            continue
        folder_path = f"subtitles_cf/Sublens_20M/subtitles/{imdb_id}"

        if not os.path.isdir(folder_path):
            not_found[genre].append((movie["title"], imdb_id_raw))
            continue

        # Sorted so the same file is picked on every run (os.listdir returns
        # entries in arbitrary order).
        srt_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".srt"))
        if not srt_files:
            not_found[genre].append((movie["title"], imdb_id_raw))
            continue

        srt_path = os.path.join(folder_path, srt_files[0])

        # Try multiple encodings for robustness
        subs = None

        for enc in encodings_to_try:
            try:
                subs = pysrt.open(srt_path, encoding=enc)
                break  # Success
            except UnicodeDecodeError:
                continue
            except Exception as e:
                print(f"⚠️ Error reading {imdb_id_raw} with encoding {enc}: {e}")
                break  # Skip on other errors

        if subs is None:
            not_found[genre].append((movie["title"], imdb_id_raw))
            continue

        # Extract non-empty subtitle lines. Each kept entry has at least one
        # non-whitespace character, hence at least one word.
        lines = [text for text in (sub.text.strip() for sub in subs) if text]

        for line in lines:
            num_words = len(line.split())

            # Train is filled first; dev only receives lines once train is full.
            if word_counts[genre] < TARGET_WORDS:
                train_files[genre].write(line + "\n")
                word_counts[genre] += num_words
            elif dev_word_counts[genre] < DEV_TARGET_WORDS:
                dev_files[genre].write(line + "\n")
                dev_word_counts[genre] += num_words

            if word_counts[genre] >= TARGET_WORDS and dev_word_counts[genre] >= DEV_TARGET_WORDS:
                break

        # Mark movie as used
        used_imdb_ids[genre].add(imdb_id_raw)

# Close every train and dev output handle.
for _handle in [*train_files.values(), *dev_files.values()]:
    _handle.close()

# One "<genre>_missing.txt" per genre that had unusable movies,
# one "title (imdb_id)" entry per line.
for genre, missing in not_found.items():
    safe_name = safe_genre_name(genre)
    with open(f"logs/{safe_name}_missing.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{title} ({imdb})\n" for title, imdb in missing)

# Save used_imdb_ids for next runs (sets become lists, since JSON has no set type).
_used_ids_as_lists = {}
for g, ids in used_imdb_ids.items():
    _used_ids_as_lists[g] = list(ids)
with open("logs/used_imdb_ids.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(_used_ids_as_lists, ensure_ascii=False, indent=2))

print("\n✅ Done. Logs saved. Word counts:")

# Final check: totals per genre, including words that were already on disk.
for genre in genre_to_movies:
    train_total, dev_total = word_counts[genre], dev_word_counts[genre]
    print(f"{genre}: train={train_total:,} words, dev={dev_total:,} words")

In [None]:
# Quick check if logs contain anything
import glob

# Report, in name order, how many lines each log text file holds.
_log_paths = glob.glob("logs/*.txt")
_log_paths.sort()

for path in _log_paths:
    with open(path, encoding="utf-8") as f:
        lines = list(f)
    print(f"{path}: {len(lines)} missing")

In [None]:
import glob

print("\n📊 Summary per genre (existing output files):")

# Recover the genre names from the output file names: drop the split suffix
# and turn underscores back into spaces.
existing_train_files = glob.glob("output/*.train")
genres_seen = set()

for path in existing_train_files:
    filename = os.path.basename(path)
    suffix = "_dev.train" if filename.endswith("_dev.train") else ".train"
    genre = filename.replace(suffix, "").replace("_", " ")
    genres_seen.add(genre)

for genre in sorted(genres_seen):
    safe_name = safe_genre_name(genre)
    train_words = count_words_in_file(f"output/{safe_name}.train")
    dev_words = count_words_in_file(f"output/{safe_name}_dev.train")

    # Number of lines in the genre's missing-subtitles log; "?" when no such log exists.
    missing_count = "?"
    missing_path = f"logs/{safe_name}_missing.txt"
    if os.path.exists(missing_path):
        with open(missing_path, encoding="utf-8") as f:
            missing_count = len(f.readlines())

    print(f"\n🎬 Genre: {genre}")
    print(f"  • Words in train:      {train_words:,}")
    print(f"  • Words in dev:        {dev_words:,}")
    print(f"  • Subtitles missing:   {missing_count}")

    # Complete only when both splits have reached their targets.
    if train_words >= TARGET_WORDS and dev_words >= DEV_TARGET_WORDS:
        print("  ✅ Dataset size is correct.")
    else:
        print("  ⚠️ Dataset incomplete! Consider adding more subtitle sources.")

In [None]:
TOKEN = "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"  # ← GitHub Personal Access Token (PAT)  ⚠️
# XXXXXXXXXXXXXXXXXXXXXXXXXXXX
!git clone https://{TOKEN}@github.com/KatsuhitoArasaka/BabyLM-Tiny.git  # your repository link  ⚠️
%cd /content/BabyLM-Tiny

# Copy generated dataset files into the correct directory
!cp /content/output/*.train datasets/open_subtitles/

# List the contents to confirm files are in place
!ls -lh datasets/open_subtitles

!git config --global user.email "nikitagorety@gmail.com"
!git config --global user.name "NikitaGoryetiy"

# Stage the new files for commit
!git add datasets/open_subtitles/*.train

# Commit the changes
!git commit -m "Add OpenSubtitles datasets: 1M train + 200K dev per genre"

# Push to GitHub using the token
!git push