In [None]:
import os
import kagglehub
import pandas as pd
from pathlib import Path


os.environ["KAGGLEHUB_CACHE"] = f"{Path.cwd()}/raw"

path = kagglehub.dataset_download("tmdb/tmdb-movie-metadata")
print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/georgiosklonis/dev/github/cineops/backend/db/seeds/raw/datasets/tmdb/tmdb-movie-metadata/versions/2


In [2]:
movies = pd.read_csv(f"{path}/tmdb_5000_movies.csv")
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [7]:
import requests
from dotenv import load_dotenv, find_dotenv

def fetch_poster_url(TMDB_V3_API_KEY: str, POSTER_BASE_URL: str, movie_id: int) -> str:
    if not TMDB_V3_API_KEY:
        raise ValueError("TMDB_V3_API_KEY not found")
    
    resp = requests.get(
    f"https://api.themoviedb.org/3/movie/{movie_id}",
    params={"api_key": TMDB_V3_API_KEY, "language": "en-US"},
    timeout=15
    )

    data = resp.json()
    poster_path = data.get("poster_path")

    if not poster_path or len(poster_path) < 7 or poster_path is None:
        return "none"
    return f"{POSTER_BASE_URL}{poster_path}"


In [None]:
from tqdm.auto import tqdm

load_dotenv(find_dotenv())

TMDB_V3_API_KEY = os.getenv("TMDB_V3_API_KEY")
POSTER_BASE_URL = "https://image.tmdb.org/t/p/w500"

if TMDB_V3_API_KEY is None:
    raise ValueError("TMDB_V3_API_KEY not found")

poster_urls = []

for movie_id in tqdm(movies["id"], total=len(movies), desc="Fetching posters"):
    poster_urls.append(
        fetch_poster_url(
            TMDB_V3_API_KEY=TMDB_V3_API_KEY,
            POSTER_BASE_URL=POSTER_BASE_URL,
            movie_id=movie_id,
        )
    )

movies["poster_url"] = poster_urls

Fetching posters: 100%|██████████| 4803/4803 [11:33<00:00,  6.93it/s]


In [16]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [36]:
movies["status"].unique()

array(['Released', 'Post Production', 'Rumored'], dtype=object)

In [38]:
movies_processed = movies[movies["status"] == "Released"]
movies_processed["status"].unique()

array(['Released'], dtype=object)

In [43]:
columns_to_drop = ["revenue", 
                   "vote_count", 
                   "vote_average", 
                   "popularity", 
                   "budget", 
                   "homepage", 
                   "production_countries", 
                   "production_companies",
                   "tagline",
                   "spoken_languages",
                   "original_title",
                   "status",
                   "original_language",
                   "keywords"
                   ]
movies_processed = movies.drop(columns=columns_to_drop)
movies_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        4803 non-null   object 
 1   id            4803 non-null   int64  
 2   overview      4800 non-null   object 
 3   release_date  4802 non-null   object 
 4   runtime       4801 non-null   float64
 5   title         4803 non-null   object 
 6   poster_url    4803 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 262.8+ KB


In [45]:
movies_processed.rename(columns={"id": "tmdb_id"}, inplace=True)

In [None]:
import json

movies_processed["genres"] = movies["genres"].apply(json.loads)
movies_processed["genre_names"] = movies_processed["genres"].apply(lambda gs: sorted(g["name"] for g in gs))
movies_processed.drop(columns=["genres"], inplace=True)

movies_processed.head(1)

Unnamed: 0,tmdb_id,overview,release_date,runtime,title,poster_url,genre_names
0,19995,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,162.0,Avatar,https://image.tmdb.org/t/p/w500/gKY6q7SjCkAU6F...,"[Action, Adventure, Fantasy, Science Fiction]"


In [55]:
all_genres = sorted({name for names in movies_processed["genre_names"] for name in names})
all_genres

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [56]:
movies_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   tmdb_id       4803 non-null   int64  
 1   overview      4800 non-null   object 
 2   release_date  4802 non-null   object 
 3   runtime       4801 non-null   float64
 4   title         4803 non-null   object 
 5   poster_url    4803 non-null   object 
 6   genre_names   4803 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 262.8+ KB


In [None]:
genres_with_custom_ids = []

for i, genre in enumerate(all_genres):
    genres_with_custom_ids.append((i, genre))

genres_with_custom_ids

[(0, 'Action'),
 (1, 'Adventure'),
 (2, 'Animation'),
 (3, 'Comedy'),
 (4, 'Crime'),
 (5, 'Documentary'),
 (6, 'Drama'),
 (7, 'Family'),
 (8, 'Fantasy'),
 (9, 'Foreign'),
 (10, 'History'),
 (11, 'Horror'),
 (12, 'Music'),
 (13, 'Mystery'),
 (14, 'Romance'),
 (15, 'Science Fiction'),
 (16, 'TV Movie'),
 (17, 'Thriller'),
 (18, 'War'),
 (19, 'Western')]

In [73]:
genre_id_map = {name: gid for gid, name in genres_with_custom_ids}

movies_processed["genre_ids"] = movies_processed["genre_names"].apply(
    lambda names: [genre_id_map[name] for name in names]
)

movies_processed.drop(columns=["genre_names"], inplace=True)

movies_processed.head(1)

Unnamed: 0,tmdb_id,overview,release_date,runtime,title,poster_url,genre_ids
0,19995,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,162.0,Avatar,https://image.tmdb.org/t/p/w500/gKY6q7SjCkAU6F...,"[0, 1, 8, 15]"


In [None]:
output_path = Path.cwd() / "processed" / "tmdb_movies_clean.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
movies_processed.to_csv(output_path, index=False)

In [77]:
processed_dir = Path.cwd() / "processed"
processed_dir.mkdir(exist_ok=True)

genres_df = pd.DataFrame(genres_with_custom_ids, columns=["id", "name"])
genres_df.to_csv(processed_dir / "tmdb_genres_clean.csv", index=False)