# Importing relevant libraries

In [27]:
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import os
import pandas as pd
import ast
import requests
import time

1. Get your kaggle.json file:
- Go to https://www.kaggle.com/account, where `account` is your account name
- Scroll to the API section
- Click Create New API Token
- This downloads kaggle.json to your computer (usually in Downloads folder)


2. Put kaggle.json in the right folder
- Move the file to this folder:
    `C:\Users\<your-username>\.kaggle\` (on Linux/macOS: `~/.kaggle/`)

If the .kaggle folder doesn’t exist, create it manually

In [28]:
# === STEP 0: Setup Kaggle API and download dataset ZIP ===
# The client is created and authenticated up front, whether or not a download is needed.
api = KaggleApi()
api.authenticate()

# Names reused by the cells below.
dataset_zip = 'the-movies-dataset.zip'
output_folder = 'data'

# Fetch the archive into the working directory unless a copy is already there.
if os.path.exists(dataset_zip):
    print("Dataset ZIP already downloaded.")
else:
    print("Downloading dataset ZIP...")
    api.dataset_download_files('rounakbanik/the-movies-dataset', path='.', unzip=False)

Dataset ZIP already downloaded.


## Extracting relevant files
We will only use the ratings and movies files; together they contain everything we need to build a collaborative-filtering recommender.

In [29]:
# Make sure the target directory is present before anything is extracted into it
os.makedirs(output_folder, exist_ok=True)

# === STEP 1: Extract all necessary files (only if missing) ===
needed_files = ['ratings.csv', 'movies_metadata.csv']
# Snapshot of what is already in the data folder, taken once before extraction starts
existing_files = os.listdir(output_folder)

with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
    for file in needed_files:
        # Guard clause: nothing to do for members that were extracted earlier
        if file in existing_files:
            print(f"{file} already extracted.")
            continue
        print(f"Extracting {file}...")
        zip_ref.extract(file, path=output_folder)

print("Extraction complete.\n")

ratings.csv already extracted.
movies_metadata.csv already extracted.
Extraction complete.



## Loading datasets

In [30]:
# === STEP 2: Load datasets ===
print("Loading datasets...")
movies_path = os.path.join(output_folder, 'movies_metadata.csv')
ratings_path = os.path.join(output_folder, 'ratings.csv')
# low_memory=False makes pandas infer column dtypes from the whole file
# instead of chunk by chunk (see the pandas read_csv documentation).
movies = pd.read_csv(movies_path, low_memory=False)
ratings = pd.read_csv(ratings_path)
print("Datasets loaded.\n")

Loading datasets...
Datasets loaded.



In [31]:
# Print the column index of each raw frame, labelled with its source file
for label, frame in (("movies_metadata.csv", movies), ("ratings.csv", ratings)):
    print(f"{label} columns:\n", frame.columns)

movies_metadata.csv columns:
 Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')
ratings.csv columns:
 Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [32]:
# === STEP 3: Make copies to keep originals intact ===
# The cleaning steps work on these copies, so `movies` / `ratings` stay as loaded.
movies_clean, ratings_clean = movies.copy(), ratings.copy()


In [33]:
# === STEP 4: Define helper functions ===
def parse_json_column(json_str):
    """Parse a JSON-like string into a list of the ``name`` values it contains.

    The input is expected to be the string form of a list of dicts, e.g.
    ``"[{'id': 18, 'name': 'Drama'}]"``. It is evaluated with
    ``ast.literal_eval``, which only accepts Python literals.

    Args:
        json_str: The raw cell value. Non-string values (such as NaN) are
            accepted and treated as unparseable.

    Returns:
        list: The ``name`` of every dict entry that has one, in input order.
        An empty list is returned when the value cannot be parsed or does not
        evaluate to a list/tuple.
    """
    try:
        items = ast.literal_eval(json_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Everything literal_eval is documented to raise on malformed input.
        return []

    # A well-formed literal of the wrong shape (number, dict, string, ...) would
    # otherwise fail during iteration or indexing below.
    if not isinstance(items, (list, tuple)):
        return []

    # Skip entries that are not dicts or lack a 'name' key instead of raising.
    return [item['name'] for item in items if isinstance(item, dict) and 'name' in item]

def safe_int_conversion(val):
    """Convert ``val`` to ``int``, returning ``None`` when the conversion fails.

    Args:
        val: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        int | None: ``int(val)``, or ``None`` if ``int()`` raises
        ``ValueError`` (e.g. non-numeric text, NaN), ``TypeError``
        (e.g. ``None``) or ``OverflowError`` (e.g. infinity).
    """
    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are absorbed; KeyboardInterrupt/SystemExit
        # must still propagate so a long-running apply() can be interrupted.
        return None

# Cleaning the data
1. Filling NaN with 0s
2. Removing rows with missing info
3. Removing duplicates
4. Removing irrelevant columns like timestamp
5. Ensuring all columns are of the right type

In [34]:
# === STEP 5: Clean movies metadata ===
# This cell reassigns movies_clean / ratings_clean, so it is written to be safe
# to run more than once: a second run leaves already-cleaned data unchanged.
print("Cleaning movies metadata...")
# Convert budget and revenue to numeric; unparseable values become NaN and then 0
movies_clean['budget'] = pd.to_numeric(movies_clean['budget'], errors='coerce').fillna(0)
movies_clean['revenue'] = pd.to_numeric(movies_clean['revenue'], errors='coerce').fillna(0)

# Drop rows missing critical info
movies_clean = movies_clean.dropna(subset=['title', 'id'])

# Convert 'id' to numeric and drop rows whose id is not a number
movies_clean['id'] = pd.to_numeric(movies_clean['id'], errors='coerce')
movies_clean = movies_clean.dropna(subset=['id'])
movies_clean['id'] = movies_clean['id'].astype(int)

# Parse genres column (JSON string) into list of genre names.
# Values that are already lists (i.e. the cell was run before) are kept as they
# are; re-parsing a list would yield [] and turn every genre into 'Unknown'.
movies_clean['genres'] = movies_clean['genres'].apply(
    lambda value: value if isinstance(value, list) else parse_json_column(value)
)
# Replace empty genres lists with ['Unknown']
movies_clean['genres'] = movies_clean['genres'].apply(lambda x: x if x else ['Unknown'])
print("Movies metadata cleaned.\n")

# === STEP 6: Clean ratings data ===
print("Cleaning ratings data...")
# Drop duplicates (same userId, movieId)
ratings_clean = ratings_clean.drop_duplicates(subset=['userId', 'movieId'])
# Drop rows with missing essential columns
ratings_clean = ratings_clean.dropna(subset=['userId', 'movieId', 'rating'])
# Convert types properly
ratings_clean['userId'] = ratings_clean['userId'].astype(int)
ratings_clean['movieId'] = ratings_clean['movieId'].astype(int)
ratings_clean['rating'] = ratings_clean['rating'].astype(float)
# errors='ignore' so a re-run does not fail once 'timestamp' has been removed
ratings_clean = ratings_clean.drop(columns='timestamp', errors='ignore')

print("Ratings data cleaned.\n")


Cleaning movies metadata...
Movies metadata cleaned.

Cleaning ratings data...
Ratings data cleaned.



In [35]:
# === STEP 10: Summary info ===
# Report (rows, columns) of each cleaned frame
for label, frame in (("Movies", movies_clean), ("Ratings", ratings_clean)):
    print(f"{label} dataset shape: {frame.shape}")


Movies dataset shape: (45460, 24)
Ratings dataset shape: (26024289, 3)


# Saving the files

In [36]:
# === STEP 11: Save cleaned data to CSV for reuse ===
# Each cleaned frame is written next to the raw files, without the index column.
for frame, filename in (
    (movies_clean, 'movies_full_clean.csv'),
    (ratings_clean, 'ratings_clean.csv'),
):
    frame.to_csv(os.path.join(output_folder, filename), index=False)

print(f"Cleaned datasets saved in '{output_folder}' folder.")

Cleaned datasets saved in 'data' folder.


# Extracting correct posters url for frontend
We will use the TMDb API to get the correct poster URLs for each movie.

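This step sends one HTTP request per movie (with a 0.25 s pause between requests), so it takes a while. Running it on Kaggle or Colab is heavily recommended so it does not tie up your own machine; note that it is network-bound, so a GPU does not speed it up.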


In [45]:
# --- Settings for the TMDb poster lookup ---
MAX_MOVIES = 5000              # only the first N movies are enriched
CHECKPOINT_EVERY = 200         # write partial results every N rows
REQUEST_PAUSE_SECONDS = 0.25   # pause between consecutive requests
REQUEST_TIMEOUT_SECONDS = 10   # so a stalled connection cannot hang the loop

# Load only the columns that end up in the output file
kept_columns = ['id', 'poster_path', 'title']
df = pd.read_csv(
    os.path.join(output_folder, 'movies_full_clean.csv'),
    usecols=kept_columns,
)[kept_columns]
df = df[df['id'].notnull()].head(MAX_MOVIES).copy()
df['id'] = df['id'].astype(int)

# movieId is the row label of the loaded CSV plus one.
# NOTE(review): confirm this numbering matches the movieId used in ratings.csv.
df["movieId"] = df.index + 1
# From here on row labels equal row positions, so the positional loop index
# below can be used safely with label-based .at[] and with df.update().
df = df.reset_index(drop=True)

# The key is read from the environment so it is never stored in the notebook.
api_key = os.environ.get("TMDB_API_KEY", "")
base_url = "https://api.themoviedb.org/3/movie/{}?api_key={}"
poster_base_url = "https://image.tmdb.org/t/p/w500"

output_file = os.path.join(output_folder, 'movies_with_tmdb_data.csv')
df["poster_url"] = ""
df["fetched_title"] = ""
start_index = 0

# If partial file exists, resume
if os.path.exists(output_file):
    df_saved = pd.read_csv(output_file)

    # Carry over only the fetched results; NaN cells (rows not fetched yet)
    # are skipped by DataFrame.update and keep their "" default.
    result_columns = [c for c in ("poster_url", "fetched_title") if c in df_saved.columns]
    if result_columns:
        df.update(df_saved[result_columns])

    # Checkpoints contain every row, fetched or not, so the file length says
    # nothing about progress. Resume right after the last row that actually
    # has a fetched title.
    if "fetched_title" in df_saved.columns:
        fetched_positions = df_saved["fetched_title"].notna().to_numpy().nonzero()[0]
        if fetched_positions.size:
            start_index = min(int(fetched_positions[-1]) + 1, len(df))
    print(f"Resuming from index {start_index}...")

if start_index < len(df) and not api_key:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable to your TMDb API key before running this cell."
    )

# Loop through TMDb IDs
with requests.Session() as session:  # one session reuses the HTTP connection
    for i in range(start_index, len(df)):
        tmdb_id = int(df["id"].iat[i])
        try:
            response = session.get(
                base_url.format(tmdb_id, api_key),
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            if response.status_code == 200:
                data = response.json()
                poster_path = data.get("poster_path")
                df.at[i, "poster_url"] = poster_base_url + poster_path if poster_path else ""
                df.at[i, "fetched_title"] = data.get("title", "")
            else:
                print(f"Error for ID {tmdb_id}: {response.status_code}")
            if i % CHECKPOINT_EVERY == 0:
                df.to_csv(output_file, index=False)
                print(f"Checkpoint saved at row {i}")
        except Exception as e:
            # Best effort: report and move on to the next movie. The request URL
            # (which contains the key) can appear in the message, so redact it.
            print(f"Error processing ID {tmdb_id}: {str(e).replace(api_key, '***')}")
        # Pause after every attempt, including failed ones
        time.sleep(REQUEST_PAUSE_SECONDS)

df = df.rename(columns={"id": "tmdbId"})
# Final save
df.to_csv(output_file, index=False)
print("Done. Enriched metadata saved to:", output_file)


Resuming from index 5000...
Done. Enriched metadata saved to: data\movies_with_tmdb_data.csv
