# 🎬 IMDb US‑Only Movie Dataset Cleaning Pipeline (2000–2022)

This notebook:
1. Loads IMDb datasets from official URLs  
2. Cleans and filters for **US‑only movies (2000–2022)**  
3. Removes documentaries and incomplete records  
4. Saves the cleaned data as gzip-compressed CSV files in the `Data/` folder (the cells shown here do not write Excel spreadsheets)


In [2]:
#!/usr/bin/env python
# coding: utf-8

import csv
import os

import numpy as np
import pandas as pd

# Remove the column cap so every column is shown when a frame is inspected
pd.options.display.max_columns = None

In [4]:
# === 1. LOAD RAW DATA ===

def read_imdb_tsv(url):
    """Read one tab-separated IMDb dump into a DataFrame, keeping text verbatim.

    Parameters
    ----------
    url : str
        Location of a (gzip-compressed) TSV file; compression is inferred by
        pandas from the ``.gz`` extension.

    Returns
    -------
    pandas.DataFrame
        The parsed table. The ``'\\N'`` placeholder is left as a plain string
        here so that column dtypes stay the same as before; it is converted to
        NaN in the next step of the notebook.

    Notes
    -----
    * ``quoting=csv.QUOTE_NONE``: the files are plain TSV without CSV-style
      quoting, so a double quote inside a title is ordinary text. With the
      default quoting a field that starts with ``"`` is read as a quoted field
      and can swallow the following tabs/newlines, merging several rows.
    * ``keep_default_na=False`` with ``na_values=['']``: only truly empty
      fields become NaN. With pandas' default NA list, literal values such as
      ``NA``, ``None``, ``null`` or ``NaN`` would be turned into NaN too.
    """
    return pd.read_csv(
        url,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        na_values=[''],
        keep_default_na=False,
        low_memory=False,
    )


# Title Basics: type, name, year, runtime, genre
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = read_imdb_tsv(basics_url)

# Ratings: averageRating and numVotes
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = read_imdb_tsv(ratings_url)

# AKAs: alternative titles and release regions
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = read_imdb_tsv(akas_url)


In [7]:
# === 2. REPLACE '\N' WITH NaN ACROSS ALL DATASETS ===
# Every cell holding the literal string '\N' is swapped for NaN; each frame is
# modified in place, in the order akas -> basics -> ratings.
for frame in (akas, basics, ratings):
    frame.replace({'\\N': np.nan}, inplace=True)


In [8]:
# === 3. FILTER AKAs FOR US TITLES ONLY ===
# Keep the rows whose region is exactly 'US' (rows with a missing region
# compare unequal and are dropped), then renumber the index from 0.
akas = (
    akas
    .loc[akas['region'].eq('US')]
    .reset_index(drop=True)
)


In [13]:
# === 4. CLEAN & FILTER BASICS ===

# Inclusive release-year window kept in the cleaned movie table.
YEAR_MIN, YEAR_MAX = 2000, 2022

# The cleaning is written as a single chain in which every step returns a new
# frame. `startYear` is converted with `.assign` rather than
# `basics['startYear'] = ...` because at that point the frame is a
# boolean-filtered slice, and assigning into a slice raises pandas'
# SettingWithCopyWarning.
basics = (
    basics
    # Keep only movies. Done first so the steps below scan far fewer rows;
    # the surviving rows and their order are the same as filtering later.
    .loc[lambda df: df['titleType'] == 'movie']
    # Drop fully duplicated rows
    .drop_duplicates()
    # Remove rows with missing runtime or genres
    .dropna(subset=['runtimeMinutes', 'genres'])
    # Convert years to numeric; anything unparsable becomes NaN
    .assign(startYear=lambda df: pd.to_numeric(df['startYear'], errors='coerce'))
    # Keep the inclusive year range (NaN years fail the test and are dropped)
    .loc[lambda df: df['startYear'].between(YEAR_MIN, YEAR_MAX)]
    # Remove documentaries: case-insensitive literal match inside the genre list
    .loc[lambda df: ~df['genres'].str.contains('documentary', case=False, regex=False)]
    # Keep only movies present in the US AKAs dataset
    .loc[lambda df: df['tconst'].isin(akas['titleId'])]
    .reset_index(drop=True)
)


In [15]:
# === 5. CLEAN & FILTER RATINGS ===
# Keep ratings whose tconst appears among the US AKAs title ids, then renumber
# the index. Note: the membership test uses `akas`, not the cleaned `basics`,
# so ratings for US titles that were removed from `basics` are still kept.
ratings = (
    ratings
    .loc[lambda df: df['tconst'].isin(akas['titleId'])]
    .reset_index(drop=True)
)


In [17]:
# === 6. SAVE CLEANED FILES ===

# Destination path -> frame, written in this order.
outputs = {
    'Data/final_basics.csv.gz': basics,
    'Data/final_akas.csv.gz': akas,
    'Data/final_ratings.csv.gz': ratings,
}

# Create the output folder if it is not there yet
os.makedirs('Data', exist_ok=True)

# Each frame is written as a gzip-compressed CSV without the index column
for path, frame in outputs.items():
    frame.to_csv(path, compression='gzip', index=False)

print("Files saved to 'Data/' folder.")


Files saved to 'Data/' folder.


In [33]:
# === 7. QUICK VERIFY ===
# Show the cleaned `basics` frame as the cell output to eyeball the result of
# step 4 (movie rows only, startYear within the filtered range).
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008.0,,94,Horror
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
91006,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
91007,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019.0,,97,"Comedy,Drama,Fantasy"
91008,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
91009,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [35]:
# Show the AKAs frame after step 3 restricted it to rows with region 'US'.
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,3,Carmencita,US,,imdbDisplay,,0
1,tt0000002,6,The Clown and His Dogs,US,,,literal English title,0
2,tt0000003,3,Poor Pierrot,US,,imdbDisplay,,0
3,tt0000005,3,Blacksmith Scene,US,,imdbDisplay,,0
4,tt0000005,5,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
1655559,tt9916560,2,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
1655560,tt9916620,2,The Copeland Case,US,,imdbDisplay,,0
1655561,tt9916702,2,Loving London: The Playground,US,,,,0
1655562,tt9916756,2,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [37]:
# Show the ratings frame after step 5 restricted it to title ids present in
# the US AKAs frame.
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2170
1,tt0000002,5.5,298
2,tt0000003,6.5,2232
3,tt0000005,6.2,2973
4,tt0000006,5.0,217
...,...,...,...
578201,tt9916200,8.0,277
578202,tt9916204,8.0,321
578203,tt9916348,8.2,22
578204,tt9916362,6.4,6106
