# 📊 Project Title: IMDb US‑Only Movie Dataset Cleaning Pipeline (2000–2022)
**Author:** Joseph Tulani Aytch  
**Last Updated:** Aug 2025  
**Purpose:** Clean and filter IMDb datasets for U.S.‑only movies released 2000–2022, removing documentaries and records missing a runtime or genre, and saving reproducible outputs.

---

## 📌 How to Use This Notebook
- **View Only:** Scroll to see cleaned tables and steps – no setup needed.
- **Run Yourself:**
  1. Install dependencies:  
     ```bash
     pip install -r requirements.txt
     ```
  2. Run all cells.  
     _Internet access is required only for the initial IMDb dataset download; later runs reuse the cleaned files saved in `Data/`._
- **Update Local Copy:** Edit in JupyterLab on any machine, push to GitHub, and pull updates elsewhere.

---

## 🛠 Skills Demonstrated
- Data sourcing from public URLs
- Data cleaning and filtering with `pandas` / `numpy`
- Conditional logic for reproducible pipelines
- Portfolio‑friendly documentation



In [1]:
#!/usr/bin/env python
# coding: utf-8

import csv
import os

import numpy as np
import pandas as pd

# Never truncate columns when a DataFrame is displayed, so wide tables can be inspected in full.
pd.options.display.max_columns = None

In [2]:
# === 0. CHECK FOR EXISTING CLEAN FILES BEFORE DOWNLOADING ===
# The three cleaned outputs live side by side in Data/. When every one of them
# is already on disk they are loaded directly and `skip_download` tells the
# next cell not to fetch the raw IMDb dumps.
data_dir = os.path.join("Data")
basics_path, akas_path, ratings_path = (
    os.path.join(data_dir, filename)
    for filename in ("final_basics.csv.gz", "final_akas.csv.gz", "final_ratings.csv.gz")
)

if not all(map(os.path.exists, (basics_path, akas_path, ratings_path))):
    # At least one cleaned file is missing: fall back to a fresh download.
    print("🌐 Cleaned files not found locally — will download raw IMDb datasets.")
    skip_download = False
else:
    basics = pd.read_csv(basics_path, compression='gzip')
    akas = pd.read_csv(akas_path, compression='gzip')
    ratings = pd.read_csv(ratings_path, compression='gzip')
    print("📂 Loaded existing cleaned data from local Data/ folder.")
    skip_download = True


📂 Loaded existing cleaned data from local Data/ folder.


In [3]:
if not skip_download:
    # === 1. LOAD RAW DATA ===
    # Runs only when the cleaned files were not found locally (see `skip_download`).
    #
    # The IMDb dumps are plain tab-separated text without any quoting, and title
    # fields may contain literal double-quote characters. With the parser's
    # default quoting an unbalanced `"` can swallow the following tabs/newlines
    # into a single field, so quote handling is switched off explicitly.
    # Missing values stay as the literal string '\N' here; they are converted to
    # NaN in the next step.
    basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
    basics = pd.read_csv(basics_url, sep='\t', low_memory=False, quoting=csv.QUOTE_NONE)

    ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False, quoting=csv.QUOTE_NONE)

    akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
    akas = pd.read_csv(akas_url, sep='\t', low_memory=False, quoting=csv.QUOTE_NONE)


In [4]:
# === 2. REPLACE '\N' WITH NaN ACROSS ALL DATASETS ===
# The literal two-character string \N is turned into a real NaN in each frame.
# The frames are updated in place rather than rebound to new objects.
basics.replace(to_replace='\\N', value=np.nan, inplace=True)
akas.replace(to_replace='\\N', value=np.nan, inplace=True)
ratings.replace(to_replace='\\N', value=np.nan, inplace=True)


In [5]:
# === 3. FILTER AKAs FOR US TITLES ONLY ===
# Keep only the alternate-title rows whose region is exactly 'US' and renumber
# the index from zero.
akas = akas.loc[akas['region'].eq('US')].reset_index(drop=True)


In [6]:
# === 4. CLEAN & FILTER BASICS ===
# Written as one non-mutating chain: every step returns a new frame, so the
# startYear conversion never assigns into a filtered slice (which raised
# SettingWithCopyWarning) and re-running the cell is safe.

# Inclusive release-year window kept in the final dataset.
MIN_START_YEAR = 2000
MAX_START_YEAR = 2022

basics = (
    basics
    # Drop exact duplicate rows.
    .drop_duplicates()
    # Remove rows with missing runtime or genres.
    .dropna(subset=['runtimeMinutes', 'genres'])
    # Keep only movies.
    .loc[lambda df: df['titleType'] == 'movie']
    # Convert years to numeric; values that cannot be parsed become NaN ...
    .assign(startYear=lambda df: pd.to_numeric(df['startYear'], errors='coerce'))
    # ... and NaN years fail the inclusive range check, so they are dropped here.
    .loc[lambda df: df['startYear'].between(MIN_START_YEAR, MAX_START_YEAR)]
    # Remove documentaries (case-insensitive match anywhere in the genre list).
    .loc[lambda df: ~df['genres'].str.lower().str.contains('documentary')]
    # Keep only movies present in the US AKAs dataset.
    .loc[lambda df: df['tconst'].isin(akas['titleId'])]
    .reset_index(drop=True)
)


In [7]:
# === 5. CLEAN & FILTER RATINGS ===
# Keep ratings for every title that has a US AKA entry. Note that the filter is
# against `akas`, not the cleaned `basics`, so ratings for titles removed from
# `basics` are retained here.
ratings = (
    ratings
    .loc[lambda df: df['tconst'].isin(akas['titleId'])]
    .reset_index(drop=True)
)


In [8]:
 # === 6. SAVE CLEANED FILES ===

# Outputs always go to the repo-relative Data/ folder, which is created on demand.
data_dir = os.path.join("Data")
os.makedirs(data_dir, exist_ok=True)

basics_path, akas_path, ratings_path = (
    os.path.join(data_dir, filename)
    for filename in ("final_basics.csv.gz", "final_akas.csv.gz", "final_ratings.csv.gz")
)

# Write each frame as a gzip-compressed CSV without the index column.
basics.to_csv(basics_path, index=False, compression='gzip')
akas.to_csv(akas_path, index=False, compression='gzip')
ratings.to_csv(ratings_path, index=False, compression='gzip')

# Report the target folder followed by one line per written file.
print(f"✅ Cleaned files saved to: {data_dir}/")
print(
    *(f"  - {os.path.basename(p)}" for p in (basics_path, akas_path, ratings_path)),
    sep="\n",
)


✅ Cleaned files saved to: Data/
  - final_basics.csv.gz
  - final_akas.csv.gz
  - final_ratings.csv.gz


In [9]:
# === 7. QUICK VERIFY ===
# Display the cleaned `basics` frame as this cell's output for a visual sanity
# check of the columns and the row count.
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
84195,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
84196,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
84197,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
84198,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [10]:
# Display the US-only `akas` frame as this cell's output for a visual sanity check.
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
...,...,...,...,...,...,...,...,...
1395965,tt9916702,1,Loving London: The Playground,US,,,,0.0
1395966,tt9916720,10,The Demonic Nun,US,,tv,,0.0
1395967,tt9916720,12,The Nun 2,US,,imdbDisplay,,0.0
1395968,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0.0


In [11]:
# Display the filtered `ratings` frame as this cell's output for a visual sanity check.
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1930
1,tt0000002,5.8,261
2,tt0000005,6.2,2560
3,tt0000006,5.1,176
4,tt0000007,5.4,798
...,...,...,...
479650,tt9916204,8.2,251
479651,tt9916348,8.5,17
479652,tt9916362,6.4,5073
479653,tt9916428,3.8,14
