TMDB Movie Data Analysis Project - Phase 2: Data Cleaning\
Step 2: Data Inspection & Cleaning Setup



In [1]:
# ============================================================================
# IMPORTS
# ============================================================================
import pandas as pd
import numpy as np
import json
from ast import literal_eval 

In [2]:
# ============================================================================
# STEP 1: Load the Raw Data
# ============================================================================

# Banner: rule / heading / rule, one per line
print("=" * 70, "LOADING RAW MOVIE DATA", "=" * 70, sep="\n")

# Read the CSV produced in Phase 1 into the working DataFrame `df`
df = pd.read_csv('movies_raw_data.csv')

# Confirm the load and report the frame's dimensions
print("\n Data loaded successfully!")
print(f" Shape: {len(df)} rows × {len(df.columns)} columns")

LOADING RAW MOVIE DATA

 Data loaded successfully!
 Shape: 18 rows × 28 columns


In [3]:
# ============================================================================
# STEP 2: Initial Inspection
# ============================================================================

# Section banner (the leading newline separates it from earlier output)
print("\n" + "=" * 70, "INITIAL DATA INSPECTION", "=" * 70, sep="\n")

# 1. Peek at the first few records
print("\n1. First 3 rows:", "-" * 70, sep="\n")
print(df.head(3))

# 2. Numbered listing of every column name
print("\n2. All columns in dataset:", "-" * 70, sep="\n")
for i, col in enumerate(df.columns, start=1):
    print(f"{i:2d}. {col}")

# 3. The dtype pandas assigned to each column
print("\n3. Data types:", "-" * 70, sep="\n")
print(df.dtypes)

# 4. Null counts, restricted to the columns that actually have gaps
print("\n4. Missing values per column:", "-" * 70, sep="\n")
missing_counts = df.isna().sum()
print(missing_counts[missing_counts.gt(0)])


INITIAL DATA INSPECTION

1. First 3 rows:
----------------------------------------------------------------------
   adult                     backdrop_path  \
0  False  /9wXPKruA6bWYk2co5ix6fH59Qr8.jpg   
1  False  /7JNzw1tSZZEgsBw6lu0VfO2X2Ef.jpg   
2  False  /8BTsTfln4jlQrLXUBquXJ0ASQy9.jpg   

                               belongs_to_collection     budget  \
0  {'id': 86311, 'name': 'The Avengers Collection...  356000000   
1  {'id': 87096, 'name': 'Avatar Collection', 'po...  237000000   
2  {'id': 10, 'name': 'Star Wars Collection', 'po...  245000000   

                                              genres  \
0  [{'id': 12, 'name': 'Adventure'}, {'id': 878, ...   
1  [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...   
2  [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...   

                                            homepage      id    imdb_id  \
0     https://www.marvel.com/movies/avengers-endgame  299534  tt4154796   
1               https://www.avatar.com/movies/avatar   19

In [4]:
# ============================================================================
# STEP 3: Identify JSON Columns That Need Extraction
# ============================================================================

print("\n" + "="*70)
print("JSON COLUMNS THAT NEED EXTRACTION")
print("="*70)

# Columns holding nested JSON-like data: either a single dictionary
# (e.g. belongs_to_collection) or a list of dictionaries (e.g. genres).
json_columns = [
    'belongs_to_collection',
    'genres', 
    'production_countries',
    'production_companies',
    'spoken_languages',
    'cast',
    'crew'
]

print("\nColumns with nested JSON data:")
for col in json_columns:
    if col not in df.columns:
        print(f"   {col} (not found)")
        continue

    print(f"   {col}")
    # Drop missing entries once, then take the first remaining value (if any)
    non_null_values = df[col].dropna()
    non_null = non_null_values.iloc[0] if not non_null_values.empty else None
    # Test for presence explicitly: a truthiness check would silently skip a
    # legitimately falsy value.
    if non_null is not None:
        # Preview is capped at 100 characters; the ellipsis is always appended
        print(f"    Example: {str(non_null)[:100]}...")



JSON COLUMNS THAT NEED EXTRACTION

Columns with nested JSON data:
   belongs_to_collection
    Example: {'id': 86311, 'name': 'The Avengers Collection', 'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg', ...
   genres
    Example: [{'id': 12, 'name': 'Adventure'}, {'id': 878, 'name': 'Science Fiction'}, {'id': 28, 'name': 'Action...
   production_countries
    Example: [{'iso_3166_1': 'US', 'name': 'United States of America'}]...
   production_companies
    Example: [{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZxCGEKTXaQ.png', 'name': 'Marvel Studios', 'origin_coun...
   spoken_languages
    Example: [{'english_name': 'English', 'iso_639_1': 'en', 'name': 'English'}, {'english_name': 'Japanese', 'is...
   cast
    Example: [{'adult': False, 'gender': 2, 'id': 3223, 'known_for_department': 'Acting', 'name': 'Robert Downey ...
   crew
    Example: [{'adult': False, 'gender': 0, 'id': 3019687, 'known_for_department': 'Directing', 'name': 'Paul Sch...


In [5]:
# ============================================================================
# STEP 4: Sample Inspection - Let's Look at One Movie in Detail
# ============================================================================

print("\n" + "="*70)
print("DETAILED LOOK AT ONE MOVIE (Avengers: Endgame)")
print("="*70)

# Pick Avengers: Endgame (ID: 299534); if that ID is absent, fall back to the
# first row so the rest of the cell still has a record to display.
sample_movie = df[df['id'] == 299534].iloc[0] if 299534 in df['id'].values else df.iloc[0]

print(f"\nMovie: {sample_movie['title']}")
print(f"ID: {sample_movie['id']}")
print(f"Release Date: {sample_movie['release_date']}")
print(f"Budget: ${sample_movie['budget']:,.0f}")
print(f"Revenue: ${sample_movie['revenue']:,.0f}")

print("\n--- JSON Fields (Before Extraction) ---")

# Show the raw (un-extracted) content of a few nested fields, truncated to the
# first 200 characters. Every value goes through str() before slicing so the
# preview is a character slice regardless of the value's type.
for label, col in [
    ("Genres (raw)", 'genres'),
    ("Collection (raw)", 'belongs_to_collection'),
    ("Cast (raw - first 200 chars)", 'cast'),
]:
    # Skip fields that are missing from the frame or empty for this movie
    if col in df.columns and pd.notna(sample_movie[col]):
        print(f"\n{label}: {str(sample_movie[col])[:200]}...")



DETAILED LOOK AT ONE MOVIE (Avengers: Endgame)

Movie: Avengers: Endgame
ID: 299534
Release Date: 2019-04-24
Budget: $356,000,000
Revenue: $2,799,439,100

--- JSON Fields (Before Extraction) ---

Genres (raw): [{'id': 12, 'name': 'Adventure'}, {'id': 878, 'name': 'Science Fiction'}, {'id': 28, 'name': 'Action'}]...

Collection (raw): {'id': 86311, 'name': 'The Avengers Collection', 'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg', 'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'}...

Cast (raw - first 200 chars): [{'adult': False, 'gender': 2, 'id': 3223, 'known_for_department': 'Acting', 'name': 'Robert Downey Jr.', 'original_name': 'Robert Downey Jr.', 'popularity': 8.8706, 'profile_path': '/5qHNjhtjMD4YWH3U...


In [6]:
# ============================================================================
# STEP 5: Check What Columns to Drop
# ============================================================================

# Section banner
print("\n" + "=" * 70, "COLUMNS TO DROP (Per Requirements)", "=" * 70, sep="\n")

# Columns slated for removal; this cell only reports on them, nothing is
# dropped here.
columns_to_drop = ['adult', 'imdb_id', 'original_title', 'video', 'homepage']

print("\nColumns that should be dropped:")
for col in columns_to_drop:
    # Report whether each candidate is actually present in the frame
    print(
        f"   {col} - EXISTS (will drop)"
        if col in df.columns
        else f"   {col} - NOT FOUND (already missing)"
    )


COLUMNS TO DROP (Per Requirements)

Columns that should be dropped:
   adult - EXISTS (will drop)
   imdb_id - EXISTS (will drop)
   original_title - EXISTS (will drop)
   video - EXISTS (will drop)
   homepage - EXISTS (will drop)


In [7]:

# ============================================================================
# STEP 6: Data Quality Check
# ============================================================================

# Section banner
print("\n" + "=" * 70, "DATA QUALITY CHECKS", "=" * 70, sep="\n")

# 1. Rows whose movie ID repeats an earlier row's ID
duplicates = df['id'].duplicated().sum()
print(f"\n1. Duplicate movie IDs: {duplicates}")

# 2. Rows whose ID is missing or zero
invalid_ids = int((df['id'].isna() | df['id'].eq(0)).sum())
print(f"2. Invalid movie IDs (0 or NaN): {invalid_ids}")

# 3-4. Zero-valued budget / revenue entries
budget_zero = df['budget'].eq(0).sum()
revenue_zero = df['revenue'].eq(0).sum()
print(f"3. Movies with budget = 0: {budget_zero}")
print(f"4. Movies with revenue = 0: {revenue_zero}")

# 5. Frequency of each status value (only when the column is present)
if 'status' in df.columns:
    print("\n5. Movie status distribution:")
    print(df['status'].value_counts())

# 6. Movies with no votes at all (only when the column is present)
if 'vote_count' in df.columns:
    vote_zero = df['vote_count'].eq(0).sum()
    print(f"\n6. Movies with vote_count = 0: {vote_zero}")



DATA QUALITY CHECKS

1. Duplicate movie IDs: 0
2. Invalid movie IDs (0 or NaN): 0
3. Movies with budget = 0: 0
4. Movies with revenue = 0: 0

5. Movie status distribution:
status
Released    18
Name: count, dtype: int64

6. Movies with vote_count = 0: 0


In [9]:
# ============================================================================
# STEP 7: Summary Before Cleaning
# ============================================================================

# Recap of the inspection: dataset size, the number of rows flagged by the ID
# checks (invalid + duplicate), and the planned cleaning steps. The drop/JSON
# counts only include columns that actually exist in the frame.
print(f"""
Current Dataset:
  • Total movies: {len(df)}
  • Total columns: {len(df.columns)}
  • Movies with issues: {invalid_ids + duplicates}
  
Next Steps:
  1. Drop unwanted columns ({sum(c in df.columns for c in columns_to_drop)} columns)
  2. Extract data from {sum(c in df.columns for c in json_columns)} JSON columns
  3. Process cast & crew to find director and cast names
  4. Convert data types (budget/revenue to millions, dates to datetime)
  5. Handle missing values and zeros
  6. Filter to keep only Released movies with good data quality
  
""")




Current Dataset:
  • Total movies: 18
  • Total columns: 28
  • Movies with issues: 0
  
Next Steps:
  1. Drop unwanted columns (5 columns)
  2. Extract data from 7 JSON columns
  3. Process cast & crew to find director and cast names
  4. Convert data types (budget/revenue to millions, dates to datetime)
  5. Handle missing values and zeros
  6. Filter to keep only Released movies with good data quality
  

