TMDB Movie Data Analysis Project - Phase 2: Data Cleaning \
Step 3: Complete Data Cleaning & Transformation

This notebook performs all required cleaning operations on the raw movie data (`movies_raw_data.csv`) and saves the cleaned result to `movies_clean.csv`.


In [3]:
# ============================================================================
# IMPORTS
# ============================================================================
import pandas as pd
import numpy as np
import json
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

In [4]:
# ============================================================================
# LOAD RAW DATA
# ============================================================================

# Read the raw export from disk and report its dimensions.
df = pd.read_csv('movies_raw_data.csv')
raw_row_count, raw_column_count = df.shape
print(f"\n Loaded raw data: {raw_row_count} rows × {raw_column_count} columns")

# All cleaning is applied to this copy; `df` keeps the untouched raw values.
df_clean = df.copy()


 Loaded raw data: 18 rows × 28 columns


In [5]:

# ============================================================================
# HELPER FUNCTIONS FOR JSON EXTRACTION
# ============================================================================

def safe_eval(value):
    """
    Safely convert the string representation of a list/dict into a Python object.

    Args:
        value: Usually a raw CSV cell holding a Python literal such as
            "[{'id': 28, 'name': 'Action'}]". Non-string objects (for example
            an already-parsed list or dict) are accepted and returned unchanged.

    Returns:
        The parsed object, or None when the value is missing (None/NaN), is
        '' or '[]', or is a string that cannot be parsed as a Python literal.
    """
    if isinstance(value, str):
        if value in ('', '[]'):
            return None
        try:
            return literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed literal: treat as "no data" instead of failing the apply().
            return None

    # pd.isna() on a list/array returns an element-wise array whose truth value
    # is ambiguous, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def extract_names_from_list(data, key='name'):
    """
    Extract one field from a list of dictionaries and join the values with '|'.

    Args:
        data: A list of dicts, or the string representation of one (parsed
            with safe_eval). Anything else (None, NaN, '', other types) yields None.
        key: Dictionary key to read from each item (default 'name').

    Returns:
        The '|'-joined values, or None when there is no usable list, no item
        carries `key`, or the collected values cannot be joined as strings.

    Example input: [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}]
    Example output: 'Action|Adventure'
    """
    # Parse string input first; every later check is then a plain type check,
    # which avoids calling pd.isna() on a list (its array result has no single
    # truth value and raised ValueError for multi-element lists).
    if isinstance(data, str):
        data = safe_eval(data)

    if not isinstance(data, list) or not data:
        return None

    try:
        names = [item[key] for item in data if isinstance(item, dict) and key in item]
        return '|'.join(names) if names else None
    except TypeError:
        # Non-string values under `key` (or an unhashable key) cannot be joined.
        return None


def extract_collection_name(data):
    """
    Return the 'name' entry of a belongs_to_collection value.

    Accepts either a dict or its string representation (parsed with
    safe_eval). Missing values, '', '{}' and anything that does not end up
    as a dict containing 'name' give None.

    Example input: {'id': 86311, 'name': 'The Avengers Collection', ...}
    Example output: 'The Avengers Collection'
    """
    is_blank = pd.isna(data) or data == '' or data == '{}'
    if is_blank:
        return None

    # Strings are parsed; any other object is inspected as-is.
    parsed = safe_eval(data) if isinstance(data, str) else data

    has_name = isinstance(parsed, dict) and 'name' in parsed
    return parsed['name'] if has_name else None


def find_director(crew_data):
    """
    Find the director in a crew list.

    Args:
        crew_data: A list of crew dicts, or the string representation of one
            (parsed with safe_eval). Anything else (None, NaN, '', other
            types) yields None.

    Returns:
        The 'name' of the first entry whose 'job' equals 'Director', or None
        when there is no such entry.

    Example input: [{'name': 'John Doe', 'job': 'Director'}, ...]
    Example output: 'John Doe'
    """
    # Parse string input first; the type check below then covers None/NaN too,
    # without calling pd.isna() on a list (which raised ValueError for lists
    # with more than one element).
    if isinstance(crew_data, str):
        crew_data = safe_eval(crew_data)

    if not isinstance(crew_data, list):
        return None

    # Non-dict entries are skipped rather than treated as errors.
    for person in crew_data:
        if isinstance(person, dict) and person.get('job') == 'Director':
            return person.get('name')

    return None


def extract_cast_names(cast_data, max_names=5):
    """
    Extract the names of the leading cast members, joined with '|'.

    Args:
        cast_data: A list of cast dicts, or the string representation of one
            (parsed with safe_eval). Anything else (None, NaN, '', other
            types) yields None.
        max_names: Only the first `max_names` list entries are inspected
            (default 5); entries without a 'name' are skipped, not replaced.

    Returns:
        The '|'-joined names, or None when there is no usable list, none of
        the inspected entries has a 'name', or the names cannot be joined.

    Example input: [{'name': 'Robert Downey Jr.', 'order': 0}, ...]
    Example output: 'Robert Downey Jr.|Chris Evans|Mark Ruffalo|...'
    """
    # Parse string input first; the type check below then covers None/NaN too,
    # without calling pd.isna() on a list (which raised ValueError for lists
    # with more than one element).
    if isinstance(cast_data, str):
        cast_data = safe_eval(cast_data)

    if not isinstance(cast_data, list) or not cast_data:
        return None

    try:
        names = [person['name'] for person in cast_data[:max_names]
                 if isinstance(person, dict) and 'name' in person]
        return '|'.join(names) if names else None
    except TypeError:
        # Non-string names or an invalid max_names (bad slice bound) end up here.
        return None

In [6]:
# ============================================================================
# CLEANING STEP 1: Drop Unwanted Columns
# ============================================================================

columns_to_drop = ['adult', 'imdb_id', 'original_title', 'video', 'homepage']

# Restrict the list to columns actually present, so the drop cannot fail on a
# name that is missing from the frame.
existing_to_drop = []
for col in columns_to_drop:
    if col in df_clean.columns:
        existing_to_drop.append(col)

if not existing_to_drop:
    print(" No columns to drop (already removed or never existed)")
else:
    df_clean = df_clean.drop(columns=existing_to_drop)
    dropped_names = ', '.join(existing_to_drop)
    print(f" Dropped {len(existing_to_drop)} columns: {dropped_names}")

n_rows_after_drop, n_cols_after_drop = df_clean.shape
print(f" New shape: {n_rows_after_drop} rows × {n_cols_after_drop} columns")


 Dropped 5 columns: adult, imdb_id, original_title, video, homepage
 New shape: 18 rows × 23 columns


In [7]:
# ============================================================================
# CLEANING STEP 2: Extract Data from JSON Columns
# ============================================================================
# Each column is overwritten with its extracted text, so this cell expects the
# raw stringified lists/dicts as input.

print("\n" + "-"*70, "2. EXTRACTING DATA FROM JSON COLUMNS", "-"*70, sep="\n")

# 2.1 Collection name (a single dict per movie)
if 'belongs_to_collection' in df_clean.columns:
    print("\n  2.1 Extracting collection names...")
    collection_names = df_clean['belongs_to_collection'].apply(extract_collection_name)
    df_clean['belongs_to_collection'] = collection_names
    non_null = collection_names.notna().sum()
    print(f"       Extracted collection names: {non_null} movies belong to a collection")

# 2.2 Genres (list of dicts -> 'A|B|C'), plus a short preview of the first result
if 'genres' in df_clean.columns:
    print("\n  2.2 Extracting genres...")
    df_clean['genres'] = df_clean['genres'].apply(extract_names_from_list)
    print("       Genres extracted and formatted with '|' separator")

    genres_non_null = df_clean['genres'].dropna()
    if genres_non_null.empty:
        print("      Example: No non-null genre entries available.")
    else:
        first_genres = genres_non_null.iloc[0]
        print(f"      Example: {first_genres[:50]}...")

# 2.3 - 2.5 Remaining list-of-dict columns: same extraction, different key/messages
list_column_steps = [
    ('spoken_languages', 'english_name',
     "\n  2.3 Extracting spoken languages...", "       Languages extracted"),
    ('production_countries', 'name',
     "\n  2.4 Extracting production countries...", "       Countries extracted"),
    ('production_companies', 'name',
     "\n  2.5 Extracting production companies...", "       Companies extracted"),
]

for list_col, name_key, start_message, done_message in list_column_steps:
    if list_col not in df_clean.columns:
        continue
    print(start_message)
    df_clean[list_col] = df_clean[list_col].apply(
        lambda x: extract_names_from_list(x, key=name_key)
    )
    print(done_message)


----------------------------------------------------------------------
2. EXTRACTING DATA FROM JSON COLUMNS
----------------------------------------------------------------------

  2.1 Extracting collection names...
       Extracted collection names: 16 movies belong to a collection

  2.2 Extracting genres...
       Genres extracted and formatted with '|' separator
      Example: Adventure|Science Fiction|Action...

  2.3 Extracting spoken languages...
       Languages extracted

  2.4 Extracting production countries...
       Countries extracted

  2.5 Extracting production companies...
       Companies extracted


In [8]:
# ============================================================================
# CLEANING STEP 3: Process Cast & Crew
# ============================================================================
# Adds 'director', 'crew_size' and 'cast_size', replaces 'cast' with the top 5
# names, and drops 'crew'.

print("\n" + "-"*70)
print("3. PROCESSING CAST & CREW DATA")
print("-"*70)

if 'crew' in df_clean.columns:
    # 3.1 Extract Director
    print("\n  3.1 Finding directors...")
    df_clean['director'] = df_clean['crew'].apply(find_director)
    directors_found = df_clean['director'].notna().sum()
    print(f"       Directors found for {directors_found} movies")

    # 3.2 Calculate Crew Size
    # Each cell is parsed once; only a parsed list is counted, so a stray
    # non-list literal gives 0 instead of a misleading len() or a TypeError.
    print("\n  3.2 Calculating crew size...")
    parsed_crew = df_clean['crew'].apply(safe_eval)
    df_clean['crew_size'] = parsed_crew.apply(
        lambda members: len(members) if isinstance(members, list) else 0
    )
    print(f"       Crew size calculated")

if 'cast' in df_clean.columns and 'cast' in df.columns:
    # Cast names and cast size are both derived from the untouched raw frame,
    # looked up by movie id rather than by row position. This keeps the cell
    # safe to re-run: it never parses an already-extracted 'A|B|C' string, and
    # it stays aligned even if df_clean has been filtered or re-indexed.
    raw_cast_by_id = (
        df.assign(id=pd.to_numeric(df['id'], errors='coerce'))
          .dropna(subset=['id'])
          .drop_duplicates(subset=['id'], keep='first')
          .set_index('id')['cast']
    )
    raw_cast = pd.to_numeric(df_clean['id'], errors='coerce').map(raw_cast_by_id)

    # 3.3 Extract Cast Names
    print("\n  3.3 Extracting cast names...")
    df_clean['cast'] = raw_cast.apply(lambda x: extract_cast_names(x, max_names=5))
    print(f"       Top 5 cast members extracted per movie")

    # 3.4 Calculate Cast Size (full raw list, not just the top 5)
    print("\n  3.4 Calculating cast size...")
    parsed_cast = raw_cast.apply(safe_eval)
    df_clean['cast_size'] = parsed_cast.apply(
        lambda members: len(members) if isinstance(members, list) else 0
    )
    print(f"       Cast size calculated")

# Drop the crew column (we don't need it anymore)
if 'crew' in df_clean.columns:
    df_clean = df_clean.drop(columns=['crew'])
    print("\n   Dropped 'crew' column (no longer needed)")



----------------------------------------------------------------------
3. PROCESSING CAST & CREW DATA
----------------------------------------------------------------------

  3.1 Finding directors...
       Directors found for 18 movies

  3.2 Calculating crew size...
       Crew size calculated

  3.3 Extracting cast names...
       Top 5 cast members extracted per movie

  3.4 Calculating cast size...
       Cast size calculated

   Dropped 'crew' column (no longer needed)


In [9]:
# ============================================================================
# CLEANING STEP 4: Convert Data Types
# ============================================================================
# This cell drops 'budget' and 'revenue' once they are converted, so it expects
# both columns to be present when it runs.

print("\n" + "-"*70, "4. CONVERTING DATA TYPES", "-"*70, sep="\n")

# 4.1 Budget / revenue -> millions of USD
print("\n  4.1 Converting budget and revenue to millions USD...")

money_columns = ['budget', 'revenue']

# Coerce to numbers first; unparseable entries become NaN.
for money_col in money_columns:
    df_clean[money_col] = pd.to_numeric(df_clean[money_col], errors='coerce')

# A value of 0 is treated as "unknown", not as a real amount.
for money_col in money_columns:
    is_zero_amount = df_clean[money_col] == 0
    df_clean.loc[is_zero_amount, money_col] = np.nan

# Scaled copies get the *_musd suffix; the unscaled originals are removed.
for money_col in money_columns:
    df_clean[f'{money_col}_musd'] = df_clean[money_col] / 1_000_000
df_clean = df_clean.drop(columns=money_columns)

print("       Budget and revenue converted to millions USD")
print("       Zeros replaced with NaN")

# 4.2 Release date -> datetime (invalid dates become NaT)
print("\n  4.2 Converting release_date to datetime...")
parsed_release_dates = pd.to_datetime(df_clean['release_date'], errors='coerce')
df_clean['release_date'] = parsed_release_dates
print("       Release date converted to datetime format")

# 4.3 Remaining numeric columns (invalid entries become NaN)
print("\n  4.3 Converting numeric columns...")
for numeric_col in ['id', 'popularity', 'runtime', 'vote_average', 'vote_count']:
    df_clean[numeric_col] = pd.to_numeric(df_clean[numeric_col], errors='coerce')
print("       All numeric columns converted")


----------------------------------------------------------------------
4. CONVERTING DATA TYPES
----------------------------------------------------------------------

  4.1 Converting budget and revenue to millions USD...
       Budget and revenue converted to millions USD
       Zeros replaced with NaN

  4.2 Converting release_date to datetime...
       Release date converted to datetime format

  4.3 Converting numeric columns...
       All numeric columns converted


In [10]:
# ============================================================================
# CLEANING STEP 5: Handle Missing Values & Special Cases
# ============================================================================

print("\n" + "-"*70, "5. HANDLING MISSING VALUES", "-"*70, sep="\n")

# 5.1 A runtime of 0 is recorded as missing
print("\n  5.1 Handling runtime zeros...")
has_zero_runtime = df_clean['runtime'] == 0
runtime_zeros = has_zero_runtime.sum()
df_clean.loc[has_zero_runtime, 'runtime'] = np.nan
print(f"       Replaced {runtime_zeros} runtime zeros with NaN")

# 5.2 With zero votes, the stored vote_average is blanked out as well
print("\n  5.2 Handling vote_count zeros...")
has_no_votes = df_clean['vote_count'] == 0
vote_zeros = has_no_votes.sum()
if vote_zeros > 0:
    df_clean.loc[has_no_votes, 'vote_average'] = np.nan
    print(f"       Set vote_average to NaN for {vote_zeros} movies with no votes")

# 5.3 Exact-match placeholder strings in the free-text columns become NaN
print("\n  5.3 Replacing placeholder text...")
placeholder_values = ['No Data', 'no data', 'N/A', 'n/a', '']

text_columns = [name for name in ('overview', 'tagline') if name in df_clean.columns]
for col in text_columns:
    df_clean[col] = df_clean[col].replace(placeholder_values, np.nan)

print("       Replaced placeholders in text columns")


----------------------------------------------------------------------
5. HANDLING MISSING VALUES
----------------------------------------------------------------------

  5.1 Handling runtime zeros...
       Replaced 0 runtime zeros with NaN

  5.2 Handling vote_count zeros...

  5.3 Replacing placeholder text...
       Replaced placeholders in text columns


In [11]:
# ============================================================================
# CLEANING STEP 6: Filter & Remove Invalid Rows
# ============================================================================

print("\n" + "-"*70, "6. FILTERING INVALID ROWS", "-"*70, sep="\n")

# 6.1 Rows need both an id and a title
print("\n  6.1 Removing rows with missing ID or title...")
initial_rows = len(df_clean)
df_clean = df_clean.dropna(subset=['id', 'title'])
missing_key_rows = initial_rows - len(df_clean)
print(f"       Removed {missing_key_rows} rows")

# 6.2 One row per movie id; the first occurrence wins
print("\n  6.2 Removing duplicate movie IDs...")
before_dup = len(df_clean)
df_clean = df_clean.drop_duplicates(subset=['id'], keep='first')
duplicate_rows = before_dup - len(df_clean)
print(f"       Removed {duplicate_rows} duplicates")

# 6.3 Require at least 10 populated columns per row
print("\n  6.3 Filtering rows with sufficient data (≥10 non-NaN columns)...")
before_filter = len(df_clean)
populated_per_row = df_clean.notna().sum(axis=1)
df_clean = df_clean[populated_per_row >= 10]
sparse_rows = before_filter - len(df_clean)
print(f"       Removed {sparse_rows} rows with insufficient data")

# 6.4 Keep 'Released' movies only; afterwards the status column is dropped
print("\n  6.4 Filtering to keep only 'Released' movies...")
if 'status' in df_clean.columns:
    before_status = len(df_clean)
    is_released = df_clean['status'] == 'Released'
    df_clean = df_clean[is_released]
    not_released_rows = before_status - len(df_clean)
    print(f"       Kept only Released movies: removed {not_released_rows} rows")

    df_clean = df_clean.drop(columns=['status'])
    print("       Dropped 'status' column")



----------------------------------------------------------------------
6. FILTERING INVALID ROWS
----------------------------------------------------------------------

  6.1 Removing rows with missing ID or title...
       Removed 0 rows

  6.2 Removing duplicate movie IDs...
       Removed 0 duplicates

  6.3 Filtering rows with sufficient data (≥10 non-NaN columns)...
       Removed 0 rows with insufficient data

  6.4 Filtering to keep only 'Released' movies...
       Kept only Released movies: removed 0 rows
       Dropped 'status' column


In [12]:
# ============================================================================
# CLEANING STEP 7: Reorder Columns (Final Structure)
# ============================================================================

print("\n" + "-"*70, "7. REORDERING COLUMNS TO FINAL STRUCTURE", "-"*70, sep="\n")

# Target layout. Columns of df_clean that are not listed here are dropped by
# the selection below; listed columns that do not exist are skipped.
desired_columns = [
    'id',
    'title',
    'tagline',
    'release_date',
    'genres',
    'belongs_to_collection',
    'original_language',
    'budget_musd',
    'revenue_musd',
    'production_companies',
    'production_countries',
    'vote_count',
    'vote_average',
    'popularity',
    'runtime',
    'overview',
    'spoken_languages',
    'poster_path',
    'cast',
    'cast_size',
    'director',
    'crew_size',
]

final_columns = []
for wanted in desired_columns:
    if wanted in df_clean.columns:
        final_columns.append(wanted)

df_clean = df_clean[final_columns]

print(" Columns reordered to final structure")
print(f" Final column count: {len(final_columns)}")


----------------------------------------------------------------------
7. REORDERING COLUMNS TO FINAL STRUCTURE
----------------------------------------------------------------------
 Columns reordered to final structure
 Final column count: 22


In [13]:
# ============================================================================
# CLEANING STEP 8: Reset Index
# ============================================================================

print("\n" + "-"*70, "8. RESETTING INDEX", "-"*70, sep="\n")

# Replace the current row labels with a fresh 0..n-1 index; the old labels
# are discarded rather than kept as a column.
df_clean = df_clean.reset_index(drop=True)
print(" Index reset")


----------------------------------------------------------------------
8. RESETTING INDEX
----------------------------------------------------------------------
 Index reset


In [14]:

# ============================================================================
# FINAL SUMMARY & SAVE
# ============================================================================

print("\n" + "="*70, "CLEANING COMPLETE! ", "="*70, sep="\n")

# Gather the headline figures first so the report template stays readable.
n_movies, n_columns = df_clean.shape
total_missing = df_clean.isnull().sum().sum()
release_dates = df_clean['release_date']
first_year, last_year = release_dates.min().year, release_dates.max().year
budgets = df_clean['budget_musd']
revenues = df_clean['revenue_musd']

print(f"""
Final Dataset Summary:
  • Total movies: {n_movies}
  • Total columns: {n_columns}
  • Missing values: {total_missing}
  • Date range: {first_year} - {last_year}
  • Budget range: ${budgets.min():.1f}M - ${budgets.max():.1f}M
  • Revenue range: ${revenues.min():.1f}M - ${revenues.max():.1f}M
""")

# Preview the first three rows, limited to the key columns that exist
print("\n Sample of cleaned data (first 3 rows, key columns):")
print("-"*70)
key_columns = ['title', 'release_date', 'genres', 'budget_musd', 'revenue_musd', 'vote_average']
display_cols = [col for col in key_columns if col in df_clean.columns]
print(df_clean[display_cols].head(3))

# Write the cleaned frame to disk without the index column
print("\n" + "-"*70, "SAVING CLEANED DATA", "-"*70, sep="\n")

df_clean.to_csv('movies_clean.csv', index=False)
print(" Saved to: movies_clean.csv")

print("\n" + "="*70, " DATA CLEANING PHASE COMPLETE!", "="*70, sep="\n")
print("\nYou're now ready for Phase 3: KPI Analysis & Rankings! ")


CLEANING COMPLETE! 

Final Dataset Summary:
  • Total movies: 18
  • Total columns: 22
  • Missing values: 2
  • Date range: 1997 - 2019
  • Budget range: $125.0M - $356.0M
  • Revenue range: $1243.2M - $2923.7M


 Sample of cleaned data (first 3 rows, key columns):
----------------------------------------------------------------------
                          title release_date  \
0             Avengers: Endgame   2019-04-24   
1                        Avatar   2009-12-15   
2  Star Wars: The Force Awakens   2015-12-15   

                                     genres  budget_musd  revenue_musd  \
0          Adventure|Science Fiction|Action        356.0   2799.439100   
1  Action|Adventure|Fantasy|Science Fiction        237.0   2923.706026   
2          Adventure|Action|Science Fiction        245.0   2068.223624   

   vote_average  
0         8.237  
1         7.594  
2         7.255  

----------------------------------------------------------------------
SAVING CLEANED DATA
-------

In [15]:
# ============================================================================
# CLEANING STEP 3: Process Cast & Crew
# ============================================================================
# This cell repeats Step 3 further down the notebook. By then df_clean may have
# been filtered and re-indexed, its 'cast' column may already hold the
# '|'-joined names, and 'crew' may already be dropped. The crew steps are
# skipped when 'crew' is gone, and the cast steps always read the raw strings
# from `df`, matched by movie id.

print("\n" + "-"*70)
print("3. PROCESSING CAST & CREW DATA")
print("-"*70)

if 'crew' in df_clean.columns:
    # 3.1 Extract Director
    print("\n  3.1 Finding directors...")
    df_clean['director'] = df_clean['crew'].apply(find_director)
    directors_found = df_clean['director'].notna().sum()
    print(f"       Directors found for {directors_found} movies")

    # 3.2 Calculate Crew Size
    # Each cell is parsed once; only a parsed list is counted, so a stray
    # non-list literal gives 0 instead of a misleading len() or a TypeError.
    print("\n  3.2 Calculating crew size...")
    parsed_crew = df_clean['crew'].apply(safe_eval)
    df_clean['crew_size'] = parsed_crew.apply(
        lambda members: len(members) if isinstance(members, list) else 0
    )
    print(f"       Crew size calculated")

if 'cast' in df_clean.columns and 'cast' in df.columns:
    # Re-parsing df_clean['cast'] would turn already-extracted names into None,
    # and assigning df['cast'] directly would align on row labels that no
    # longer match after filtering / reset_index. Looking the raw value up by
    # numeric movie id avoids both problems.
    raw_cast_by_id = (
        df.assign(id=pd.to_numeric(df['id'], errors='coerce'))
          .dropna(subset=['id'])
          .drop_duplicates(subset=['id'], keep='first')
          .set_index('id')['cast']
    )
    raw_cast = pd.to_numeric(df_clean['id'], errors='coerce').map(raw_cast_by_id)

    # 3.3 Extract Cast Names
    print("\n  3.3 Extracting cast names...")
    df_clean['cast'] = raw_cast.apply(lambda x: extract_cast_names(x, max_names=5))
    print(f"       Top 5 cast members extracted per movie")

    # 3.4 Calculate Cast Size (full raw list, not just the top 5)
    print("\n  3.4 Calculating cast size...")
    parsed_cast = raw_cast.apply(safe_eval)
    df_clean['cast_size'] = parsed_cast.apply(
        lambda members: len(members) if isinstance(members, list) else 0
    )
    print(f"       Cast size calculated")

# Drop the crew column (we don't need it anymore)
if 'crew' in df_clean.columns:
    df_clean = df_clean.drop(columns=['crew'])
    print("\n   Dropped 'crew' column (no longer needed)")


----------------------------------------------------------------------
3. PROCESSING CAST & CREW DATA
----------------------------------------------------------------------

  3.3 Extracting cast names...
       Top 5 cast members extracted per movie

  3.4 Calculating cast size...
       Cast size calculated


In [16]:
# ============================================================================
# CLEANING STEP 4: Convert Data Types
# ============================================================================
# Guarded variant of Step 4: the budget/revenue conversion is skipped when the
# source columns are no longer present, and the outcome messages are printed
# only by the branch that actually ran.

print("\n" + "-"*70)
print("4. CONVERTING DATA TYPES")
print("-"*70)

# 4.1 Convert Budget and Revenue to Millions USD
print("\n  4.1 Converting budget and revenue to millions USD...")

if 'budget' in df_clean.columns and 'revenue' in df_clean.columns:

    # Convert to numeric (unparseable entries become NaN)
    df_clean['budget'] = pd.to_numeric(df_clean['budget'], errors='coerce')
    df_clean['revenue'] = pd.to_numeric(df_clean['revenue'], errors='coerce')

    # Replace zeros with NaN (a zero amount is treated as unknown)
    df_clean.loc[df_clean['budget'] == 0, 'budget'] = np.nan
    df_clean.loc[df_clean['revenue'] == 0, 'revenue'] = np.nan

    # Convert to millions
    df_clean['budget_musd'] = df_clean['budget'] / 1_000_000
    df_clean['revenue_musd'] = df_clean['revenue'] / 1_000_000

    # Drop originals
    df_clean = df_clean.drop(columns=['budget', 'revenue'])

    print("       Budget and revenue converted to millions USD")
    print("       Zeros replaced with NaN")

else:
    # No success message here: nothing was converted in this branch.
    print("       Columns 'budget' and/or 'revenue' not found — skipping conversion.")

# 4.2 Convert Release Date to Datetime (invalid dates become NaT)
print("\n  4.2 Converting release_date to datetime...")
df_clean['release_date'] = pd.to_datetime(df_clean['release_date'], errors='coerce')
print("       Release date converted to datetime format")

# 4.3 Convert remaining numeric columns (invalid entries become NaN)
print("\n  4.3 Converting numeric columns...")
for numeric_col in ['id', 'popularity', 'runtime', 'vote_average', 'vote_count']:
    df_clean[numeric_col] = pd.to_numeric(df_clean[numeric_col], errors='coerce')
print("       All numeric columns converted")


----------------------------------------------------------------------
4. CONVERTING DATA TYPES
----------------------------------------------------------------------

  4.1 Converting budget and revenue to millions USD...
       Columns 'budget' and/or 'revenue' not found — skipping conversion.
       Budget and revenue converted to millions USD
       Zeros replaced with NaN

  4.2 Converting release_date to datetime...
       Release date converted to datetime format

  4.3 Converting numeric columns...
       All numeric columns converted


In [17]:
# ============================================================================
# CLEANING STEP 5: Handle Missing Values & Special Cases
# ============================================================================

print("\n" + "-"*70, "5. HANDLING MISSING VALUES", "-"*70, sep="\n")

# 5.1 Zero runtimes are stored as missing
print("\n  5.1 Handling runtime zeros...")
runtime_is_zero = df_clean['runtime'] == 0
runtime_zeros = runtime_is_zero.sum()
df_clean.loc[runtime_is_zero, 'runtime'] = np.nan
print(f"       Replaced {runtime_zeros} runtime zeros with NaN")

# 5.2 Movies without votes also lose their vote_average
print("\n  5.2 Handling vote_count zeros...")
vote_count_is_zero = df_clean['vote_count'] == 0
vote_zeros = vote_count_is_zero.sum()
if vote_zeros > 0:
    df_clean.loc[vote_count_is_zero, 'vote_average'] = np.nan
    print(f"       Set vote_average to NaN for {vote_zeros} movies with no votes")

# 5.3 Placeholder strings (exact matches only) in text columns become NaN
print("\n  5.3 Replacing placeholder text...")
placeholder_values = ['No Data', 'no data', 'N/A', 'n/a', '']

for col in ('overview', 'tagline'):
    if col not in df_clean.columns:
        continue
    df_clean[col] = df_clean[col].replace(placeholder_values, np.nan)

print("       Replaced placeholders in text columns")


----------------------------------------------------------------------
5. HANDLING MISSING VALUES
----------------------------------------------------------------------

  5.1 Handling runtime zeros...
       Replaced 0 runtime zeros with NaN

  5.2 Handling vote_count zeros...

  5.3 Replacing placeholder text...
       Replaced placeholders in text columns


In [18]:
# ============================================================================
# CLEANING STEP 6: Filter & Remove Invalid Rows
# ============================================================================

print("\n" + "-"*70, "6. FILTERING INVALID ROWS", "-"*70, sep="\n")

# 6.1 Drop rows lacking an id or a title
print("\n  6.1 Removing rows with missing ID or title...")
initial_rows = len(df_clean)
df_clean = df_clean.dropna(subset=['id', 'title'])
removed_missing_keys = initial_rows - len(df_clean)
print(f"       Removed {removed_missing_keys} rows")

# 6.2 Keep the first row for every movie id
print("\n  6.2 Removing duplicate movie IDs...")
before_dup = len(df_clean)
df_clean = df_clean.drop_duplicates(subset=['id'], keep='first')
removed_duplicates = before_dup - len(df_clean)
print(f"       Removed {removed_duplicates} duplicates")

# 6.3 Keep rows that have 10 or more non-missing values
print("\n  6.3 Filtering rows with sufficient data (≥10 non-NaN columns)...")
before_filter = len(df_clean)
filled_cells_per_row = df_clean.notna().sum(axis=1)
df_clean = df_clean[filled_cells_per_row >= 10]
removed_sparse = before_filter - len(df_clean)
print(f"       Removed {removed_sparse} rows with insufficient data")

# 6.4 Only 'Released' movies remain; the status column is dropped afterwards.
#     Nothing happens here when 'status' is not a column of df_clean.
print("\n  6.4 Filtering to keep only 'Released' movies...")
if 'status' in df_clean.columns:
    before_status = len(df_clean)
    released_mask = df_clean['status'] == 'Released'
    df_clean = df_clean[released_mask]
    removed_unreleased = before_status - len(df_clean)
    print(f"       Kept only Released movies: removed {removed_unreleased} rows")

    df_clean = df_clean.drop(columns=['status'])
    print("       Dropped 'status' column")


----------------------------------------------------------------------
6. FILTERING INVALID ROWS
----------------------------------------------------------------------

  6.1 Removing rows with missing ID or title...
       Removed 0 rows

  6.2 Removing duplicate movie IDs...
       Removed 0 duplicates

  6.3 Filtering rows with sufficient data (≥10 non-NaN columns)...
       Removed 0 rows with insufficient data

  6.4 Filtering to keep only 'Released' movies...


In [19]:
# ============================================================================
# CLEANING STEP 7: Reorder Columns (Final Structure)
# ============================================================================

print("\n" + "-"*70, "7. REORDERING COLUMNS TO FINAL STRUCTURE", "-"*70, sep="\n")

# Final column order, grouped by theme. Only listed columns survive the
# selection below; listed columns missing from df_clean are ignored.
desired_columns = [
    # identity
    'id', 'title', 'tagline', 'release_date', 'genres',
    'belongs_to_collection', 'original_language',
    # money
    'budget_musd', 'revenue_musd',
    # production
    'production_companies', 'production_countries',
    # reception
    'vote_count', 'vote_average', 'popularity',
    # content
    'runtime', 'overview', 'spoken_languages', 'poster_path',
    # people
    'cast', 'cast_size', 'director', 'crew_size',
]

final_columns = list(filter(lambda name: name in df_clean.columns, desired_columns))

df_clean = df_clean[final_columns]

print(" Columns reordered to final structure")
print(f" Final column count: {len(final_columns)}")


----------------------------------------------------------------------
7. REORDERING COLUMNS TO FINAL STRUCTURE
----------------------------------------------------------------------
 Columns reordered to final structure
 Final column count: 22


In [20]:
# ============================================================================
# CLEANING STEP 8: Reset Index
# ============================================================================

print("\n" + "-"*70, "8. RESETTING INDEX", "-"*70, sep="\n")

# Renumber the rows from 0 and discard the previous index labels.
df_clean = df_clean.reset_index(drop=True)
print(" Index reset")


----------------------------------------------------------------------
8. RESETTING INDEX
----------------------------------------------------------------------
 Index reset


---
