# Movie Success Prediction - Data Integration

How to join movies_metadata.csv, keywords.csv, and credits.csv together

In [None]:
# JOINING STRATEGY
# ================
# All three files share a common 'id' column that identifies each movie
#
# movies_metadata.csv (45,466 rows)
#   ├── id (movie ID)
#   ├── title, release_date, budget, revenue, runtime, etc.
#
# keywords.csv (46,419 rows)
#   ├── id (movie ID) ← matches movies_metadata.id
#   └── keywords (JSON array)
#
# credits.csv (45,476 rows)
#   ├── id (movie ID) ← matches movies_metadata.id
#   ├── cast (JSON array with actor data)
#   └── crew (JSON array with director/producer data)
#
# MERGE OPERATION:
# Start with movies as the "left" table (primary)
# LEFT JOIN keywords on id → adds keyword info (or NaN if no match)
# LEFT JOIN credits on id → adds cast/crew info (or NaN if no match)
#
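# This ensures:
# ✓ All movies are kept (none filtered out)
# ✓ Each movie gets its matching keywords/credits data
# ✓ Missing data is marked as NaN (handled during preprocessing)
#
# Caveat: a LEFT JOIN never drops left rows, but it DOES duplicate a movie row
# for every extra right-table row sharing its id. keywords.csv and credits.csv
# list more rows than movies_metadata.csv above, so id uniqueness in those
# tables should be checked (or enforced) before merging.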

# Echo the join strategy into the cell output, one message per line.
strategy_lines = (
    "JOIN STRATEGY EXPLAINED",
    "=" * 60,
    "\nPRINCIPLE: LEFT JOIN",
    "- Keep all rows from LEFT table (movies_metadata)",
    "- Add matching rows from RIGHT tables (keywords, credits)",
    "- If no match found, fill with NaN",
    "\nRESULT: Final dataset has:",
    "- All movies from movies_metadata",
    "- Keywords for each movie (if available)",
    "- Cast/Crew for each movie (if available)",
)
for strategy_line in strategy_lines:
    print(strategy_line)

In [None]:
import pandas as pd
import numpy as np
import json
from ast import literal_eval
import warnings
# Install a blanket "ignore" filter: every warning category is silenced for
# the rest of the kernel session.
# NOTE(review): this also hides any warnings pandas emits while reading the
# CSV files — confirm that is intended.
warnings.simplefilter('ignore')

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

print('Libraries imported!')

In [None]:
# Load all three files (paths are relative to the notebook's directory)
print('Loading data files...')
movies_df = pd.read_csv('../data/raw/movies_metadata.csv')
keywords_df = pd.read_csv('../data/raw/keywords.csv')
credits_df = pd.read_csv('../data/raw/credits.csv')

print(f'\nDATA SHAPES:')
print(f'  movies_metadata: {movies_df.shape}')
print(f'  keywords:        {keywords_df.shape}')
print(f'  credits:         {credits_df.shape}')

# Collect the dtype of each join key so the conclusion printed below is
# derived from the loaded data instead of being asserted unconditionally.
print(f'\nEXAMINE ID COLUMNS:')
id_dtypes = {
    'movies_df': movies_df['id'].dtype,
    'keywords_df': keywords_df['id'].dtype,
    'credits_df': credits_df['id'].dtype,
}
for frame_name, id_dtype in id_dtypes.items():
    print(f'  {frame_name}["id"].dtype: {id_dtype}')

# Compare by string form so equal dtypes always collapse to one entry.
if len({str(id_dtype) for id_dtype in id_dtypes.values()}) > 1:
    print(f'\n  (Different types! Must convert to match before joining)')
else:
    print(f'\n  (ID dtypes already match; they are still normalised before joining)')

In [None]:
# Parse JSON columns
def parse_json_column(val):
    """Convert one raw cell of a JSON-like column into a Python object.

    Parameters
    ----------
    val : object
        A cell value: typically a string holding a JSON array or a Python
        literal list (single-quoted dicts), a missing value, or an object
        that has already been parsed.

    Returns
    -------
    object
        * ``val`` itself when it is already a ``list`` or ``dict`` (this keeps
          the function idempotent, so re-running the parsing cell is safe);
        * ``[]`` for missing values;
        * the result of ``json.loads`` when the string is valid JSON;
        * otherwise the result of ``ast.literal_eval`` when that yields a
          list, else ``[]``;
        * any other non-string value is returned unchanged.
    """
    # Already-parsed containers must be checked before pd.isna(): pd.isna()
    # on a list returns an array, whose truth value is ambiguous and raises.
    if isinstance(val, (list, dict)):
        return val
    if pd.isna(val):
        return []
    if not isinstance(val, str):
        return val

    # First attempt: strict JSON (double-quoted keys/strings).
    try:
        return json.loads(val)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; fall through to literal_eval.
        pass

    # Second attempt: Python-literal syntax. Evaluate once and reuse the
    # result; only parsing failures are treated as "no data".
    try:
        parsed = literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    return parsed if isinstance(parsed, list) else []

print('Parsing JSON columns...')

# Movie-level JSON-like columns; any that are absent from the frame are skipped.
movie_json_columns = ('genres', 'production_companies', 'production_countries', 'spoken_languages')
for col in movie_json_columns:
    if col not in movies_df.columns:
        continue
    movies_df[col] = movies_df[col].apply(parse_json_column)

# The keywords and credits columns are required, so they are parsed unconditionally.
for frame, json_columns in ((keywords_df, ('keywords',)), (credits_df, ('cast', 'crew'))):
    for json_col in json_columns:
        frame[json_col] = frame[json_col].apply(parse_json_column)

print('JSON parsing complete!')

In [None]:
# STEP 1: Standardize ID columns (CRITICAL!)
print('Step 1: Standardizing ID columns...')
for frame in (movies_df, keywords_df, credits_df):
    # Values that cannot be read as numbers are coerced to missing, then the
    # column is stored with the nullable Int64 dtype.
    frame['id'] = pd.to_numeric(frame['id'], errors='coerce').astype('Int64')
print(f'  ✓ All IDs converted to Int64')


def _list_length(value):
    """Return ``len(value)`` for a list and 0 for anything else."""
    return len(value) if isinstance(value, list) else 0


# STEP 2: Extract features from keywords
print('\nStep 2: Extracting keywords features...')
keywords_df['num_keywords'] = keywords_df['keywords'].apply(_list_length)
keywords_features = ['id', 'num_keywords']
print(f'  ✓ Created {len(keywords_features)} features from keywords')

# STEP 3: Extract features from credits
print('\nStep 3: Extracting credits features...')
for source_col, count_col in (('cast', 'num_cast'), ('crew', 'num_crew')):
    credits_df[count_col] = credits_df[source_col].apply(_list_length)
def get_director(crew_list):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    crew_list : object
        Expected to be a list of dicts carrying ``'job'`` and ``'name'`` keys;
        non-dict entries are skipped.

    Returns
    -------
    The ``'name'`` value of the first matching entry, or ``'Unknown'`` when
    the input is not a list, no entry has job ``'Director'``, or the matching
    entry has no ``'name'`` key.
    """
    if not isinstance(crew_list, list):
        return 'Unknown'

    directors = (
        entry for entry in crew_list
        if isinstance(entry, dict) and entry.get('job') == 'Director'
    )
    first_director = next(directors, None)
    if first_director is None:
        return 'Unknown'
    return first_director.get('name', 'Unknown')

# Derive the director name for every credits row.
credits_df['director'] = credits_df['crew'].apply(get_director)
credits_features = ['id', 'num_cast', 'num_crew', 'director']
print(f'  ✓ Created {len(credits_features)} features from credits')

# STEP 4: Perform LEFT JOINS
print('\nStep 4: Performing LEFT JOINs...')
df = movies_df.copy()
n_movies = len(df)
print(f'  Starting with {n_movies} movies')

# A LEFT JOIN duplicates a left row once per matching right row, so each
# right-hand table is reduced to one row per id before merging (first
# occurrence wins). Rows with a missing id are dropped as well, because
# missing keys would otherwise match each other. validate='many_to_one'
# makes pandas raise if the right-hand keys are still not unique.

# Join keywords
keywords_unique = (
    keywords_df[keywords_features]
    .dropna(subset=['id'])
    .drop_duplicates(subset='id', keep='first')
)
df = df.merge(keywords_unique, on='id', how='left', validate='many_to_one')
print(f'  ✓ After joining keywords: {len(df)} movies')

# Join credits
credits_unique = (
    credits_df[credits_features]
    .dropna(subset=['id'])
    .drop_duplicates(subset='id', keep='first')
)
df = df.merge(credits_unique, on='id', how='left', validate='many_to_one')
print(f'  ✓ After joining credits: {len(df)} movies')

# Invariant of a left join against unique right keys: row count is unchanged.
assert len(df) == n_movies, (
    f'LEFT JOIN changed the row count: {n_movies} -> {len(df)}'
)

# Fill NaN values for movies with no match in keywords/credits.
df['num_keywords'] = df['num_keywords'].fillna(0).astype(int)
df['num_cast'] = df['num_cast'].fillna(0).astype(int)
df['num_crew'] = df['num_crew'].fillna(0).astype(int)
# Use the same sentinel get_director() returns, so unmatched movies are not
# mistaken for movies with a known director.
df['director'] = df['director'].fillna('Unknown')

print(f'\nFINAL MERGED DATASET: {df.shape}')
print(f'  Columns: {df.shape[1]}')

In [None]:
# Verify the joins worked correctly
print('VERIFYING JOINS:')
print('=' * 60)

total_rows = len(df)
has_keywords = df["num_keywords"] > 0
has_cast = df["num_cast"] > 0
# A missing director compares as "not equal" to "Unknown", so missing values
# must be excluded explicitly or unmatched movies are counted as known.
has_director = df["director"].notna() & (df["director"] != "Unknown")

print(f'\nMovies with keywords data: {has_keywords.sum()} / {total_rows}')
print(f'Movies with cast data: {has_cast.sum()} / {total_rows}')
print(f'Movies with known director: {has_director.sum()} / {total_rows}')

print('\n\nSAMPLE OF JOINED DATA:')
print(df[['title', 'num_keywords', 'num_cast', 'director']].head(10))

In [None]:
# Recap of the pipeline; the only dynamic value is the merged row count.
summary_lines = [
    '\n\nJOIN SUMMARY',
    '=' * 60,
    '\nWhat happened:',
    '1. All 3 datasets loaded separately',
    '2. ID columns converted to same type (Int64)',
    '3. Features extracted from keywords & credits',
    '4. LEFT JOIN performed on movies table:',
    '   movies_df ← keywords_df (on id)',
    '   result    ← credits_df (on id)',
    '5. Missing values filled with 0',
    '\nResult: Single merged dataframe with:',
    f'  - All movies ({len(df)})',
    f'  - All their attributes (genres, budget, runtime, etc.)',
    f'  - All keywords data',
    f'  - All cast/crew data',
    '\nEach row = 1 movie with complete data across all sources',
]
for summary_line in summary_lines:
    print(summary_line)