In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
import zipfile


keywords_zip_path = "keywords.csv.zip"
movies_metadata_zip_path = "movies_metadata.csv.zip"
credits_zip_path = "credits.csv.zip"
links_csv_path = "links.csv"

with zipfile.ZipFile(keywords_zip_path, 'r') as zip_ref:
    keywords_df = pd.read_csv(zip_ref.open(zip_ref.namelist()[0]))

with zipfile.ZipFile(movies_metadata_zip_path, 'r') as zip_ref:
    movies_metadata_df = pd.read_csv(zip_ref.open(zip_ref.namelist()[0]), low_memory=False)

with zipfile.ZipFile(credits_zip_path, 'r') as zip_ref:
    credits_df = pd.read_csv(zip_ref.open(zip_ref.namelist()[0]))

links_df = pd.read_csv(links_csv_path)

movies_metadata_df['id'] = pd.to_numeric(movies_metadata_df['id'], errors='coerce')
movies_metadata_df = movies_metadata_df.dropna(subset=['id'])
movies_metadata_df['id'] = movies_metadata_df['id'].astype(int)

links_df = links_df.dropna(subset=['tmdbId'])
links_df['tmdbId'] = links_df['tmdbId'].astype(int)

valid_ids = set(links_df['tmdbId'])

movies_metadata_filtered = movies_metadata_df[movies_metadata_df['id'].isin(valid_ids)]
credits_filtered = credits_df[credits_df['id'].isin(valid_ids)]
keywords_filtered = keywords_df[keywords_df['id'].isin(valid_ids)]

merged_df = movies_metadata_filtered.merge(credits_filtered, on='id', how='left')
master_dataset = merged_df.merge(keywords_filtered, on='id', how='left')

print("Shape of final master dataset:", master_dataset.shape)
print("\nColumn names in master dataset:\n", master_dataset.columns.tolist())


Shape of final master dataset: (46629, 27)

Column names in master dataset:
 ['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'cast', 'crew', 'keywords']


In [24]:
master_dataset['cast'] = master_dataset['cast'].fillna('[]').apply(literal_eval)
master_dataset['crew'] = master_dataset['crew'].fillna('[]').apply(literal_eval)
master_dataset['keywords'] = master_dataset['keywords'].fillna('[]').apply(literal_eval)

In [25]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [26]:
master_dataset['cast']      = master_dataset['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
master_dataset['cast']      = master_dataset['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
master_dataset['keywords']  = master_dataset['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
master_dataset['director']  = master_dataset['crew'].apply(get_director)
master_dataset['cast']          = master_dataset['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
master_dataset['main_director'] = master_dataset['director']
master_dataset['director']      = master_dataset['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
master_dataset['director']      = master_dataset['director'].apply(lambda x: [x,x,x])
s = master_dataset.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
print(s[:5])

keyword
woman director      3128
independent film    1942
murder              1314
based on novel       841
musical              734
Name: count, dtype: int64


In [27]:
s = s[s > 1]

In [28]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
master_dataset['keywords'] = master_dataset['keywords'].apply(
    lambda x: [stemmer.stem(i) for i in x if len(i) > 1]
)
master_dataset['keywords'] = master_dataset['keywords'].apply(
    lambda x: [i.replace(" ", "").lower() for i in x]
)
master_dataset['keywords'].head(3)

0    [jealousi, toy, boy, friendship, friend, rival...
1    [boardgam, disappear, basedonchildren'sbook, n...
2       [fish, bestfriend, duringcreditssting, oldmen]
Name: keywords, dtype: object

In [29]:
for col in ['keywords', 'cast', 'director', 'genres']:
    master_dataset[col] = master_dataset[col].apply(lambda x: x if isinstance(x, list) else [])
master_dataset['soup'] = master_dataset['keywords'] + master_dataset['cast'] + master_dataset['director'] + master_dataset['genres']
master_dataset['soup'] = master_dataset['soup'].apply(lambda x: ' '.join([str(i) for i in x]))
master_dataset['soup'].head(3)

0    jealousi toy boy friendship friend rivalri boy...
1    boardgam disappear basedonchildren'sbook newho...
2    fish bestfriend duringcreditssting oldmen walt...
Name: soup, dtype: object

In [30]:
print(master_dataset.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director',
       'main_director', 'soup'],
      dtype='object')


In [33]:
master_dataset.to_csv('master_dataset_new.csv', index=False)

#### Week 4

In [34]:
master_dataset_new = pd.read_csv('master_dataset_new.csv')
print(master_dataset_new.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director',
       'main_director', 'soup'],
      dtype='object')


In [35]:
master_dataset_new.drop(['adult', 'belongs_to_collection', 'budget','homepage','original_language', 'production_companies','production_countries', 'revenue', 'runtime','spoken_languages','status','video'],axis=1,inplace=True)
master_dataset_new.drop(['overview', 'tagline','vote_average', 'vote_count', 'cast', 'crew','keywords', 'director'],axis=1,inplace=True)
master_dataset_new.drop(['id','imdb_id','original_title','poster_path','genres'],axis=1,inplace=True)
master_dataset_new['popularity'] = master_dataset_new.apply(lambda r: r['popularity'] if type(r['popularity']) == float else np.nan, axis=1)
master_dataset_new.dropna(inplace=True)
master_dataset_new['main_director'] = master_dataset_new.apply(lambda r: r['main_director'] if len(r['main_director']) > 1 else np.nan, axis=1)
master_dataset_new.dropna(inplace=True)
master_dataset_new.sort_values(by='popularity', ascending=False, inplace=True)
master_dataset_new.drop(['popularity'], axis=1, inplace=True)
master_dataset_new.dropna(inplace=True)
master_dataset_new.reset_index(inplace=True, drop=True)
master_dataset_new['release_date'] = master_dataset_new.apply(lambda r: r['release_date'] if len(r['release_date']) > 1 else np.nan, axis=1)
master_dataset_new.dropna(inplace=True)
master_dataset_new = master_dataset_new[:2500]


In [36]:
master_dataset_new.head()

Unnamed: 0,release_date,title,main_director,soup
0,2015-06-17,Minions,Kyle Balda,assist aftercreditssting duringcreditssting ev...
1,2017-05-30,Wonder Woman,Patty Jenkins,dccomic hero greekmytholog island worldwari su...
2,2017-03-16,Beauty and the Beast,Bill Condon,franc magic castl fairytal music curs anthropo...
3,2017-06-28,Baby Driver,Edgar Wright,robberi atlanta music crimeboss romanc tinnitu...
4,2014-10-24,Big Hero 6,Chris Williams,brotherbrotherrelationship hero talent reveng ...


In [37]:
print(master_dataset_new.shape)

(2500, 4)


In [38]:
master_dataset_new.to_csv('master_dataset_final.csv', index=False)