In [None]:
import pandas as pd
import numpy as np
import ast

## Loading Movie Meta Data

In [106]:
path = "../data"
movie_meta_file = "/movies_metadata.csv"
df_meta = pd.read_csv(path+movie_meta_file)

  df_meta = pd.read_csv(path+movie_meta_file)


In [107]:

df_meta = df_meta.drop([19730, 29503, 35587]) # drop invalid movies
df_meta = df_meta.set_index(df_meta['id'].str.strip().replace(',','').astype(int)) 


# Extract important features
meta_features = ['genres', 'imdb_id', 'original_language', 'revenue', 'release_date', 'spoken_languages', 'title', 'vote_average', 'vote_count', 'overview', 'poster_path', 'popularity']
meta = df_meta[meta_features]

## Credits and Keywords

In [108]:
# credits 
credits = pd.read_csv(path+ "/credits.csv")

# Create col for directors
credits['Director'] = credits['crew'].apply(lambda x:[dct['name'] for dct in ast.literal_eval(x) if dct['job'] == 'Director'])
df_credits = credits.set_index('id') # set index to movie_id 


# Keywords 
df_keywords = pd.read_csv(path + "/keywords.csv")
df_keywords = df_keywords.set_index('id')  # set index to movie_id


# Merging datasets
df_key_credit = df_keywords.merge(df_credits, left_index=True, right_on='id')
df = df_key_credit.merge(meta, left_index=True, right_on='id')

df.drop(['crew'], axis=1, inplace=True) # drop crew

## Data Cleaning

In [109]:
# Retrieving releavant data 
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)]) # Returns a list of genres

df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan) # returns the year of release

# Keyword extraction and cleanup
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)])
df['keywords'] = df['keywords'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))
df['keywords'].fillna("", inplace=True) # replace missing values with empty string

# extract the overview
df['overview'] = df['overview'].fillna('') # will be used for text processing

# Extract cast
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in eval(x)]) 
df['cast'] = df['cast'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))
df['cast'].fillna("", inplace=True) # replace missing values with empty string



df = df[(df['genres'].str.len() != 0) & (df['genres'].str.len() != 0)] # remove movies with no directors or genres recorded

df.reset_index(inplace=True) # reset index



In [112]:
df

Unnamed: 0,id,keywords,cast,Director,genres,imdb_id,original_language,revenue,release_date,spoken_languages,title,vote_average,vote_count,overview,poster_path,popularity
0,862,jealousy toy boy friendship friends rivalry bo...,TomHanks TimAllen DonRickles JimVarney Wallace...,[John Lasseter],"[Animation, Comedy, Family]",tt0114709,en,373554033.0,1995,"[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,21.946943
1,8844,boardgame disappearance basedonchildren'sbook ...,RobinWilliams JonathanHyde KirstenDunst Bradle...,[Joe Johnston],"[Adventure, Fantasy, Family]",tt0113497,en,262797249.0,1995,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji,6.9,2413.0,When siblings Judy and Peter discover an encha...,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,17.015539
2,15602,fishing bestfriend duringcreditsstinger oldmen,WalterMatthau JackLemmon Ann-Margret SophiaLor...,[Howard Deutch],"[Romance, Comedy]",tt0113228,en,0.0,1995,"[{'iso_639_1': 'en', 'name': 'English'}]",Grumpier Old Men,6.5,92.0,A family wedding reignites the ancient feud be...,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,11.7129
3,31357,basedonnovel interracialrelationship singlemot...,WhitneyHouston AngelaBassett LorettaDevine Lel...,[Forest Whitaker],"[Comedy, Drama, Romance]",tt0114885,en,81452156.0,1995,"[{'iso_639_1': 'en', 'name': 'English'}]",Waiting to Exhale,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,3.859495
4,11862,baby midlifecrisis confidence aging daughter m...,SteveMartin DianeKeaton MartinShort KimberlyWi...,[Charles Shyer],[Comedy],tt0113041,en,76578911.0,1995,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II,5.7,173.0,Just when George Banks has recovered from his ...,/e64sOI48hQXyru7naBFyssKFxVd.jpg,8.387519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44099,222848,,LisaBoyle KenaLand ZanetaPolard DonYanan Debra...,[Aaron Osborne],[Science Fiction],tt0112613,en,0.0,1995,"[{'iso_639_1': 'en', 'name': 'English'}]",Caged Heat 3000,3.5,1.0,It's the year 3000 AD. The world's most danger...,/4lF9LH0b0Z1X94xGK9IOzqEW6k1.jpg,0.661558
44100,30840,,PatrickBergin UmaThurman DavidMorrissey Jürgen...,[John Irvin],"[Drama, Action, Romance]",tt0102797,en,0.0,1991,"[{'iso_639_1': 'en', 'name': 'English'}]",Robin Hood,5.7,26.0,"Yet another version of the classic epic, with ...",/fQC46NglNiEMZBv5XHoyLuOWoN5.jpg,5.683753
44101,439050,tragiclove,LeilaHatami KouroshTahami ElhamKorda,[Hamid Nematollah],"[Drama, Family]",tt6209470,fa,0.0,NaT,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Subdue,4.0,1.0,Rising and falling between a man and woman.,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,0.072051
44102,111109,artist play pinoy,AngelAquino PerryDizon HazelOrencio JoelTorre ...,[Lav Diaz],[Drama],tt2028550,tl,0.0,2011,"[{'iso_639_1': 'tl', 'name': ''}]",Century of Birthing,9.0,3.0,An artist struggles to finish his work while a...,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,0.178241


### Converting final dataframe to csv file

In [113]:
df.to_csv("final_movie.csv",index=False)