In [None]:
import pandas as pd
import numpy as np
import ast

## Loading Movie Metadata

In [174]:
# Folder holding the raw CSV files and the name of the metadata file.
# `path` is kept as a plain string because later cells append file names to it.
path = "../data"
movie_meta_file = "/movies_metadata.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of types
# (the situation pandas reports with a DtypeWarning).
df_meta = pd.read_csv(path + movie_meta_file, low_memory=False)

  df_meta = pd.read_csv(path+movie_meta_file)


In [173]:
# Display the column labels of the metadata frame to see which fields are available.
df_meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [175]:

# Parse the movie ids: strip surrounding whitespace, remove any commas, and
# convert to numbers. Values that still are not numeric become NaN instead of
# raising. (`.str.replace` edits substrings; the plain `Series.replace` used
# before only matches whole cell values, so it never removed a comma.)
movie_ids = pd.to_numeric(
    df_meta['id'].astype(str).str.strip().str.replace(',', '', regex=False),
    errors='coerce',
)

# Drop invalid movies by content (an id that cannot be parsed) rather than by
# hard-coded row labels, so the cell does not depend on the row order of the
# file and gives the same result when it is run again.
# NOTE(review): assumes the previously hard-coded rows (19730, 29503, 35587)
# are exactly the rows with non-numeric ids — confirm against the raw file.
has_valid_id = movie_ids.notna().to_numpy()
df_meta = df_meta[has_valid_id].set_index(movie_ids[has_valid_id].astype(int))

# Extract important features
meta_features = ['genres', 'imdb_id', 'original_language', 'revenue', 'release_date', 'spoken_languages', 'title', 'vote_average', 'vote_count', 'overview', 'poster_path', 'popularity']
meta = df_meta[meta_features]

## Credits and Keywords

In [176]:
# Cast and crew records, indexed by movie id
credits = pd.read_csv(path + "/credits.csv")
df_credits = credits.set_index('id')

# Keyword records, indexed by movie id
df_keywords = pd.read_csv(path + "/keywords.csv").set_index('id')

# Combine keywords, credits and the selected metadata on the movie id,
# then discard rows that are exact duplicates
df_key_credit = df_keywords.merge(df_credits, left_index=True, right_on='id')
df = df_key_credit.merge(meta, left_index=True, right_on='id').drop_duplicates()


def _director_names(raw_crew):
    """Return the names of the crew entries whose job is 'Director'.

    `raw_crew` is the text of a Python-literal list of dicts, each with at
    least a 'name' and a 'job' key.
    """
    crew_members = ast.literal_eval(raw_crew)
    return [member['name'] for member in crew_members if member['job'] == 'Director']


# Add the Director column (a list of names per movie) and drop the raw crew text
directors = df['crew'].apply(_director_names)
df = df.assign(Director=directors).drop(columns=['crew'])


## Data Cleaning

In [177]:
def _names_from_literal(raw):
    """Parse the text of a Python-literal list of dicts and return each dict's 'name'."""
    return [entry['name'] for entry in ast.literal_eval(raw)]


def _as_tokens(names):
    """Remove the spaces inside each name, then join the names with single spaces."""
    return ' '.join(name.replace(" ", "") for name in names)


# Genres: keep as a list of genre names
df['genres'] = df['genres'].apply(_names_from_literal)

# Release year as a string such as '1995'. errors='coerce' turns unparseable
# or missing dates into NaT, and strftime maps NaT to NaN.
# (The earlier check `x != np.nan` is always True, so missing dates were being
# stored as the string 'NaT' instead of a missing value.)
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').dt.strftime('%Y')

# Keywords: one space-separated string, each keyword squashed into a single token.
# No fillna is needed afterwards because the join always yields a string.
df['keywords'] = df['keywords'].apply(_names_from_literal).apply(_as_tokens)

# Overview: missing text becomes an empty string (will be used for text processing)
df['overview'] = df['overview'].fillna('')

# Cast: same treatment as keywords. Parsed with ast.literal_eval rather than
# eval so that text read from the CSV is never executed as code.
df['cast'] = df['cast'].apply(_names_from_literal).apply(_as_tokens)

# Remove movies with no directors or no genres recorded, then move the index
# into a regular column
has_genres = df['genres'].str.len() != 0
has_director = df['Director'].str.len() != 0
df = df[has_genres & has_director].reset_index()




In [154]:
# Keep only the movies that received more than 5 votes
enough_votes = df['vote_count'] > 5
df = df.loc[enough_votes]

In [178]:

# Columns to store as plain strings ("cast" used to be listed twice; every
# column is now named once, which leaves the resulting mapping unchanged).
str_cols = ["keywords", "cast", "genres", "release_date", "title",
            "overview", "Director"]

# Map each listed column to 'str' and convert them all in a single astype call
new_data_types = dict.fromkeys(str_cols, 'str')
df = df.astype(new_data_types)

### Exporting the final DataFrame to a CSV file

In [179]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
# NOTE(review): this path is relative to the working directory, while the
# inputs are read from `path` ("../data") — confirm this is the intended location.
output_csv = "data/final_movie.csv"
df.to_csv(output_csv, index=False)