# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import ast

# Reading the data

In [2]:
# load the credits table (columns: cast, crew, id) from the datasets folder
credits = pd.read_csv(filepath_or_buffer='../datasets/credits.csv')

In [3]:
# preview the first 5 rows of the credits table
credits.head(n=5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [4]:
# (number of rows, number of columns) of the credits table
credits.shape

(45476, 3)

In [5]:
# load the movie metadata table; low_memory=False makes pandas parse the
# file in a single pass rather than in chunks
meta = pd.read_csv(filepath_or_buffer='../datasets/movies_metadata.csv', low_memory=False)

In [6]:
# preview the first 5 rows of the metadata table
meta.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
# (number of rows, number of columns) of the metadata table
meta.shape

(45466, 24)

In [8]:
# data type of every column in the metadata table
meta.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [9]:
# parse release_date into pandas datetimes; errors='coerce' turns values
# that cannot be parsed into NaT instead of raising
meta['release_date'] = pd.to_datetime(arg=meta['release_date'], errors='coerce')

In [10]:
# extracting the calendar year from release date into a new 'year' column
# (the values are floats, e.g. 1995.0, as the outputs below show)
meta['year'] = meta['release_date'].dt.year

In [11]:
# number of movies released in each year, ordered chronologically
(
    meta['year']
    .value_counts()
    .sort_index()
)

1874.0       1
1878.0       1
1883.0       1
1887.0       1
1888.0       2
          ... 
2015.0    1905
2016.0    1604
2017.0     532
2018.0       5
2020.0       1
Name: year, Length: 135, dtype: int64

In [12]:
# keeping only the movies released up to 2017, because there is much less data
# for the following years (rows whose year is NaN fail the comparison and are
# dropped as well)
cols_needed = ['genres','id','title','year']
# .copy() gives new_meta its own data, so overwriting or adding columns on it
# later does not trigger pandas' SettingWithCopyWarning
new_meta = meta.loc[meta['year']<=2017, cols_needed].copy()

In [13]:
# preview the first 5 rows of the filtered metadata
new_meta.head(n=5)

Unnamed: 0,genres,id,title,year
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story,1995.0
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,Jumanji,1995.0
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,Grumpier Old Men,1995.0
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,Waiting to Exhale,1995.0
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,1995.0


In [14]:
# converting the data type of id to integer
# assign() builds a new frame instead of writing a column into new_meta in
# place, which avoids SettingWithCopyWarning when new_meta was obtained by
# slicing another frame; the column keeps its original position
new_meta = new_meta.assign(id=new_meta['id'].astype(int))

# Creating a dataframe which contains the movie data up to 2017

In [15]:
# join the filtered metadata with the credits table on the shared 'id' column
# (pandas' default inner join)
data = new_meta.merge(right=credits, on='id')

In [16]:
# truncate long cell text to 75 characters when frames are displayed
pd.options.display.max_colwidth = 75
data.head(n=5)

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': ...",862,Toy Story,1995.0,"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3...","[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', '..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id':...",8844,Jumanji,1995.0,"[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a3...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', ..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]",15602,Grumpier Old Men,1995.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'credit_id': '52fe466a92514...","[{'credit_id': '52fe466a9251416c75077a89', 'department': 'Directing', '..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 1074...",31357,Waiting to Exhale,1995.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah' Jackson"", 'credit_id': ...","[{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', '..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,1995.0,"[{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251...","[{'credit_id': '52fe44959251416c75039ed7', 'department': 'Sound', 'gend..."


In [17]:
# genres, cast and crew hold strings containing Python literals;
# ast.literal_eval parses each string into the Python object it describes
data['genres'] = data['genres'].map(ast.literal_eval)
data['cast'] = data['cast'].map(ast.literal_eval)
data['crew'] = data['crew'].map(ast.literal_eval)

In [18]:
# cleaning genres column
def make_gen_list(x):
    """Collapse a movie's genre records into one space-separated string.

    Parameters
    ----------
    x : iterable of dict
        Genre records; each record's name is read with ``.get('name')``.

    Returns
    -------
    str or float
        The genre names joined by single spaces, with 'Science Fiction'
        shortened to 'Sci-Fi'. ``np.nan`` when no genre name is found.
    """
    names = []
    for genre in x:
        name = genre.get('name')
        if name is None:
            # a record without a name would make str.join raise TypeError
            continue
        names.append('Sci-Fi' if name == 'Science Fiction' else name)
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    return ' '.join(names) if names else np.nan

In [None]:
# build the genres_list column by applying make_gen_list to each movie's genres
data['genres_list'] = data['genres'].apply(make_gen_list)

In [None]:
# inspect the newly created genres_list column
data['genres_list']

In [None]:
# actor 1
def get_actor1(x):
    """Return the name of the first cast entry.

    Parameters
    ----------
    x : iterable of dict
        Cast records; each record's name is read with ``.get('name')``.

    Returns
    -------
    The 'name' value of the first record, or ``np.nan`` when ``x`` is empty.
    """
    names = [member.get('name') for member in x]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    return names[0] if names else np.nan

In [None]:
# build the actor_1_name column from each movie's cast list
data['actor_1_name'] = data['cast'].apply(get_actor1)

In [None]:
# actor 2
def get_actor2(x):
    """Return the name of the second cast entry.

    Parameters
    ----------
    x : iterable of dict
        Cast records; each record's name is read with ``.get('name')``.

    Returns
    -------
    The 'name' value of the second record, or ``np.nan`` when ``x`` has
    fewer than two records.
    """
    names = [member.get('name') for member in x]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    return names[1] if len(names) > 1 else np.nan

In [None]:
# build the actor_2_name column from each movie's cast list
data['actor_2_name'] = data['cast'].apply(get_actor2)

In [None]:
# actor 3
def get_actor3(x):
    """Return the name of the third cast entry.

    Parameters
    ----------
    x : iterable of dict
        Cast records; each record's name is read with ``.get('name')``.

    Returns
    -------
    The 'name' value of the third record, or ``np.nan`` when ``x`` has
    fewer than three records.
    """
    names = [member.get('name') for member in x]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    return names[2] if len(names) > 2 else np.nan

In [None]:
# build the actor_3_name column from each movie's cast list
data['actor_3_name'] = data['cast'].apply(get_actor3)

In [None]:
# directors
def get_directors(x):
    """Collect the names of all crew members whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew records; each is read with ``.get('job')`` and ``.get('name')``.

    Returns
    -------
    str or float
        The directors' names joined by single spaces, in crew order.
        ``np.nan`` when no named director is found.
    """
    names = [
        member.get('name')
        for member in x
        if member.get('job') == 'Director'
    ]
    # a director record without a name would make str.join raise TypeError
    names = [name for name in names if name is not None]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    return ' '.join(names) if names else np.nan

In [None]:
# build the director_name column from each movie's crew list
data['director_name'] = data['crew'].map(get_directors)

In [None]:
# movie dataset: keep only the derived name/genre columns plus the title
movie = data[[
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'genres_list',
    'title',
]]

In [None]:
# count of missing values in each column
movie.isna().sum()

In [None]:
# remove every row that has a missing value in at least one column
movie = movie.dropna(axis='index', how='any')

In [None]:
# re-count missing values per column after dropping incomplete rows
movie.isna().sum()

In [None]:
# rename genres_list -> genres and title -> movie_title in one pass,
# then lower-case the titles
movie = movie.rename(columns={'genres_list': 'genres', 'title': 'movie_title'})
movie['movie_title'] = movie['movie_title'].str.lower()

In [None]:
# dropping rows that share a movie_title; keep='last' retains the last row
# for each title. Rebinding the result (instead of inplace=True) leaves the
# input frame untouched, so re-running this cell stays predictable.
# NOTE(review): different movies that share a title are collapsed into one row
# here — confirm that this is intended.
movie = movie.drop_duplicates(subset="movie_title", keep='last')

In [None]:
# final dataset: display the cleaned, de-duplicated movie table
movie

In [None]:
# write the final dataset to CSV without the index column
movie.to_csv(path_or_buf='../datasets/movies_upto_2017.csv', index=False)