### This file is for preprocessing and getting all the data in the movies_metadata and credits ready

In [3]:
import numpy as np
import pandas as pd
import ast

In [4]:
# Load the cast/crew credits table from the datasets folder
credits_path = '../datasets/credits.csv'
credits_data = pd.read_csv(credits_path)

In [5]:
credits_data

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [6]:
# Import the movies_metadata
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): presumably the warning this cell emitted was a mixed-dtype
# DtypeWarning — confirm against the full cell output.
movies_metadata = pd.read_csv('../datasets/movies_metadata.csv', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


The 'release_date' feature is converted to a datetime and used to create a 'year' column, so that we can check which years the data covers

In [7]:
# Parse the release dates; values that cannot be parsed become NaT (errors='coerce')
release_dates = pd.to_datetime(movies_metadata['release_date'], errors='coerce')
movies_metadata['release_date'] = release_dates
# Year of release (NaN wherever the date is NaT)
movies_metadata['year'] = release_dates.dt.year
# Number of movies per release year, in chronological order
movies_metadata['year'].value_counts().sort_index()

1874.0       1
1878.0       1
1883.0       1
1887.0       1
1888.0       2
          ... 
2015.0    1905
2016.0    1604
2017.0     532
2018.0       5
2020.0       1
Name: year, Length: 135, dtype: int64

Now I can keep only the data for 2017, as I already have data up to 2016; there is not enough data for 2018 and 2019 for now

In [8]:
# Get the following features for 2017; ['genres','id','title','year']
# .copy() gives an independent frame, so the later assignment to
# new_metadata['id'] does not write into a slice of movies_metadata
# (which is what triggers pandas' SettingWithCopyWarning).
new_metadata = movies_metadata.loc[
    movies_metadata['year'] == 2017,
    ['genres', 'id', 'title', 'year'],
].copy()

To merge credits_data and new_metadata on the id column, the ids in both frames need to share a uniform data type.
Here I will format both to int.

In [9]:
# Cast the metadata ids to int so they can be matched against the ids in credits_data
new_metadata['id'] = new_metadata['id'].astype(int)
# Default (inner) join: keeps only the movies present in both tables
data = new_metadata.merge(credits_data, on='id')
data.head()

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de..."
1,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de..."
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de..."
3,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",283995,Guardians of the Galaxy Vol. 2,2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de..."
4,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",245842,The King's Daughter,2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de..."


Now I want to format the features in my data dataframe
- Make the object types list with the ast library
- format_genres() used to format the genres list gotten to the suitable genre column similar to final_movie_metadata
- get all the actors names from the cast feature
- get the director name from the crew feature
- Finally, I extract the actors' names, director names, title, and genres_list (matching final_movie_metadata) into a 'movie_data' dataframe

In [10]:
# The genres/cast/crew cells hold Python literals stored as text;
# parse each one into the actual Python object with ast.literal_eval.
for column in ('genres', 'cast', 'crew'):
    data[column] = data[column].map(ast.literal_eval)

In [11]:
def format_genres(z):
    """Collapse a list of genre records into one space-separated string.

    Parameters
    ----------
    z : iterable of dict
        Genre records; the name of each is read with ``.get('name')``.

    Returns
    -------
    str or float
        The genre names joined by a single space, with 'Science Fiction'
        renamed to 'Sci-Fi'; ``np.nan`` when ``z`` contains no records.
    """
    # 'Science Fiction' renamed to 'Sci-Fi'
    genre = [
        'Sci-Fi' if a.get('name') == 'Science Fiction' else a.get('name')
        for a in z
    ]
    if not genre:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return " ".join(genre)

In [12]:
# Build the space-separated genre string for every movie
data['genres_list'] = data['genres'].map(format_genres)

In [13]:
data['genres_list']

0      Adventure Action Fantasy Comedy
1      Action Adventure Fantasy Sci-Fi
2      Action Adventure Fantasy Sci-Fi
3       Action Adventure Comedy Sci-Fi
4             Fantasy Action Adventure
                    ...               
526                     Romance Comedy
527         Crime Comedy Action Family
528    Family Animation Romance Comedy
529               Crime Drama Thriller
530                                NaN
Name: genres_list, Length: 531, dtype: object

In [14]:
def make_actor1(z):
    """Return the name of the first cast member.

    Parameters
    ----------
    z : iterable of dict
        Cast records; the name of each is read with ``.get('name')``.

    Returns
    -------
    The first record's name, or ``np.nan`` when ``z`` contains no records.
    """
    casts = [a.get('name') for a in z]
    if not casts:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return casts[0]

In [15]:
def make_actor2(z):
    """Return the name of the second cast member.

    Parameters
    ----------
    z : iterable of dict
        Cast records; the name of each is read with ``.get('name')``.

    Returns
    -------
    The second record's name, or ``np.nan`` when ``z`` contains fewer than
    two records.
    """
    casts = [a.get('name') for a in z]
    # The length check also covers the empty list
    if len(casts) <= 1:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return casts[1]

In [16]:
def make_actor3(z):
    """Return the name of the third cast member.

    Parameters
    ----------
    z : iterable of dict
        Cast records; the name of each is read with ``.get('name')``.

    Returns
    -------
    The third record's name, or ``np.nan`` when ``z`` contains fewer than
    three records.
    """
    casts = [a.get('name') for a in z]
    # The length check also covers the empty list
    if len(casts) <= 2:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return casts[2]

In [17]:
def make_directors(z):
    """Return the names of all crew members whose job is 'Director'.

    Parameters
    ----------
    z : iterable of dict
        Crew records; ``.get('job')`` and ``.get('name')`` are read from each.

    Returns
    -------
    str or float
        The directors' names joined by a single space (in crew order), or
        ``np.nan`` when no record has the job 'Director'.
    """
    dt = [a.get('name') for a in z if a.get('job') == 'Director']
    if not dt:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return " ".join(dt)

In [18]:
# First three cast names, taken from the parsed 'cast' lists
data['actor_1_name'] = data['cast'].map(make_actor1)
data['actor_2_name'] = data['cast'].map(make_actor2)
data['actor_3_name'] = data['cast'].map(make_actor3)
# Director name(s), taken from the parsed 'crew' lists
data['director_name'] = data['crew'].map(make_directors)


In [19]:
# Columns carried over into movie_data, in their final order
output_columns = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'genres_list',
    'title',
]
movie_data = data.loc[:, output_columns]

In [20]:
movie_data

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres_list,title
0,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,Pirates of the Caribbean: Dead Men Tell No Tales
1,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,Justice League
2,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,Thor: Ragnarok
3,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,Action Adventure Comedy Sci-Fi,Guardians of the Galaxy Vol. 2
4,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Fantasy Action Adventure,The King's Daughter
...,...,...,...,...,...,...
526,Hannaleena Hauru,Inka Haapamäki,Rosa Honkonen,Tiitus Rantala,Romance Comedy,Thick Lashes of Lauri Mäntyvaara
527,Jonathan A. Rosenbaum,Lou Diamond Phillips,Wallace Shawn,Gina Holden,Crime Comedy Action Family,Cop and a Half: New Recruit
528,Beth David Esteban Bravo,,,,Family Animation Romance Comedy,In a Heartbeat
529,Ravi Udyawar,Sridevi Kapoor,Sajal Ali,Akshaye Khanna,Crime Drama Thriller,Mom


##### Formatting the movie_data in order to append to final_movie_metadata
- Drop all null values
- Rename the genres_list feature to genres and the title to movie_title
- make the comb column where all the names are joined together in one column
- make the comb column for the former dataset
- Now join the both dataframes

In [21]:
movie_data.isna().sum()

director_name     4
actor_1_name     22
actor_2_name     55
actor_3_name     70
genres_list       7
title             0
dtype: int64

In [22]:
# Keep only the rows in which every column has a value
movie_data = movie_data.dropna()


In [23]:
movie_data.isnull().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres_list      0
title            0
dtype: int64

In [24]:
# Rename 'genres_list' -> 'genres' and 'title' -> 'movie_title' in a single call
movie_data = movie_data.rename(
    columns={'genres_list': 'genres', 'title': 'movie_title'}
)

In [25]:
# Lower-case the movie titles
movie_data['movie_title'] = movie_data['movie_title'].str.lower()

# Build 'comb': the three actor names, the director name and the genres,
# concatenated in that order with single spaces in between
comb_text = movie_data['actor_1_name']
for column in ('actor_2_name', 'actor_3_name', 'director_name', 'genres'):
    comb_text = comb_text + ' ' + movie_data[column]
movie_data['comb'] = comb_text


In [26]:
movie_data

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,pirates of the caribbean: dead men tell no tales,Johnny Depp Javier Bardem Geoffrey Rush Joachi...
1,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,justice league,Ben Affleck Henry Cavill Gal Gadot Zack Snyder...
2,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,thor: ragnarok,Chris Hemsworth Tom Hiddleston Cate Blanchett ...
3,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,Action Adventure Comedy Sci-Fi,guardians of the galaxy vol. 2,Chris Pratt Zoe Saldana Dave Bautista James Gu...
4,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Fantasy Action Adventure,the king's daughter,Pierce Brosnan William Hurt Benjamin Walker Se...
...,...,...,...,...,...,...,...
524,Jim Strouse,Jessica Williams,Chris O'Dowd,Keith Stanfield,Romance Comedy,the incredible jessica james,Jessica Williams Chris O'Dowd Keith Stanfield ...
525,Farhad Mann,Adelaide Kane,Benjamin Hollingsworth,Jean Louisa Kelly,Romance,can't buy my love,Adelaide Kane Benjamin Hollingsworth Jean Loui...
526,Hannaleena Hauru,Inka Haapamäki,Rosa Honkonen,Tiitus Rantala,Romance Comedy,thick lashes of lauri mäntyvaara,Inka Haapamäki Rosa Honkonen Tiitus Rantala Ha...
527,Jonathan A. Rosenbaum,Lou Diamond Phillips,Wallace Shawn,Gina Holden,Crime Comedy Action Family,cop and a half: new recruit,Lou Diamond Phillips Wallace Shawn Gina Holden...


In [29]:
# Load the previously prepared dataset
old_data = pd.read_csv('../datasets/final_movie_metadata.csv')

# Build 'comb' for the old dataframe: the three actor names, the director name
# and the genres, concatenated in that order with single spaces in between
old_comb = old_data['actor_1_name']
for column in ('actor_2_name', 'actor_3_name', 'director_name', 'genres'):
    old_comb = old_comb + ' ' + old_data[column]
old_data['comb'] = old_comb

In [30]:
# Stack movie_data underneath old_data in new_data.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, keeps each frame's original index labels by default.
new_data = pd.concat([old_data, movie_data])

In [31]:
# Check if the merge was done with their respective shapes
old_data.shape, movie_data.shape, new_data.shape

((5043, 7), (458, 7), (5501, 7))

It is expected that with two datasets there must be duplicates; as such I drop duplicates in my new_data

In [32]:
# For titles that occur more than once, keep only the last occurrence
new_data = new_data.drop_duplicates(subset="movie_title", keep='last')

In [33]:
new_data

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
524,Jim Strouse,Jessica Williams,Chris O'Dowd,Keith Stanfield,Romance Comedy,the incredible jessica james,Jessica Williams Chris O'Dowd Keith Stanfield ...
525,Farhad Mann,Adelaide Kane,Benjamin Hollingsworth,Jean Louisa Kelly,Romance,can't buy my love,Adelaide Kane Benjamin Hollingsworth Jean Loui...
526,Hannaleena Hauru,Inka Haapamäki,Rosa Honkonen,Tiitus Rantala,Romance Comedy,thick lashes of lauri mäntyvaara,Inka Haapamäki Rosa Honkonen Tiitus Rantala Ha...
527,Jonathan A. Rosenbaum,Lou Diamond Phillips,Wallace Shawn,Gina Holden,Crime Comedy Action Family,cop and a half: new recruit,Lou Diamond Phillips Wallace Shawn Gina Holden...


In [34]:
# Write the combined dataset to a new CSV file, leaving out the index column
output_path = '../datasets/new_metadata.csv'
new_data.to_csv(output_path, index=False)