## Importing Libraries

In [1]:
import ast
import json
import matplotlib.pyplot as plt
import pandas as pd

# Metadata

## Data Loading

In [2]:
# Load the raw movies metadata into df_ori and preview the first rows.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids per-chunk type inference producing mixed-type columns.
df_ori = pd.read_csv("data/movies_metadata.csv", low_memory=False)
df_ori.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,30/10/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,15/12/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,22/12/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,22/12/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,10/2/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
df_ori.shape

(45466, 24)

In [4]:
df_ori.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

### Checking if there are missing values.

In [5]:
df_ori.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

## Data Cleaning

In [6]:
# Work on a copy so that later changes to df_metadata do not affect df_ori.
df_metadata = df_ori.copy()

### Columns that are needed to build our recommendation system

In [7]:
# Keep only the columns used further down: the id/title, the text-like features,
# and release_date/status, which the cleaning step uses for de-duplication and filtering.
df_metadata = df_metadata[['id', 'title', 'genres', 'original_language', 'overview', 'tagline', 'production_countries', 'release_date', 'status']]

In [8]:
df_metadata.head()

Unnamed: 0,id,title,genres,original_language,overview,tagline,production_countries,release_date,status
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,"Led by Woody, Andy's toys live happily in his ...",,"[{'iso_3166_1': 'US', 'name': 'United States o...",30/10/1995,Released
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"[{'iso_3166_1': 'US', 'name': 'United States o...",15/12/1995,Released
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"[{'iso_3166_1': 'US', 'name': 'United States o...",22/12/1995,Released
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,"[{'iso_3166_1': 'US', 'name': 'United States o...",22/12/1995,Released
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"[{'iso_3166_1': 'US', 'name': 'United States o...",10/2/1995,Released


### Check whether duplicate titles share the same release date

In [9]:
df_metadata[["title", "release_date"]].duplicated().sum()

32

### Number of movies with no overviews

In [10]:
df_metadata[df_metadata.overview.isnull()].shape[0]

954

### Number of movies that have not yet been released

In [11]:
df_metadata[df_metadata.status != "Released"].shape[0]

452

**TODO:**  

We will remove movies which:
- have the same title and release date
- have no overview
- have not yet been released

### Genres and Production Countries

In [12]:
df_metadata['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [13]:
df_metadata['production_countries'][0]

"[{'iso_3166_1': 'US', 'name': 'United States of America'}]"

**TODO:**  

We need to extract the feature names from the data.

### Extract the movie feature names from the data

In [14]:
def extract_feature_names(feature):
    """Return the 'name' values of a stringified list of dicts, joined by spaces.

    The raw columns store values such as
    "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}]",
    which this function turns into "Animation Comedy".

    Parameters
    ----------
    feature : str
        Python-literal text describing a list of dicts that each have a
        'name' key. Any other input (None, NaN, empty string, ...) is
        treated as "no data".

    Returns
    -------
    str
        The names separated by single spaces, or "" when the input is
        empty, is not a string, cannot be parsed, or does not have the
        expected list-of-dicts shape.
    """
    # Non-string values (e.g. NaN from a missing cell) and blank strings carry no names.
    if not isinstance(feature, str) or not feature.strip():
        return ""

    try:
        # literal_eval safely parses the Python-literal text without executing code.
        records = ast.literal_eval(feature)
        return ' '.join(record['name'] for record in records)
    except (ValueError, SyntaxError, TypeError, KeyError):
        # ValueError/SyntaxError: text is not a valid literal.
        # TypeError: parsed value is not an iterable of dicts, or a name is not a string.
        # KeyError: a dict has no 'name' entry.
        return ""

In [15]:
def clean_movies_data_set(df):
    """Return a cleaned copy of the movies metadata frame.

    The input frame is left untouched; all steps run on a new frame, so the
    cell calling this function can be re-run safely.

    Steps, in order:
      1. drop rows that repeat an earlier (title, release_date) pair;
      2. drop rows without an overview or whose status is not 'Released';
      3. replace missing values (and empty "[]" production_countries) with "";
      4. convert 'genres' and 'production_countries' from stringified lists
         of dicts to space-separated names via extract_feature_names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'title', 'release_date', 'overview',
        'status', 'genres' and 'production_countries'.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Row labels of the surviving rows are kept as-is
        (the index is not reset).
    """
    print(f"The number of movies in the original data set is: {df.shape[0]}")

    # Removes duplicate titles that have the same release date (first occurrence is kept)
    cleaned = df.drop_duplicates(subset=['title', 'release_date'])

    # Removes movies that have no overview or have not yet been released
    to_drop = cleaned['overview'].isnull() | (cleaned['status'] != 'Released')
    # .copy() gives an independent frame so the assignments below never write into a view of df
    cleaned = cleaned.loc[~to_drop].copy()

    # Marks rows whose production_countries is an empty list ("[]") as missing
    cleaned.loc[cleaned['production_countries'] == "[]", 'production_countries'] = pd.NA

    # Replaces all the null values with empty string
    cleaned = cleaned.fillna("")

    # Extracts the genre names and production countries from the data
    for column in ('genres', 'production_countries'):
        cleaned[column] = cleaned[column].apply(extract_feature_names)

    print(f"The number of movies in the cleaned data set is: {cleaned.shape[0]}")

    return cleaned

In [16]:
df_metadata = clean_movies_data_set(df_metadata)

The number of movies in the original data set is: 45466
The number of movies in the cleaned data set is: 44065


In [17]:
df_metadata.head()

Unnamed: 0,id,title,genres,original_language,overview,tagline,production_countries,release_date,status
0,862,Toy Story,Animation Comedy Family,en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released
1,8844,Jumanji,Adventure Fantasy Family,en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released
2,15602,Grumpier Old Men,Romance Comedy,en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released
3,31357,Waiting to Exhale,Comedy Drama Romance,en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released
4,11862,Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released


# Keywords

In [18]:
# Load the keywords file: each row pairs a movie id with a stringified list of keyword dicts.
df_keywords = pd.read_csv('data/keywords.csv')

In [19]:
df_keywords.shape

(46419, 2)

In [20]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [21]:
df_keywords.keywords[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [22]:
# Collapse each stringified keyword list into one space-separated string of keyword names.
# Note that multi-word keywords (e.g. 'boy next door') lose their boundaries in the joined text.
df_keywords['keywords'] = df_keywords['keywords'].apply(extract_feature_names)

In [23]:
df_keywords.keywords[0]

'jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life'

In [24]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,jealousy toy boy friendship friends rivalry bo...
1,8844,board game disappearance based on children's b...
2,15602,fishing best friend duringcreditsstinger old men
3,31357,based on novel interracial relationship single...
4,11862,baby midlife crisis confidence aging daughter ...


## Merge two data frames

In [25]:
# Cast both id columns to str so the merge keys below have the same type on both sides
# (the metadata id column is object dtype according to df_ori.info()).
df_metadata['id'] = df_metadata['id'].astype(str)
df_keywords['id'] = df_keywords['id'].astype(str)

In [None]:
# NOTE(review): this cell has no execution count and repeats the column selection that was
# already applied to df_metadata in the Data Cleaning section; it looks redundant.
df_metadata = df_metadata[['id', 'title', 'genres', 'original_language', 'overview', 'tagline', 'production_countries', 'release_date', 'status']]

In [26]:
# Inner-join the keywords onto the cleaned metadata by id, then fix the column order.
# NOTE(review): the result has 45001 rows (see df_merge.shape below) although the cleaned
# metadata has 44065, so some ids are repeated in at least one of the frames and the join
# duplicates those movies. Consider dropping duplicate ids before merging.
df_merge = pd.merge(df_keywords, df_metadata, on='id', how='inner')[['id', 'title', 'genres', 'original_language', 'overview', 'tagline', 'keywords', 'production_countries', 'release_date', 'status']]

In [28]:
df_merge.head()

Unnamed: 0,id,title,genres,original_language,overview,tagline,keywords,production_countries,release_date,status
0,862,Toy Story,Animation Comedy Family,en,"Led by Woody, Andy's toys live happily in his ...",,jealousy toy boy friendship friends rivalry bo...,United States of America,30/10/1995,Released
1,8844,Jumanji,Adventure Fantasy Family,en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,board game disappearance based on children's b...,United States of America,15/12/1995,Released
2,15602,Grumpier Old Men,Romance Comedy,en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,fishing best friend duringcreditsstinger old men,United States of America,22/12/1995,Released
3,31357,Waiting to Exhale,Comedy Drama Romance,en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,based on novel interracial relationship single...,United States of America,22/12/1995,Released
4,11862,Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,baby midlife crisis confidence aging daughter ...,United States of America,10/2/1995,Released


In [32]:
df_merge.shape

(45001, 10)

In [54]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45001 entries, 0 to 45000
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    45001 non-null  object
 1   title                 45001 non-null  object
 2   genres                45001 non-null  object
 3   original_language     45001 non-null  object
 4   overview              45001 non-null  object
 5   tagline               45001 non-null  object
 6   keywords              45001 non-null  object
 7   production_countries  45001 non-null  object
 8   release_date          45001 non-null  object
 9   status                45001 non-null  object
dtypes: object(10)
memory usage: 3.4+ MB


### Save the merged data frame to a csv file

In [37]:
# Persist the merged frame; index=False leaves the row index out of the file.
# NOTE(review): the 'soup' column is only added to df_merge in a later cell, so on a
# top-to-bottom run this file will not contain it, even though the df.info() output
# shown after reloading lists a 'soup' column.
df_merge.to_csv("data/filtered_data.csv", index=False)

In [38]:
# Reload the saved file to inspect the round trip.
# NOTE(review): read_csv parses empty fields as NaN by default, so columns that held empty
# strings in df_merge (e.g. tagline, keywords) show missing values in the df.info() output
# below, and id comes back as int64 instead of str.
df = pd.read_csv("data/filtered_data.csv")

In [40]:
df.head()

Unnamed: 0,id,title,genres,original_language,overview,tagline,keywords,production_countries,release_date,status
0,862,Toy Story,Animation Comedy Family,en,"Led by Woody, Andy's toys live happily in his ...",,jealousy toy boy friendship friends rivalry bo...,United States of America,30/10/1995,Released
1,8844,Jumanji,Adventure Fantasy Family,en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,board game disappearance based on children's b...,United States of America,15/12/1995,Released
2,15602,Grumpier Old Men,Romance Comedy,en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,fishing best friend duringcreditsstinger old men,United States of America,22/12/1995,Released
3,31357,Waiting to Exhale,Comedy Drama Romance,en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,based on novel interracial relationship single...,United States of America,22/12/1995,Released
4,11862,Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,baby midlife crisis confidence aging daughter ...,United States of America,10/2/1995,Released


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45001 entries, 0 to 45000
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    45001 non-null  int64 
 1   title                 45001 non-null  object
 2   genres                42791 non-null  object
 3   original_language     44992 non-null  object
 4   overview              45001 non-null  object
 5   tagline               20581 non-null  object
 6   keywords              31221 non-null  object
 7   production_countries  38957 non-null  object
 8   release_date          44936 non-null  object
 9   status                45001 non-null  object
 10  soup                  45001 non-null  object
dtypes: int64(1), object(10)
memory usage: 3.8+ MB


## Create a new column named 'soup': a single string containing all the data that we want to feed to the BERT model.
- Soup: genres, original language, overview, tagline, keywords,  production countries

In [49]:
def create_soup(row):
    """Build the lower-cased 'soup' text for one movie row.

    The soup is the concatenation, separated by single spaces, of:
    genres, original_language, overview, tagline, keywords and
    production_countries (in that order).

    Parameters
    ----------
    row : pandas.Series or any object exposing those fields as attributes
        One movie record.

    Returns
    -------
    str
        The combined, lower-cased text. A field that is missing
        (None / NaN / pd.NA) or absent contributes an empty string, so the
        remaining fields are still used instead of discarding the whole row.
        Non-string values are converted with str().
    """
    soup_fields = ('genres', 'original_language', 'overview', 'tagline',
                   'keywords', 'production_countries')

    parts = []
    for field in soup_fields:
        value = getattr(row, field, "")
        if isinstance(value, str):
            parts.append(value)
        elif pd.api.types.is_scalar(value) and pd.isna(value):
            # Missing value, e.g. an empty CSV field that pandas loaded as NaN.
            parts.append("")
        else:
            parts.append(str(value))

    # Always six parts, so the spacing matches plain "a + ' ' + b + ..." concatenation.
    return " ".join(parts).lower()

In [56]:
# Build the 'soup' text for every movie; axis=1 passes each row to create_soup.
df_merge["soup"] = df_merge.apply(create_soup, axis = 1)

In [58]:
df_merge.soup

0        animation comedy family en led by woody, andy'...
1        adventure fantasy family en when siblings judy...
2        romance comedy en a family wedding reignites t...
3        comedy drama romance en cheated on, mistreated...
4        comedy en just when george banks has recovered...
                               ...                        
44996    drama family fa rising and falling between a m...
44997    drama tl an artist struggles to finish his wor...
44998    action drama thriller en when one of her hits ...
44999     en in a small town live two brothers, one a m...
45000     en 50 years after decriminalisation of homose...
Name: soup, Length: 45001, dtype: object