<span style='color:#FFFF99; font-size:20pt'>Import Libraries</span>

In [70]:
import pandas as pd
import re

<span style='color:#FFFF99; font-size:20pt'>Loading Dataset</span>

In [71]:
movielens_movies = pd.read_csv('movielens-1m/movies.dat', sep='::', engine='python', 
                        names=['movieId', 'title', 'genres'], encoding='ISO-8859-1')

In [75]:
tmdb = pd.read_csv('top10K-TMDB-movies.csv')

In [72]:
movielens_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


<span style='color:#FFFF99; font-size:20pt'>Data Cleaning</span>

In [None]:
# Split year out of title into a new column
movielens_movies['year'] = movielens_movies['title'].str.extract(r'\((\d{4})\)')
movielens_movies['title'] = movielens_movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

In [74]:
movielens_movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents,Comedy,2000
3879,3949,Requiem for a Dream,Drama,2000
3880,3950,Tigerland,Drama,2000
3881,3951,Two Family House,Drama,2000


In [76]:
tmdb['release_date'] = tmdb['release_date'].astype(str).str[:4]

In [77]:
tmdb.head(2)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995,8.7,3731


In [78]:
merged = pd.merge(movielens_movies, tmdb, left_on=['title', 'year'], right_on=['title', 'release_date'], how='left')

In [87]:
merged.head(5)

Unnamed: 0,movieId,title,genres,year,description
0,1,Toy Story,Animation|Children's|Comedy,1995,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji,Adventure|Children's|Fantasy,1995,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men,Comedy|Romance,1995,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale,Comedy|Drama,1995,
4,5,Father of the Bride Part II,Comedy,1995,Just when George Banks has recovered from his ...


In [80]:
merged = merged.drop(['id', 'genre', 'original_language', 'popularity', 'release_date', 'vote_average', 'vote_count'], axis=1)

In [81]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
 3   year      3883 non-null   object
 4   overview  1234 non-null   object
dtypes: int64(1), object(4)
memory usage: 151.8+ KB


In [82]:
merged.isnull().sum()

movieId        0
title          0
genres         0
year           0
overview    2649
dtype: int64

In [83]:
merged.isna().sum()

movieId        0
title          0
genres         0
year           0
overview    2649
dtype: int64

In [None]:
# pd.set_option('display.max_rows', None) # This code will display all of the dataframe
pd.reset_option('all') # Reset to default

# merged[['title', 'overview']].head(5)
merged[pd.isna(merged['overview'])]

  pd.reset_option('all') # Reset to default
  pd.reset_option('all') # Reset to default


Unnamed: 0,movieId,title,genres,year,overview
3,4,Waiting to Exhale,Comedy|Drama,1995,
7,8,Tom and Huck,Adventure|Children's,1995,
10,11,"American President, The",Comedy|Drama|Romance,1995,
25,26,Othello,Drama,1995,
27,28,Persuasion,Romance,1995,
...,...,...,...,...,...
3872,3942,Sorority House Massacre II,Horror,1990,
3873,3943,Bamboozled,Comedy,2000,
3874,3944,Bootmen,Comedy|Drama,2000,
3881,3951,Two Family House,Drama,2000,


In [85]:
# Title got error E.g: "Fear, The"
# Fix titles by using regex
def fix_title_regex(title):
    return re.sub(r"^(.*), (The|A|An|L'|Le)( \(.+\))?$", r'\2 \1\3', title)
merged['title'] = merged['title'].apply(fix_title_regex)

In [86]:
merged = merged.rename(columns={'overview':'description'})
merged

Unnamed: 0,movieId,title,genres,year,description
0,1,Toy Story,Animation|Children's|Comedy,1995,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji,Adventure|Children's|Fantasy,1995,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men,Comedy|Romance,1995,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale,Comedy|Drama,1995,
4,5,Father of the Bride Part II,Comedy,1995,Just when George Banks has recovered from his ...
...,...,...,...,...,...
3878,3948,Meet the Parents,Comedy,2000,"Greg Focker is ready to marry his girlfriend, ..."
3879,3949,Requiem for a Dream,Drama,2000,The hopes and dreams of four ambitious people ...
3880,3950,Tigerland,Drama,2000,A group of recruits go through Advanced Infant...
3881,3951,Two Family House,Drama,2000,
