In [15]:
import pandas as pd

In [16]:
#Loading Movielens Movies dataset
movies_df = pd.read_csv('./data/movielens/movies.csv')

#Extracting and creating new column "year" from the title column.
#A single capture group pulls the 4 digits out of a "(YYYY)" that ends the
#title; titles that do not end in "(YYYY)" get NA. expand=False returns a
#Series so it can be assigned to one column directly.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)$', expand=False)

#Checking and counting the number of 'NA' values in the 'year' column
year_is_na = movies_df['year'].isna()
print("Is NA values present: {}".format(year_is_na.any())) #Find if NA is present in the column
print("NA values count: {}".format(year_is_na.sum())) #Count the number of NA in the column

#Replacing 'NA' values with '0' and converting the column from string to int.
#The result is assigned back instead of calling fillna(..., inplace=True) on
#the column selection: that form is a chained assignment, which warns in
#pandas 2.x and does not modify movies_df when Copy-on-Write is enabled.
movies_df['year'] = movies_df['year'].fillna('0').astype(int)

#Sorting the movies in ascending order by year (titles without a year sort first as 0)
movies_df = movies_df.sort_values(by='year')

#Creating a new dataframe with movies starting from 2008.
#.copy() makes it independent of movies_df.
new_movies_df = movies_df.loc[movies_df['year'] >= 2008].copy()
print("Movies count {}".format(len(new_movies_df.index)))
new_movies_df.head(20)

Is NA values present: True
NA values count: 24
Movies count 2477


Unnamed: 0,movieId,title,genres,year
6850,61986,Appaloosa (2008),Western,2008
6842,61406,John Adams (2008),Drama,2008
6734,59126,Religulous (2008),Comedy|Documentary,2008
7420,80584,"Patrik Age 1.5 (Patrik 1,5) (2008)",Comedy|Drama|Romance,2008
6735,59129,Outpost (2008),Action|Horror,2008
6830,61250,"House Bunny, The (2008)",Comedy,2008
6732,59103,"Forbidden Kingdom, The (2008)",Action|Adventure|Comedy|Fantasy,2008
6922,64614,Gran Torino (2008),Crime|Drama,2008
8671,121374,Bill Burr: Why Do I Do This? (2008),Comedy,2008
6833,61262,Mirrors (2008),Horror|Mystery|Thriller,2008


In [17]:
#Movielens Links dataset
links_df = pd.read_csv('./data/movielens/external_links.csv')

#Checking and counting the number of 'NA' values in the 'tmdbId' and 'imdbId' columns
tmdb_is_na = links_df['tmdbId'].isna()
imdb_is_na = links_df['imdbId'].isna()
print("Does 'tmdbid' column contain NA values ?: {}".format(tmdb_is_na.any()))
print("NA Values count: {}".format(tmdb_is_na.sum()))
print("Does 'imdbid' column contain NA values ?: {}".format(imdb_is_na.any()))
print("NA Values count: {}".format(imdb_is_na.sum()))

#Replacing 'NA' values in 'tmdbId' with 0.
#The result is assigned back instead of calling fillna(0, inplace=True) on the
#column selection: that form is a chained assignment, which warns in pandas 2.x
#and leaves links_df untouched when Copy-on-Write is enabled (the int
#conversion below would then fail on the remaining NaN values).
links_df['tmdbId'] = links_df['tmdbId'].fillna(0)

#Checking the column datatypes (before the conversion below)
print("\n",links_df.dtypes)

#Converting the datatype of 'tmdbId' column to int
links_df = links_df.astype({'tmdbId': int})
links_df.head(5)

Does 'tmdbid' column contain NA values ?: True
NA Values count: 8
Does 'imdbid' column contain NA values ?: False
NA Values count: 0

 movieId      int64
imdbId       int64
tmdbId     float64
dtype: object


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [18]:
#Inner-join the filtered movies with the links dataframe on the shared 'movieId' key
merged_movies_df = new_movies_df.merge(links_df, on='movieId', how='inner')
merged_movies_df.head()

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId
0,61986,Appaloosa (2008),Western,2008,800308,12690
1,61406,John Adams (2008),Drama,2008,472027,118309
2,59126,Religulous (2008),Comedy|Documentary,2008,815241,13007
3,80584,"Patrik Age 1.5 (Patrik 1,5) (2008)",Comedy|Drama|Romance,2008,1067733,15179
4,59129,Outpost (2008),Action|Horror,2008,892899,9017
