## MovieLens 20M Dataset

In [1]:
#Import packages
import pandas as pd
import numpy as np

### Load Datasets
Load CSV files into corresponding dataframes.

In [46]:
movies = pd.read_csv('./data/ml-small/movies.csv')
ratings = pd.read_csv('./data/ml-small/ratings.csv')
tags = pd.read_csv('./data/ml-small/tags.csv')

### Preview Datasets
Display the top 5 rows of each dataframe.

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Data Cleaning
Check duplicates and missing values. Drop unwanted columns.

In [6]:
#Check Missing values
print('Number of missing values in ratings:',ratings.isnull().sum().sum())
print('Number of missing values in tags:',tags.isnull().sum().sum())
print('Number of missing values in movies:',movies.isnull().sum().sum())

Number of missing values in ratings: 0
Number of missing values in tags: 0
Number of missing values in movies: 0


In [7]:
#Check Duplicate Values
print('Number of duplicate values in ratings:',len(ratings[ratings.duplicated(['movieId','userId'])].index))
print('Number of duplicate values in tags:',len(tags[tags.duplicated(['movieId','userId','tag'])].index))
print('Number of duplicate values in movies:',len(movies[movies.duplicated(['movieId'])].index))

Number of duplicate values in ratings: 0
Number of duplicate values in tags: 0
Number of duplicate values in movies: 0


This dataset is very clean with no missing or duplicate values.

In [8]:
#Based on my research on algorithms, timestamp is irrelevent and would be remove from the dataset.
tags = tags.drop(['timestamp'],axis=1)
ratings = ratings.drop(['timestamp'],axis=1)

### Rating Normalization
Considering different users have different rating principles, some may be very tolerant to movies they don't like and some users might be harsh on ratings. A standard normalization is performed based on each users' record.

In [9]:
#Transform for calculation
ratings_mat = pd.pivot_table(ratings,values='rating',index=['movieId'],
                                  columns=['userId'],fill_value=np.nan)
ratings_mat.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [10]:
ratings['rating'].describe() 

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [11]:
#Normalization
ratings_norm = ratings_mat-ratings_mat.mean()
ratings_norm = ratings_norm.div(ratings_mat.std(),axis='columns')
ratings_norm = ratings_norm.transpose()
ratings_norm.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.457947,,-0.457947,,,-0.457947,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.367146,,,,,,,,,,...,,,,,,,,,,


In [12]:
#Stack to original format
ratings_new = ratings_norm.stack().to_frame(name='ratings')
ratings_new.reset_index(inplace=True)
ratings_new.head()

Unnamed: 0,userId,movieId,ratings
0,1,1,-0.457947
1,1,3,-0.457947
2,1,6,-0.457947
3,1,47,0.791978
4,1,50,0.791978


In [13]:
#New rating stats
ratings_new['ratings'].describe()

count    1.008160e+05
mean     3.876354e-18
std      9.969800e-01
min     -5.901622e+00
25%     -6.182146e-01
50%      1.270298e-01
75%      6.904992e-01
max      2.974428e+00
Name: ratings, dtype: float64

In [14]:
#Save to File
ratings_new.to_csv('./data/ratings_small_new.csv',index=False)

### Transform movies dataframe

In [47]:
#Split year from title and stats into datafram
movies = movies.set_index('movieId')
movies['num_ratings'] = ratings.groupby('movieId').size()
mean_ratings = ratings.groupby('movieId')['rating'].mean()
movies['avg_rating'] = mean_ratings
movies['year'] = movies.title.str[-5:-1]
movies['title'] = movies.title.str[:-6]
movies.head()

Unnamed: 0_level_0,title,genres,num_ratings,avg_rating,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093,1995
2,Jumanji,Adventure|Children|Fantasy,110.0,3.431818,1995
3,Grumpier Old Men,Comedy|Romance,52.0,3.259615,1995
4,Waiting to Exhale,Comedy|Drama|Romance,7.0,2.357143,1995
5,Father of the Bride Part II,Comedy,49.0,3.071429,1995


In [40]:
movies.head()

Unnamed: 0,movieId,title,genres,num_ratings,avg_rating,year
0,1,,Adventure|Animation|Children|Comedy|Fantasy,,,
1,2,,Adventure|Children|Fantasy,110.0,,
2,3,,Comedy|Romance,52.0,3.92093,
3,4,,Comedy|Drama|Romance,7.0,3.431818,
4,5,Fath,Comedy,49.0,3.259615,r of


In [None]:
#plot by year

In [None]:
#Genre analysis

In [None]:
#tag analysis