# Analysis of MovieLens dataset (Beginner'sAnalysis)
https://www.kaggle.com/jneupane12/analysis-of-movielens-dataset-beginner-sanalysis

In [1]:
import sys
print(sys.executable)

## 1. First we import necessary Libaries

In [2]:
import pandas as pd # pandas is a data manipulation library
import numpy as np #provides numerical arrays and functions to manipulate the arrays efficiently
import random
import matplotlib.pyplot as plt # data visualization library
import wordcloud #used to generate world cloud
import time
import datetime
import re

## 2.Reading and Exploring the Data

### 2.1 Load Movies Data

In [3]:
movies = pd.read_csv("datasets/movies.csv")

In [4]:
movies.sample(5)

Unnamed: 0,movieId,title,genres
1285,1707,Home Alone 3 (1997),Children|Comedy
3895,5471,Perfect (1985),Drama|Romance
7057,69436,Year One (2009),Adventure|Comedy
4701,7017,Passenger 57 (1992),Action|Thriller
5681,27708,Helen of Troy (2003),Action|Adventure|Drama|Romance


In [5]:
movies.shape, movies.info(), len(movies['movieId'].unique().tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


((9742, 3), None, 9742)

### 2.2 Load Ratings Data

In [6]:
ratings = pd.read_csv("datasets/ratings.csv")
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
70244,448,112303,1.5,1440358927
98426,606,48744,3.5,1173993468
38258,263,440,4.0,941591657
77442,483,2053,1.0,1215897640
59932,387,5570,3.0,1214030229


In [8]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [9]:
ratings.shape, len(ratings['movieId'].unique().tolist())

((100836, 4), 9724)

### 2.3 Load Tags Data

In [10]:
tags = pd.read_csv("datasets/tags.csv")

In [11]:
ratings.shape, len(ratings['movieId'].unique().tolist())

((100836, 4), 9724)

In [12]:
tags.sample(5)

Unnamed: 0,userId,movieId,tag,timestamp
2979,567,3676,paranoid,1525282868
2682,477,62336,2D animation,1282924611
298,62,116897,dark comedy,1528152859
2427,474,30812,movie business,1138038949
928,424,74458,thought-provoking,1457843122


## 3.Cleaning of data

In [13]:
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [14]:
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [15]:
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [16]:
# # lets drop null rows
# tags=tags.dropna()

## 4.Data Analysis

### 4.1 Make dates more readable

In [17]:
# https://stackoverflow.com/a/62968313/2049763
def timestamp_to_date_converter(t):
    # https://realpython.com/python-time-module/
    return datetime.datetime.fromtimestamp(t).strftime("%A, %B %d, %Y %I:%M:%S")

In [18]:
ratings['date'] = ratings['timestamp'].apply(timestamp_to_date_converter)
tags['date']    = tags['timestamp'].apply(timestamp_to_date_converter)

In [19]:
ratings.sort_values(by=['timestamp'], ascending=True).tail(5)

Unnamed: 0,userId,movieId,rating,timestamp,date
81475,514,187031,2.5,1537674927,"Saturday, September 22, 2018 10:55:27"
81477,514,187595,3.0,1537674946,"Saturday, September 22, 2018 10:55:46"
81336,514,5247,2.5,1537757040,"Sunday, September 23, 2018 09:44:00"
81335,514,5246,1.5,1537757059,"Sunday, September 23, 2018 09:44:19"
81092,514,162,4.0,1537799250,"Monday, September 24, 2018 09:27:30"


In [20]:
s = "01/01/2018"
t = time.mktime(datetime.datetime.strptime(s, "%m/%d/%Y").timetuple())

In [21]:
ratings.loc[ratings.timestamp>t].sort_values(by=['timestamp'], ascending=True).head(5)

Unnamed: 0,userId,movieId,rating,timestamp,date
7181,50,2420,2.5,1514842717,"Monday, January 01, 2018 03:38:37"
7357,50,117529,1.5,1514842744,"Monday, January 01, 2018 03:39:04"
7360,50,122904,1.5,1514891080,"Tuesday, January 02, 2018 05:04:40"
7299,50,72998,2.0,1514891142,"Tuesday, January 02, 2018 05:05:42"
7291,50,63312,2.0,1514891165,"Tuesday, January 02, 2018 05:06:05"


In [22]:
ratings.loc[ratings.timestamp>t].sort_values(by=['timestamp'], ascending=True).shape

(6413, 5)

### 4.2 Reading Movie Release Year

In [23]:
# https://stackoverflow.com/a/8569258
def title_to_release_year(s):
    m = re.findall(r"\(([0-9]+)\)", s)
    
    # https://www.guru99.com/python-regular-expressions-complete-tutorial.html
    if m is None or len(m) <= 0:
        return None        
    return m[-1]

In [24]:
movies['year']  = movies['title'].apply(title_to_release_year)

In [25]:
movies.sample(5)

Unnamed: 0,movieId,title,genres,year
1412,1933,"Life of Emile Zola, The (1937)",Drama,1937
3116,4191,Alfie (1966),Comedy|Drama|Romance,1966
6775,60126,Get Smart (2008),Action|Comedy,2008
3133,4217,4 Little Girls (1997),Documentary,1997
1336,1809,Fireworks (Hana-bi) (1997),Crime|Drama,1997


In [26]:
movies.isnull().any()

movieId    False
title      False
genres     False
year        True
dtype: bool

In [27]:
# https://datatofish.com/rows-with-nan-pandas-dataframe/
movies[movies.isnull().any(axis=1)]

Unnamed: 0,movieId,title,genres,year
6059,40697,Babylon 5,Sci-Fi,
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,
9091,143410,Hyena Road,(no genres listed),
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),
9179,149334,Nocturnal Animals,Drama|Thriller,
9259,156605,Paterson,(no genres listed),
9367,162414,Moonlight,Drama,
9448,167570,The OA,(no genres listed),
9514,171495,Cosmos,(no genres listed),
9515,171631,Maria Bamford: Old Baby,(no genres listed),


In [28]:
movies = movies.dropna()

## 5. Basic Recomendation 

https://www.kaggle.com/subhamoybhaduri/diff-approaches-of-building-recommender-system

In [29]:
ratings.shape

(100836, 5)

In [30]:
# https://stackoverflow.com/a/39881230
# ratings =  ratings.loc[ratings.movieId.isin(movies.movieId)] 

movies_ratings = ratings.merge(movies, on = 'movieId', how = 'inner')
movies_ratings.shape

(100818, 8)

In [34]:
movies_ratings = movies_ratings.sort_values(['year', 'timestamp'], ascending=[True, True])
movies_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,date,title,genres,year
78683,318,32898,3.0,1414922361,"Sunday, November 02, 2014 03:59:21","Trip to the Moon, A (Voyage dans la lune, Le) ...",Action|Adventure|Fantasy|Sci-Fi,1902
78682,105,32898,4.5,1446610827,"Tuesday, November 03, 2015 10:20:27","Trip to the Moon, A (Voyage dans la lune, Le) ...",Action|Adventure|Fantasy|Sci-Fi,1902
78685,599,32898,3.0,1498689948,"Wednesday, June 28, 2017 05:45:48","Trip to the Moon, A (Voyage dans la lune, Le) ...",Action|Adventure|Fantasy|Sci-Fi,1902
78684,567,32898,3.5,1525286393,"Wednesday, May 02, 2018 01:39:53","Trip to the Moon, A (Voyage dans la lune, Le) ...",Action|Adventure|Fantasy|Sci-Fi,1902
78681,50,32898,3.5,1525359162,"Thursday, May 03, 2018 09:52:42","Trip to the Moon, A (Voyage dans la lune, Le) ...",Action|Adventure|Fantasy|Sci-Fi,1902


In [32]:
nb_users  = movies_ratings['userId'].nunique()
nb_movies = movies_ratings['movieId'].nunique()

nb_users, nb_movies 

(610, 9711)

In [33]:
ratings_matrix = movies_ratings.pivot_table(index=['userId'],columns=['movieId'],values='rating').reset_index(drop=True)
ratings_matrix.fillna(0, inplace = True)

ratings_matrix.sample(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
27,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116,0.0,3.0,3.0,0.0,3.0,3.0,4.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
273,4.0,3.5,0.0,0.0,0.0,4.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
585,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
data_matrix = np.array(ratings_matrix)
print(data_matrix.shape)

(610, 9711)


### 5.1 Gaussian Mixture Model and Expectation-Maximization Algorithm

In [39]:
from sklearn.model_selection import train_test_split

from sklearn.mixture import GaussianMixture
from scipy.special import logsumexp
import itertools

In [41]:
# split the data into train and test set
train, test = train_test_split(data_matrix, test_size=0.2, random_state=42, shuffle=True)

In [44]:
test.shape, test

((122, 9711),
 array([[2.5, 3. , 0. , ..., 0. , 0. , 0. ],
        [3.5, 2.5, 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        ...,
        [3. , 0. , 0. , ..., 0. , 0. , 0. ],
        [3.5, 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ]]))

In [43]:
# https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html#How-many-components?
gmm_model = GaussianMixture(n_components=2, covariance_type='full', 
                            tol=0.001, reg_covar=1e-06, max_iter=100, 
                            n_init=1, init_params='kmeans', weights_init=None, 
                            means_init=None, precisions_init=None, random_state=42, 
                            warm_start=False, verbose=0, verbose_interval=10)
gmm_model.fit(train)

GaussianMixture(n_components=2, random_state=42)

In [45]:
print(gmm_model.means_.shape)
print(gmm_model.covariances_.shape)
print(gmm_model.weights_.shape)

(2, 9711)
(2, 9711, 9711)
(2,)


In [50]:
gmm_model.predict(test)

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [49]:
# Fill Missing Values i.e Recommend
inver0, inver1 = gmm_model.covariances_[0], gmm_model.covariances_[1] 
inver0, inver1 = np.linalg.inv(inver0), np.linalg.inv(inver1)


deter0, deter1 = gmm_model.covariances_[0], gmm_model.covariances_[1] 
deter0, deter1 = np.linalg.det(deter0), np.linalg.det(deter1)

n, d = train.shape
K = gmm_model.means_.shape[0]
print(n, d, K)

mean = gmm_model.means_
variance = gmm_model.covariances_
weight = np.log(gmm_model.weights_)

calc = np.zeros((n, K))
ind = np.zeros((n, d))

soft = calc
X_pred = ind

add = np.zeros((n,))
dim = np.zeros((n,))
    
ind = np.where(train != 0, 1, 0)            
dim = np.sum(ind, axis=1)

for i in range(n):
    for j in range(K):
        res = train[i] - mean[j]
        res = np.multiply(res, ind[i])
        
        # Multivariate Gaussian
        if j == 0:
            A = (res.T @ inver0) @ res
            C = (dim[i]/2)*np.log(2*np.pi) + np.log(deter0 + 1e-16)/2
        else:
            A = (res.T @ inver1) @ res
            C = (dim[i]/2)*np.log(2*np.pi) + np.log(deter1 + 1e-16)/2
        
        B = 2
        calc[i, j] = weight[j] + (-A/B) - C

add = logsumexp(calc, axis = 1)

#Since the entire computation is done in log-domain to avoid Numerical instability
#we need to bring it back in its original domain
soft = np.exp(np.subtract(np.transpose(calc), add))

lg = np.sum(add)
    
X_calc = np.transpose(soft) @ gmm_model.means_

488 9711 2


In [None]:
#We will use predicted value if the entry is 0 in original rating matrix
data_matrix_pred_GMM = np.where(data_matrix == 0, X_calc, data_matrix)

for i in range(data_matrix_pred_GMM.shape[0]):
    for j in range(data_matrix_pred_GMM.shape[1]):
        data_matrix_pred_GMM[i, j] = round(data_matrix_pred_GMM[i, j])

#For measuring the performance we have to use the predicted matrix
for i in range(X_calc.shape[0]):
    for j in range(X_calc.shape[1]):
        X_pred[i, j] = round(X_calc[i, j])

In [None]:
ind_matrix = np.zeros((nb_users, nb_movies))
ind_matrix = np.where(data_matrix != 0, 1, 0)

x = np.multiply(X_pred, ind_matrix)

RMSE_GMM = np.sqrt(np.mean((x - data_matrix)**2))
print("RMSE of GMM Model is %f." %RMSE_GMM)

### 5.2 Altenating Least Squares using Non-Negative Matrix Factorization