# Exploratory Data Analysis

1.  Which movies have maximum views/ratings?
2.  What is the average rating for each movie? Define the top 5 movies with the maximum ratings.
3.  Define the top 5 movies with the least audience.

In [None]:
import pandas as pd
import numpy as np

In [None]:
data = pd.read_csv('Amazon - Movies and TV Ratings.csv')

In [None]:
data.head(5)

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,


In [None]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Movie1,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie2,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie3,1.0,2.000000,,2.0,2.00,2.0,2.0,2.0
Movie4,2.0,5.000000,0.000000,5.0,5.00,5.0,5.0,5.0
Movie5,29.0,4.103448,1.496301,1.0,4.00,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...
Movie202,6.0,4.333333,1.632993,1.0,5.00,5.0,5.0,5.0
Movie203,1.0,3.000000,,3.0,3.00,3.0,3.0,3.0
Movie204,8.0,4.375000,1.407886,1.0,4.75,5.0,5.0,5.0
Movie205,35.0,4.628571,0.910259,1.0,5.00,5.0,5.0,5.0


In [None]:
# Which movies have maximum views?
data.describe().T['count'].sort_values(ascending=False).head(1).to_frame()

Unnamed: 0,count
Movie127,2313.0


In [None]:
# Which movies have maximum rating?
data.sum().drop('user_id').sort_values(ascending=False).head(1).to_frame()

Unnamed: 0,0
Movie127,9511.0


In [None]:
# What is the average rating for each movie?
data.drop('user_id',axis=1).mean()

Movie1      5.000000
Movie2      5.000000
Movie3      2.000000
Movie4      5.000000
Movie5      4.103448
              ...   
Movie202    4.333333
Movie203    3.000000
Movie204    4.375000
Movie205    4.628571
Movie206    4.923077
Length: 206, dtype: float64

In [None]:
# Define the top 5 movies with the maximum ratings.
data.drop('user_id',axis=1).mean().sort_values(ascending=False).head(5).to_frame()

Unnamed: 0,0
Movie1,5.0
Movie66,5.0
Movie76,5.0
Movie75,5.0
Movie74,5.0


In [None]:
#Define the top 5 movies with the least audience.
data.describe().T['count'].sort_values(ascending=True).head(5).to_frame()

Unnamed: 0,count
Movie1,1.0
Movie71,1.0
Movie145,1.0
Movie69,1.0
Movie68,1.0


# Recommendation Model

1.   Divide the data into training and test data
2.   Build a recommendation model on training data
3.   Make predictions on the test data

In [None]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.6 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633725 sha256=ec4a23e0b1ecc396c509a5e7327f8a56673eac147bca64cf01aabfb53e5313b1
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
from surprise import Reader
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise.model_selection import cross_validate

In [None]:
#Prepare data
dataset = data.melt(id_vars=data.columns[0],value_vars=data.columns[1:],var_name="Movie_id",value_name="Rating")

In [None]:
dataset

Unnamed: 0,user_id,Movie_id,Rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,
2,A3LKP6WPMP9UKX,Movie1,
3,AVIY68KEPQ5ZD,Movie1,
4,A1CV1WROP5KTTW,Movie1,
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [None]:
#Create Reader
rd = Reader(rating_scale=(-1,10))

In [None]:
#Load data into surprise dataset
surprise_dataset = Dataset.load_from_df(dataset.fillna(0),reader=rd)
surprise_dataset

<surprise.dataset.DatasetAutoFolds at 0x7fe2f0024c90>

In [None]:
trainset, testset = train_test_split(surprise_dataset,test_size=0.20)

In [None]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa740bbed50>

In [None]:
prediction_score = svd.test(testset)
accuracy.rmse(prediction_score)

RMSE: 0.2823


0.2822943671185837

In [None]:
accuracy.mae(prediction_score)

MAE:  0.0410


0.04100575379332971

In [None]:
#Test with single known data
svd.predict('A2647CKYEBQE7N', 'Movie12', r_ui=5.0, verbose= True)

user: A2647CKYEBQE7N item: Movie12    r_ui = 5.00   est = 0.07   {'was_impossible': False}


Prediction(uid='A2647CKYEBQE7N', iid='Movie12', r_ui=5.0, est=0.06807037244162464, details={'was_impossible': False})

In [None]:
#Perform Cross validation
cross_validate(svd, surprise_dataset, measures = ['RMSE', 'MAE'], cv = 3, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2832  0.2833  0.2810  0.2825  0.0011  
MAE (testset)     0.0435  0.0428  0.0425  0.0429  0.0004  
Fit time          42.23   42.26   42.92   42.47   0.32    
Test time         4.05    3.54    3.84    3.81    0.21    


{'fit_time': (42.23206281661987, 42.26110243797302, 42.91590714454651),
 'test_mae': array([0.04351911, 0.0428004 , 0.04247786]),
 'test_rmse': array([0.28324359, 0.28331396, 0.2810348 ]),
 'test_time': (4.0454792976379395, 3.5432589054107666, 3.841055154800415)}

In [None]:
def validate(dframe,min_,max_):
    svd = SVD()
    rd = Reader(rating_scale=(-1,10))
    data = Dataset.load_from_df(dframe,reader=rd)
    print(cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 3, verbose = True))
    print("**"*10)
    u_id = 'A2647CKYEBQE7N'
    m_id = 'Movie12'
    ra_u = 5.0
    print(svd.predict(u_id,m_id,r_ui=ra_u,verbose=True))
    print("**"*10)
    print()

In [None]:
validate(dataset.fillna(0),-1,10)


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2847  0.2807  0.2804  0.2819  0.0019  
MAE (testset)     0.0434  0.0422  0.0421  0.0426  0.0006  
Fit time          43.08   43.50   44.13   43.57   0.43    
Test time         4.19    3.77    4.27    4.08    0.22    
{'test_rmse': array([0.28468114, 0.28073877, 0.28042952]), 'test_mae': array([0.04339438, 0.04221309, 0.04211762]), 'fit_time': (43.08428645133972, 43.499834299087524, 44.13115382194519), 'test_time': (4.19275689125061, 3.769821882247925, 4.270899057388306)}
********************
user: A2647CKYEBQE7N item: Movie12    r_ui = 5.00   est = 0.08   {'was_impossible': False}
user: A2647CKYEBQE7N item: Movie12    r_ui = 5.00   est = 0.08   {'was_impossible': False}
********************



In [None]:
validate(dataset.fillna(dataset.mean()),-1,10)

In [None]:
validate(dataset.fillna(dataset.median()),-1,10)

  """Entry point for launching an IPython kernel.


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.0914  0.0902  0.0962  0.0926  0.0026  
MAE (testset)     0.0087  0.0087  0.0091  0.0088  0.0002  
Fit time          43.85   43.53   44.80   44.06   0.54    
Test time         4.63    4.67    7.26    5.52    1.23    
{'test_rmse': array([0.09141018, 0.09020508, 0.09617054]), 'test_mae': array([0.00873463, 0.00868977, 0.00905194]), 'fit_time': (43.84948396682739, 43.52930974960327, 44.800671339035034), 'test_time': (4.627399921417236, 4.665792226791382, 7.263259172439575)}
********************
user: A2647CKYEBQE7N item: Movie12    r_ui = 5.00   est = 5.00   {'was_impossible': False}
user: A2647CKYEBQE7N item: Movie12    r_ui = 5.00   est = 5.00   {'was_impossible': False}
********************



In [None]:
#trying grid search and find optimum hyperparameter value for n_factors
from surprise.model_selection import GridSearchCV

In [None]:
param_grid = {'n_epochs':[20,30],
             'lr_all':[0.005,0.001],
             'n_factors':[50,100]}

In [None]:
gs = GridSearchCV(SVD,param_grid,measures=['rmse','mae'],cv=3)
gs.fit(surprise_dataset)

In [None]:
gs.best_score

In [None]:
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])