
## Build user-based recommendation model for Amazon

In [1]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_csv('Amazon - Movies and TV Ratings.csv')

In [3]:
df1.head()

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,


# Exploratory Data Analysis
## Which movies have maximum views/ratings
* Printing First 10 movies having maximum views/ratings

In [4]:
movie_list = df1.notna().sum(axis=0).sort_values(ascending = False).head(10)
movie_list.index.tolist()[1:]

['Movie127',
 'Movie140',
 'Movie16',
 'Movie103',
 'Movie29',
 'Movie91',
 'Movie92',
 'Movie89',
 'Movie158']

## What is the average rating for each movie ? 
* Printing Average rating of first 10 movies

In [5]:
df2 = df1.drop('user_id',axis=1).mean()
df3 = df1.notna().sum(axis=0)
df3.drop(df3.index[0],inplace=True)

In [6]:
df_WithAvgRatingAndVotingCount = pd.concat([df2, df3], axis=1, sort=False)
df_WithAvgRatingAndVotingCount.columns=['AvgRating','RatingCount']
df_WithAvgRatingAndVotingCount.head(10)

Unnamed: 0,AvgRating,RatingCount
Movie1,5.0,1
Movie2,5.0,1
Movie3,2.0,1
Movie4,5.0,2
Movie5,4.103448,29
Movie6,4.0,1
Movie7,5.0,1
Movie8,5.0,1
Movie9,5.0,1
Movie10,5.0,1


In [7]:
C = df_WithAvgRatingAndVotingCount['AvgRating'].mean() #Mean vote average on the whole dataset
m = df_WithAvgRatingAndVotingCount['RatingCount'].quantile(0.85) # Minimum number of votes required to be listed

print(f'Average rating on dataset {C} and Minimum no. of votes Required {m}')

Average rating on dataset 4.448436665448387 and Minimum no. of votes Required 9.0


In [8]:
def wtdrt(x,m=m,C=C):
    V = x['RatingCount']
    R = x['AvgRating']
    
    return (R*(V/(V+m)) + (C*(m/(V+m))))

In [9]:
df_MovieListConsidredForScore = df_WithAvgRatingAndVotingCount.loc[df_WithAvgRatingAndVotingCount['RatingCount']>=m]

In [10]:
df_MovieListConsidredForScore['Score'] = df_MovieListConsidredForScore.apply(wtdrt,axis=1)

## Top5 movies with maximum Score

In [11]:
df_MovieListConsidredForScore['Score'].sort_values(ascending=False).head(5)

Movie140    4.828000
Movie29     4.793793
Movie158    4.773812
Movie92     4.745781
Movie206    4.728906
Name: Score, dtype: float64

## Top5 Movies with least Audience

In [12]:
movie_list_with_least_audience = \
df_WithAvgRatingAndVotingCount[df_WithAvgRatingAndVotingCount['RatingCount'] == 1]['AvgRating'].head().sort_values \
(ascending=True)
movie_list_with_least_audience.index.tolist()

['Movie3', 'Movie6', 'Movie1', 'Movie2', 'Movie7']

# Build Recommendation Model
## Adjust the dataset to use Recommendation algorithm

In [13]:
df_adjusted = df1.melt(id_vars= df1.columns[0], value_vars=df1.columns[1:], var_name='MovieName',value_name='Rating')
df_adjusted.head()

Unnamed: 0,user_id,MovieName,Rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,
2,A3LKP6WPMP9UKX,Movie1,
3,AVIY68KEPQ5ZD,Movie1,
4,A1CV1WROP5KTTW,Movie1,


## Divide the Data into Training and Test data

In [14]:
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split

In [15]:
reader = Reader()
df_adjusted.fillna(df_adjusted['Rating'].mean(), inplace=True)

data = Dataset.load_from_df(df_adjusted, reader=reader)
TrainData, testData = train_test_split(data, test_size=0.40)

### Build a recommendation model on training data

In [16]:
algo = SVD()
algo.fit(TrainData)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ec09439708>

### Make predictions on the test data

In [17]:
accuracy.rmse(algo.test(testData))

RMSE: 0.0879


0.08788572613164347

In [18]:
cross_validate(algo,data,measures=['RMSE','MAE'],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0850  0.0909  0.0840  0.0837  0.0852  0.0858  0.0026  
MAE (testset)     0.0096  0.0096  0.0093  0.0095  0.0095  0.0095  0.0001  
Fit time          62.29   60.30   58.49   58.40   59.09   59.71   1.46    
Test time         2.22    2.49    2.66    2.77    2.70    2.57    0.20    


{'test_rmse': array([0.08503591, 0.0909412 , 0.08403075, 0.08373605, 0.08518658]),
 'test_mae': array([0.00959614, 0.0095698 , 0.00932803, 0.00948032, 0.0095136 ]),
 'fit_time': (62.29083013534546,
  60.30123496055603,
  58.48602223396301,
  58.395713806152344,
  59.08691668510437),
 'test_time': (2.2183234691619873,
  2.4899041652679443,
  2.655928134918213,
  2.771040201187134,
  2.704331874847412)}

In [19]:
algo.predict('A3R5OBKS7OM2IR', 'Movie1', 5.0, verbose=True)

user: A3R5OBKS7OM2IR item: Movie1     r_ui = 5.00   est = 4.40   {'was_impossible': False}


Prediction(uid='A3R5OBKS7OM2IR', iid='Movie1', r_ui=5.0, est=4.397581602898659, details={'was_impossible': False})

In [20]:
algo.predict('AH3QC2PC1VTGP', 'Movie3', 2.0, verbose=True)

user: AH3QC2PC1VTGP item: Movie3     r_ui = 2.00   est = 4.35   {'was_impossible': False}


Prediction(uid='AH3QC2PC1VTGP', iid='Movie3', r_ui=2.0, est=4.353409400664826, details={'was_impossible': False})

In [22]:
algo.predict('A3R5OBKS7OM2IR', 'Movie3', verbose=True)

user: A3R5OBKS7OM2IR item: Movie3     r_ui = None   est = 4.39   {'was_impossible': False}


Prediction(uid='A3R5OBKS7OM2IR', iid='Movie3', r_ui=None, est=4.390100417316281, details={'was_impossible': False})