# Recommendation System Notebook
- User based recommendation
- User based prediction
- Item based recommendation
- Item based prediction
- Evaluation

In [1]:
# import libraties
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
ratings = pd.read_csv('ratings.csv', encoding='latin-1')

In [2]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Dividing the dataset into train and test

In [3]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings, test_size=0.30, random_state=31)

In [4]:
print(train.shape)
print(test.shape)

(70585, 4)
(30251, 4)


In [5]:
# pivot ratings into movie features
df_movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [6]:
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190215,190219,190221,191005,193565,193567,193571,193579,193583,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Copy train and test dataset
These dataset will be used for prediction and evaluation. 
- Dummy train will be used later for prediction of the movies which has not been rated by the user. To ignore the movies rated by the user, we will mark it as 0 during prediction. The movies not rated by user is marked as 1 for prediction. 
- Dummy test will be used for evaluation. To evaluate, we will only make prediction on the movies rated by the user. So, this is marked as 1. This is just opposite of dummy_train

In [7]:
dummy_train = train.copy()
dummy_test = test.copy()

In [8]:
dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x>=1 else 1)
dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x>=1 else 0)

In [9]:
# The movies not rated by user is marked as 1 for prediction. 
dummy_train = dummy_train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(1)

# The movies not rated by user is marked as 0 for evaluation. 
dummy_test = dummy_test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [10]:
dummy_train.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190215,190219,190221,191005,193565,193567,193571,193579,193583,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
dummy_test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,188189,188797,188833,189111,190209,193573,193581,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# User Similarity Matrix

## Using Cosine Similarity

In [12]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(df_movie_features, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[1.         0.03844192 0.08536622 ... 0.19408656 0.06817714 0.08887333]
 [0.03844192 1.         0.         ... 0.0422197  0.         0.0883011 ]
 [0.08536622 0.         1.         ... 0.01647758 0.         0.02845466]
 ...
 [0.19408656 0.0422197  0.01647758 ... 1.         0.07774534 0.21316911]
 [0.06817714 0.         0.         ... 0.07774534 1.         0.04861171]
 [0.08887333 0.0883011  0.02845466 ... 0.21316911 0.04861171 1.        ]]


In [13]:
user_correlation.shape

(610, 610)

## Using adjusted Cosine 

### Here, not removing the NaN values and calculating the mean only for the movies rated by the user

In [14]:
movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [15]:
movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190215,190219,190221,191005,193565,193567,193571,193579,193583,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### Normalising the rating of the movie for each user aroung 0 mean

In [16]:
mean = np.nanmean(movie_features, axis=1)
df_subtracted = (movie_features.T-mean).T

In [17]:
df_subtracted.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190215,190219,190221,191005,193565,193567,193571,193579,193583,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.35,,-0.35,,,-0.35,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.4,,,,,,,,,,...,,,,,,,,,,


### Finding cosine similarity

In [18]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[ 1.          0.00694602  0.00160449 ...  0.08373762 -0.03376757
   0.01625135]
 [ 0.00694602  1.          0.         ...  0.00263096  0.
   0.02505163]
 [ 0.00160449  0.          1.         ... -0.00666446  0.
   0.02409337]
 ...
 [ 0.08373762  0.00263096 -0.00666446 ...  1.          0.02601342
   0.03271935]
 [-0.03376757  0.          0.         ...  0.02601342  1.
  -0.04036397]
 [ 0.01625135  0.02505163  0.02409337 ...  0.03271935 -0.04036397
   1.        ]]


## Prediction

Doing the prediction for the users which are positively related with other users, and not the users which are negatively related as we are interested in the users which are more similar to the current users. So, ignoring the correlation for values less than 0. 

In [19]:
user_correlation[user_correlation<0]=0
user_correlation

array([[1.        , 0.00694602, 0.00160449, ..., 0.08373762, 0.        ,
        0.01625135],
       [0.00694602, 1.        , 0.        , ..., 0.00263096, 0.        ,
        0.02505163],
       [0.00160449, 0.        , 1.        , ..., 0.        , 0.        ,
        0.02409337],
       ...,
       [0.08373762, 0.00263096, 0.        , ..., 1.        , 0.02601342,
        0.03271935],
       [0.        , 0.        , 0.        , ..., 0.02601342, 1.        ,
        0.        ],
       [0.01625135, 0.02505163, 0.02409337, ..., 0.03271935, 0.        ,
        1.        ]])

Rating predicted by the user (for movies rated as well as not rated) is the weighted sum of correlation with the movie rating (as present in the rating dataset). 

In [20]:
user_predicted_ratings = np.dot(user_correlation, movie_features.fillna(0))
user_predicted_ratings

array([[1.58446164e+01, 5.54354123e+00, 7.08245905e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.80275213e-02],
       [5.07686451e+00, 2.06757205e+00, 6.00874556e-01, ...,
        1.81453987e-02, 1.81453987e-02, 0.00000000e+00],
       [2.24241909e+00, 1.17788776e+00, 2.90450622e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.90584218e+01, 1.02243416e+01, 6.91134419e+00, ...,
        2.06953826e-02, 2.06953826e-02, 3.14163689e-02],
       [1.90831554e+01, 8.84849001e+00, 4.23018218e+00, ...,
        0.00000000e+00, 0.00000000e+00, 8.24765355e-02],
       [2.68340188e+01, 1.02826443e+01, 4.23972094e+00, ...,
        1.25133258e-01, 1.25133258e-01, 5.45579834e-01]])

In [21]:
user_predicted_ratings.shape

(610, 8536)

Since we are interested only in the movies not rated by the user, we will ignore the movies rated by the user by making it zero. 

In [22]:
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
user_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190215,190219,190221,191005,193565,193567,193571,193579,193583,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,5.543541,0.0,0.094025,1.916126,0.0,2.631891,0.34103,0.247799,6.278434,...,0.023942,0.015961,0.015961,0.0,0.0,0.0,0.0,0.0,0.0,0.028028
2,5.076865,2.067572,0.600875,0.031692,0.584751,1.73199,0.393717,0.090132,0.048347,1.57314,...,0.0,0.0,0.0,0.02333,0.018145,0.015553,0.020738,0.018145,0.018145,0.0
3,2.242419,1.177888,0.290451,0.013938,0.123168,1.608382,0.156013,0.027674,0.019174,1.574436,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.362739,3.79765,2.323971,0.17868,1.499134,3.66261,2.192954,0.074205,0.360735,3.787689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,8.908434,3.050563,0.389251,3.424406,7.754833,4.715943,0.30314,1.036136,7.51468,...,0.186165,0.12411,0.12411,0.0,0.0,0.0,0.0,0.0,0.0,0.086721


### Finding the top 5 recommendation for the user 1 

In [23]:
user_final_rating.iloc[1].sort_values(ascending=False)[0:5]

movieId
356     9.838672
318     8.903198
2571    8.521299
2959    7.644745
7153    7.622569
Name: 2, dtype: float64

# Item Based Similarity

Using Correlation

Taking the transpose of the rating matrix to normalize the rating around the mean for different movie ID. In the user based similarity, we had taken mean for each user intead of each movie. 

In [24]:
movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).T

movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,,,4.0,,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


Normalising the movie rating for each movie

In [25]:
mean = np.nanmean(movie_features, axis=1)
df_subtracted = (movie_features.T-mean).T

In [26]:
df_subtracted.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.06962,,,,0.06962,,0.56962,,,,...,,,0.06962,,0.06962,-1.43038,0.06962,-1.43038,-0.93038,1.06962
2,,,,,,,,0.584337,,,...,,0.584337,,1.584337,0.084337,,,-1.415663,,
3,0.717949,,,,,1.717949,,,,,...,,,,,,,,-1.282051,,
4,,,,,,0.9,,,,,...,,,,,,,,,,
5,,,,,,1.75,,,,,...,,,,-0.25,,,,,,


Finding the cosine similarity. Note that since the data is normalised, both the cosine metric and correlation metric will give the same value. 

In [27]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

[[1.         0.06017539 0.11878665 ... 0.         0.         0.        ]
 [0.06017539 1.         0.13996251 ... 0.         0.         0.        ]
 [0.11878665 0.13996251 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


Filtering the correlation only for which the value is greater than 0. (Positively correlated)

In [28]:
item_correlation[item_correlation<0]=0
item_correlation

array([[1.        , 0.06017539, 0.11878665, ..., 0.        , 0.        ,
        0.        ],
       [0.06017539, 1.        , 0.13996251, ..., 0.        , 0.        ,
        0.        ],
       [0.11878665, 0.13996251, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# Prediction

In [29]:
item_predicted_ratings = np.dot((movie_features.fillna(0).T),item_correlation)
item_predicted_ratings

array([[ 33.7587947 ,  30.70664739,  49.9834095 , ...,   0.        ,
          0.        ,   0.        ],
       [  2.45384103,   5.131285  ,   4.58939   , ...,   0.        ,
          0.        ,   0.        ],
       [  1.09463165,   1.46896937,   0.3293991 , ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [ 66.73643561,  93.68951884,  90.95099905, ...,   0.        ,
          0.        ,   0.        ],
       [  5.21052011,   3.20021999,   5.44174344, ...,   0.        ,
          0.        ,   0.        ],
       [129.51705246, 163.88499426, 101.69761379, ...,   0.        ,
          0.        ,   0.        ]])

In [30]:
item_predicted_ratings.shape

(610, 8536)

In [31]:
dummy_train.shape

(610, 8536)

### Filtering the rating only for the movies not rated by the user for recommendation

In [32]:
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190215,190219,190221,191005,193565,193567,193571,193579,193583,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,30.706647,0.0,10.129411,14.393217,0.0,21.080665,11.433819,22.684844,31.192477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.453841,5.131285,4.58939,0.504185,1.67422,2.422259,2.251769,0.847783,3.549239,3.083713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.094632,1.468969,0.329399,0.181983,0.579272,1.312426,0.568846,0.213224,0.557135,1.062552,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19.980276,16.275975,26.10829,12.711843,10.404068,12.505439,12.565088,7.791429,15.202251,15.968412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,6.080189,6.390913,5.20539,4.84728,4.037585,3.382196,0.716075,2.315031,3.832908,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Top 5 prediction for the user -1

In [33]:
item_final_rating.iloc[1].sort_values(ascending=False)[0:5]

movieId
98809    10.457611
63113    10.339299
97921    10.026339
66171     9.971467
4470      9.971467
Name: 2, dtype: float64

# Evaluation

Evaluation will we same as you have seen above for the prediction. The only difference being, you will evaluate for the movie already rated by the user insead of predicting it for the movie not rated by the user. 

## Using User Similarity

In [34]:
test_movie_features = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)
mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = (test_movie_features.T-mean).T

# User Similarity Matrix
test_user_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_user_correlation[np.isnan(test_user_correlation)] = 0
print(test_user_correlation)

[[ 1.00000000e+00  0.00000000e+00  0.00000000e+00 ...  5.04902580e-02
   6.14789798e-02  1.34521517e-02]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00 ... -6.50167804e-02
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  1.00000000e+00 ... -2.78598662e-02
   0.00000000e+00  0.00000000e+00]
 ...
 [ 5.04902580e-02 -6.50167804e-02 -2.78598662e-02 ...  1.00000000e+00
   6.27848795e-04  1.53325960e-03]
 [ 6.14789798e-02  0.00000000e+00  0.00000000e+00 ...  6.27848795e-04
   1.00000000e+00  2.87638538e-02]
 [ 1.34521517e-02  0.00000000e+00  0.00000000e+00 ...  1.53325960e-03
   2.87638538e-02  1.00000000e+00]]


In [35]:
test_user_correlation[test_user_correlation<0]=0
test_user_predicted_ratings = np.dot(test_user_correlation, test_movie_features.fillna(0))
test_user_predicted_ratings

array([[3.37630738, 1.31962575, 0.72443847, ..., 0.00689604, 0.00603404,
        0.00603404],
       [0.38277851, 0.3778047 , 0.03940076, ..., 0.25156317, 0.22011777,
        0.22011777],
       [0.31720425, 0.07143767, 0.05487212, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [5.06286851, 2.33642017, 0.9101591 , ..., 0.        , 0.        ,
        0.        ],
       [1.74411323, 2.07372097, 1.03164336, ..., 0.        , 0.        ,
        0.        ],
       [4.01505146, 2.02705905, 0.71142302, ..., 0.0105728 , 0.0092512 ,
        0.0092512 ]])

### Doing prediction for the movies rated by the user

In [36]:
test_user_final_rating = np.multiply(test_user_predicted_ratings,dummy_test)

In [37]:
test_user_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,188189,188797,188833,189111,190209,193573,193581,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating the RMSE for only the movies rated by user. For  RMSE, normalising the rating to (1,5) range. 

In [38]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_user_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


MinMaxScaler(copy=True, feature_range=(1, 5))
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [39]:
test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [40]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [41]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

1.3567836139396485


## Using Item similarity

In [42]:
test_movie_features = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).T

mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = (test_movie_features.T-mean).T

test_item_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_item_correlation[np.isnan(test_item_correlation)] = 0
test_item_correlation[test_item_correlation<0]=0

In [43]:
test_item_correlation.shape

(6089, 6089)

In [44]:
test_movie_features.shape

(6089, 609)

In [45]:
test_item_predicted_ratings = (np.dot(test_item_correlation, test_movie_features.fillna(0))).T
test_item_final_rating = np.multiply(test_item_predicted_ratings,dummy_test)
test_item_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,188189,188797,188833,189111,190209,193573,193581,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [47]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_item_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))


test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


MinMaxScaler(copy=True, feature_range=(1, 5))


### Finding RMSE

In [48]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

2.1419572555720694


Thank-you