### Simple Recommend System

#### 1. Configuration

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

#### 2. Dataset 분석

In [2]:
path = './ml-latest-small/'
ratings_df = pd.read_csv(os.path.join(path + 'ratings.csv'), encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path + 'tags.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path + 'movies.csv'),
                        index_col='movieId', encoding='utf-8')

In [3]:
num_users = ratings_df['userId'].unique()
num_movies = ratings_df['movieId'].unique()

print('총 유저 수: ', len(num_users))
print('총 영화 수: ', len(num_movies))

총 유저 수:  610
총 영화 수:  9724


In [9]:
# 영화에 대한 user들의 rating
user_movie_matrix = ratings_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
sparse_mat = csr_matrix(user_movie_matrix.values)

print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [12]:
# 각 user별로 rating한 영화의 수
user_info_df = pd.DataFrame(
    data=[sum(list(user_movie_matrix[int(x)].value_counts())[1:]) 
          for x in user_movie_matrix.columns],
    index=user_movie_matrix.columns,
    columns=['movies_rated']
)

user_info_df

Unnamed: 0_level_0,movies_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [13]:
# 각 movie별로 rating받은 횟수
movie_info_df = pd.DataFrame(
    data=[sum(list(user_movie_matrix.loc[int(x)].value_counts())[1:])
          for x in user_movie_matrix.index],
    index=user_movie_matrix.index,
    columns=['users_rated']
)

movie_info_df

Unnamed: 0_level_0,users_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


#### 4. Dataset preprocessing

In [15]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

In [16]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [17]:
# test set에만 존재하는 user, movie의 비율
print('test set에만 존재하는 user 수: ',len(list(set(test_df['userId'].unique()) - set(train_df['userId'].unique()))))
print('test set에만 존재하는 movie 수: ',len(list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))))

test set에만 존재하는 user 수:  0
test set에만 존재하는 movie 수:  786


#### 5-1. 랜덤 평점 예측

In [18]:
import random
ratings_range = np.arange(0.5, 5.5, step=0.5)
pred_random = [random.choice(ratings_range) for x in range(len(test_df))]

pred_random[:10]

[1.0, 2.5, 3.0, 1.5, 1.5, 4.5, 4.0, 1.5, 2.5, 1.5]

In [19]:
test_df['pred_ratings_random'] = pred_random
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random
99731,610,3527,5.0,1479545223,1.0
97583,606,1250,3.5,1171376891,2.5
38197,262,213,5.0,840310907,3.0
11474,68,69406,3.0,1261622505,1.5
34105,232,4728,3.0,1218166950,1.5
...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.0
4897,31,780,4.0,850466616,4.0
8023,56,410,3.0,835799188,1.0
77467,483,2291,4.0,1415579167,1.0


In [20]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_random'].values)
rmse = np.sqrt(mse)

print('RMSE: ', rmse)

RMSE:  1.9237465460664454


#### 5-2. 영화의 평점 평균

In [23]:
train_movie_df = train_df.groupby('movieId').mean()

train_movie_df

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,307.473373,3.893491,1.128439e+09
2,327.475610,3.396341,1.142893e+09
3,266.386364,3.454545,9.900434e+08
4,192.750000,2.250000,8.425133e+08
5,309.526316,3.039474,1.007415e+09
...,...,...,...
193573,184.000000,4.000000,1.537100e+09
193579,184.000000,3.500000,1.537107e+09
193581,184.000000,4.000000,1.537109e+09
193587,184.000000,3.500000,1.537110e+09


In [24]:
def avg_rating_prediction(training_set, x):
    if x in training_set.index:
        pred_rating = training_set.loc[x]['rating']
    else:
        pred_rating = random.choice(ratings_range)
        
    return pred_rating

In [25]:
test_df['pred_ratings_avg'] = test_df['movieId'].apply(lambda x: avg_rating_prediction(train_movie_df, x))
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_ratings_avg
99731,610,3527,5.0,1479545223,1.0,3.604167
97583,606,1250,3.5,1171376891,2.5,4.180556
38197,262,213,5.0,840310907,3.0,3.750000
11474,68,69406,3.0,1261622505,1.5,3.571429
34105,232,4728,3.0,1218166950,1.5,2.769231
...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.0,4.127907
4897,31,780,4.0,850466616,4.0,3.470760
8023,56,410,3.0,835799188,1.0,3.131148
77467,483,2291,4.0,1415579167,1.0,3.734375


In [26]:
mse = mean_squared_error(
    y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_avg'].values)
rmse = np.sqrt(mse)

print('RMSE: ', rmse)


RMSE:  1.0302869460326867


#### 5-3. 사용자의 평균 평점

In [27]:
train_user_df = train_df.groupby('userId').mean()

train_user_df

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1891.168478,4.320652,9.649865e+08
2,70402.760000,3.940000,1.445715e+09
3,8394.733333,2.516667,1.306464e+09
4,1957.923077,3.631868,9.655941e+08
5,337.606061,3.636364,8.474351e+08
...,...,...,...
606,9380.236158,3.649718,1.179733e+09
607,1906.558621,3.772414,9.649104e+08
608,4448.867669,3.145865,1.122822e+09
609,495.275862,3.275862,8.472210e+08


In [28]:
test_df['pred_ratings_user'] = test_df['userId'].apply(lambda x: avg_rating_prediction(train_user_df, x))
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_ratings_avg,pred_ratings_user
99731,610,3527,5.0,1479545223,1.0,3.604167,3.678709
97583,606,1250,3.5,1171376891,2.5,4.180556,3.649718
38197,262,213,5.0,840310907,3.0,3.750000,2.925000
11474,68,69406,3.0,1261622505,1.5,3.571429,3.229331
34105,232,4728,3.0,1218166950,1.5,2.769231,3.242268
...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.0,4.127907,3.666667
4897,31,780,4.0,850466616,4.0,3.470760,3.911765
8023,56,410,3.0,835799188,1.0,3.131148,3.837838
77467,483,2291,4.0,1415579167,1.0,3.734375,3.598940


In [29]:
mse = mean_squared_error(
    y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_user'].values)
rmse = np.sqrt(mse)

print('RMSE: ', rmse)


RMSE:  0.9437101798978504


#### 5-4. 특정 장르에 속하는 영화들의 평균 평점

In [30]:
train_user_movie_mat = train_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

train_user_movie_mat

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,0.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
genres_df = movies_df['genres'].str.get_dummies(sep='|')
genres_df = genres_df.loc[train_df.movieId.unique()]

genres_df

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5943,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2571,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
8958,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2322,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2959,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45648,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6067,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
26861,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6814,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# train set에서 영화별 유저의 평균 평점
train_movie_avg_ratings_df = train_user_movie_mat.copy()
train_movie_avg_ratings_df = train_movie_avg_ratings_df.replace(0, np.NaN)
train_movie_avg_ratings_df = train_movie_avg_ratings_df.mean(axis=1)

train_movie_avg_ratings_df

movieId
1         3.893491
2         3.396341
3         3.454545
4         2.250000
5         3.039474
            ...   
193573    4.000000
193579    3.500000
193581    4.000000
193587    3.500000
193609    4.000000
Length: 8938, dtype: float64

In [33]:
genres_avg_ratings_df = pd.DataFrame(
    index=genres_df.columns,
    columns=['avg_ratings']
)

for genre in genres_avg_ratings_df.index:
    genre_avg_rating = train_movie_avg_ratings_df.loc[genres_df[genres_df[genre] == 1].index].mean()
    genres_avg_ratings_df.loc[genre]['avg_ratings'] = genre_avg_rating
    
genres_avg_ratings_df

Unnamed: 0,avg_ratings
(no genres listed),3.33642
Action,3.11085
Adventure,3.230721
Animation,3.492258
Children,3.101232
Comedy,3.18148
Crime,3.313588
Documentary,3.801026
Drama,3.429093
Fantasy,3.240257


In [34]:
def get_genre_avg_ratings(x):
    genres_list = movies_df.loc[x]['genres'].split('|')
    rating = genres_avg_ratings_df.loc[genres_list].mean()
    
    return rating

In [37]:
tqdm.pandas()
test_df['pred_ratings_genre'] = test_df['movieId'].progress_apply(lambda x: get_genre_avg_ratings(x))

test_df

100%|██████████| 20168/20168 [00:15<00:00, 1303.00it/s]


Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_ratings_avg,pred_ratings_user,pred_ratings_genre
99731,610,3527,5.0,1479545223,1.0,3.604167,3.678709,3.138325
97583,606,1250,3.5,1171376891,2.5,4.180556,3.649718,3.410377
38197,262,213,5.0,840310907,3.0,3.750000,2.925000,3.429093
11474,68,69406,3.0,1261622505,1.5,3.571429,3.229331,3.267870
34105,232,4728,3.0,1218166950,1.5,2.769231,3.242268,3.181480
...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.0,4.127907,3.666667,3.132440
4897,31,780,4.0,850466616,4.0,3.470760,3.911765,3.161424
8023,56,410,3.0,835799188,1.0,3.131148,3.837838,3.174323
77467,483,2291,4.0,1415579167,1.0,3.734375,3.598940,3.341203


In [38]:
mse = mean_squared_error(
    y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_genre'].values)
rmse = np.sqrt(mse)

print('RMSE: ', rmse)


RMSE:  1.0607500191128232


#### 5-5. User의 normalized 평균 평점

In [41]:
train_user_info_df = pd.DataFrame(
    {
    'avg_ratings': train_df.groupby('userId')['rating'].mean(),
    'std_ratings': train_df.groupby('userId')['rating'].std(),
    'count_ratings': train_df.groupby('userId')['rating'].count()
    }
)

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.320652,0.836600,184
2,3.940000,0.820569,25
3,2.516667,2.127340,30
4,3.631868,1.317823,182
5,3.636364,1.084498,33
...,...,...,...
606,3.649718,0.734887,885
607,3.772414,0.955574,145
608,3.145865,1.071503,665
609,3.275862,0.454859,29


In [43]:
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()

train_user_info_df['weights'] = train_user_info_df['count_ratings'].apply(lambda x: (x-avg_count)/(max_count-min_count))

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.320652,0.836600,184,0.023995
2,3.940000,0.820569,25,-0.049718
3,2.516667,2.127340,30,-0.047400
4,3.631868,1.317823,182,0.023068
5,3.636364,1.084498,33,-0.046010
...,...,...,...,...
606,3.649718,0.734887,885,0.348983
607,3.772414,0.955574,145,0.005914
608,3.145865,1.071503,665,0.246990
609,3.275862,0.454859,29,-0.047864


In [45]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)
df_normalized = pd.DataFrame(
    np_scaled,
    columns=train_user_info_df.columns,
    index=train_user_info_df.index
)

df_normalized['normalized_avg_ratings'] = df_normalized['avg_ratings'] * 5
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights,normalized_avg_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.822227,0.393261,0.079740,0.079740,4.111134
2,0.722617,0.385725,0.006027,0.006027,3.613084
3,0.350156,1.000000,0.008345,0.008345,1.750779
4,0.641984,0.619470,0.078813,0.078813,3.209921
5,0.643161,0.509791,0.009736,0.009736,3.215803
...,...,...,...,...,...
606,0.646655,0.345449,0.404729,0.404729,3.233275
607,0.678762,0.449188,0.061660,0.061660,3.393812
608,0.514806,0.503682,0.302735,0.302735,2.574029
609,0.548824,0.213816,0.007881,0.007881,2.744119


In [46]:
test_df['pred_ratings_normalized'] = test_df['userId'].apply(
    lambda x: df_normalized.loc[x]['normalized_avg_ratings'])

test_df


Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_ratings_avg,pred_ratings_user,pred_ratings_genre,pred_ratings_normalized
99731,610,3527,5.0,1479545223,1.0,3.604167,3.678709,3.138325,3.271208
97583,606,1250,3.5,1171376891,2.5,4.180556,3.649718,3.410377,3.233275
38197,262,213,5.0,840310907,3.0,3.750000,2.925000,3.429093,2.285047
11474,68,69406,3.0,1261622505,1.5,3.571429,3.229331,3.267870,2.683236
34105,232,4728,3.0,1218166950,1.5,2.769231,3.242268,3.181480,2.700164
...,...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.0,4.127907,3.666667,3.132440,3.255452
4897,31,780,4.0,850466616,4.0,3.470760,3.911765,3.161424,3.576141
8023,56,410,3.0,835799188,1.0,3.131148,3.837838,3.174323,3.479414
77467,483,2291,4.0,1415579167,1.0,3.734375,3.598940,3.341203,3.166837


In [47]:
mse = mean_squared_error(
    y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_normalized'].values)
rmse = np.sqrt(mse)

print('RMSE: ', rmse)


RMSE:  1.05857408624065
