In [32]:
# Data analysis libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# import mplcursors # Use this is for creating a cursor-interactive plot with "%matplotlib notebook"

from sklearn.decomposition import NMF # Use this for training Non-negative Matrix Factorization
from sklearn.utils.extmath import randomized_svd # Use this for training Singular Value Decomposition
from sklearn.manifold import TSNE # Use this for training t-sne manifolding
from sklearn.model_selection import train_test_split
#### 원핫 인코딩
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error


from itertools import permutations # For making pairs

plt.style.use('ggplot') # You can also use different style

# just for plot checking, use this option
# %matplotlib inline

# for interactive plot
# If you use this option, plot will appear at first-drawn position
%matplotlib notebook

warnings.filterwarnings('ignore')


In [3]:
# Load flat file (Kaggle)
df_reviews_raw = pd.read_csv('beer_reviews.csv')

# Best Practice: Make a copy of the raw data to work on
df_reviews = df_reviews_raw.copy()

# Peep it
df_reviews.head()

# reviews = reviews.drop(["review_time", "brewery_name", "beer_name"], axis=1)
df_reviews = df_reviews.drop(["brewery_name", "beer_name"], axis=1) # timestamp 살려두기
#reviews_features = df_reviews.copy()

# Peep it
df_reviews.head()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_abv,beer_beerid
0,10325,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,5.0,47986
1,10325,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,6.2,48213
2,10325,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,6.5,48215
3,10325,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,5.0,47969
4,1075,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,7.7,64883


In [4]:
df_reviews_raw.head(5)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [5]:
# Convert 'object' to 'category' 
df_reviews[df_reviews.select_dtypes(['object']).columns] = df_reviews.select_dtypes(['object']).\
                                                         apply(lambda x: x.astype('category'))
# Examine structure 
df_reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   brewery_id          1586614 non-null  int64   
 1   review_time         1586614 non-null  int64   
 2   review_overall      1586614 non-null  float64 
 3   review_aroma        1586614 non-null  float64 
 4   review_appearance   1586614 non-null  float64 
 5   review_profilename  1586266 non-null  category
 6   beer_style          1586614 non-null  category
 7   review_palate       1586614 non-null  float64 
 8   review_taste        1586614 non-null  float64 
 9   beer_abv            1518829 non-null  float64 
 10  beer_beerid         1586614 non-null  int64   
dtypes: category(2), float64(6), int64(3)
memory usage: 117.8 MB


In [6]:
# Check for null values
#print(df_reviews.isnull().sum())

# Percent of data missing `beer_abv` values
print("Percent Null Values of `beer_abv` column:", round(67785 / 1586614 * 100, 2),"%")

# Drop null row values
df_reviews = df_reviews.dropna()
#df_reviews.info()

# Check for null values
#print(df_reviews.isnull().sum())

# Percent of data that are null
print("Percent of Null Values:", round((1586614 - 1518478)/ 1586614 * 100, 2),"%")

Percent Null Values of `beer_abv` column: 4.27 %
Percent of Null Values: 4.29 %


In [7]:
# Sort by user overall rating first
df_reviews = df_reviews.sort_values('review_overall', ascending=False)

# Peep it
df_reviews.head()

# Keep the highest rating from each user and drop the rest 
df_reviews = df_reviews.drop_duplicates(subset= ['review_profilename','beer_beerid'], keep='first')

# Peep structure
#df_reviews.info()

# Percent of data that are duplicates
print("Percent of Duplicate Values:", round((1518478 - 1504037)/ 1518478 * 100, 2),"%")

Percent of Duplicate Values: 0.95 %


In [8]:
# Histogram of all numeric features
#reviews.hist(figsize=(12,12))
#plt.show()

# Sort by user overall rating first
df_reviews = df_reviews.sort_values('review_overall', ascending=True)

# Peep it
print(1504052 - 1504045, "개의 데이터에 문제 발생")
df_reviews[(df_reviews['review_overall'] < 1) | (df_reviews['review_appearance'] < 1)]


# Review scores of >= 1 : 문제 데이터 제거
df_reviews = df_reviews[(df_reviews['review_overall'] >= 1)]
#df_reviews = df_reviews[(df_reviews['review_appearance'] >= 1)]
# Peep it
df_reviews.head(8)

7 개의 데이터에 문제 발생


Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_abv,beer_beerid
1074603,677,1223493846,1.0,1.0,3.0,Marcus6679,Chile Beer,1.0,1.0,4.2,38763
1074607,677,1214606400,1.0,2.5,4.0,mikedgt,Chile Beer,3.0,1.5,4.2,38763
1074612,677,1178150667,1.0,1.0,1.0,zeff80,Chile Beer,1.0,1.0,4.2,2213
1074616,677,1206162572,1.0,1.0,2.5,ChainGangGuy,Chile Beer,2.0,1.0,4.2,38763
1074617,677,1204878541,1.0,2.0,2.0,mattster,Chile Beer,1.0,1.5,4.2,38763
1074620,677,1325988783,1.0,1.0,2.5,B967ierhunter177,Chile Beer,1.0,1.0,4.2,38763
534388,401,1123591124,1.0,1.5,2.0,Vancer,Euro Strong Lager,1.5,1.5,8.0,1967
1074630,677,1317421512,1.0,1.0,1.5,scottfrie,Chile Beer,1.0,1.0,4.2,38763


In [9]:
# Create Pandas DF of ratings by user and item
df_ratings = df_reviews[['review_profilename', 'beer_beerid', 'review_overall']]

# Rename columns
df_ratings = df_ratings.rename(columns = {'review_profilename':'userNm', 
                                    'review_overall':'rating',
                                    'beer_beerid' : 'beerId' })
df_ratings['reviewIdx'] = df_ratings.index


df_ratings.head()

# 고유 사용자, 고유 맥주 갯수 확인
n_users = len(df_ratings['userNm'].unique())
n_beers = len(df_ratings['beerId'].unique())

n_users, n_beers # (32908, 49011)

# df_ratings 기술통계량 확인
df_ratings['rating'].describe()
df_ratings.sort_values(by="beerId", ascending=True).head()

Unnamed: 0,userNm,beerId,rating,reviewIdx
393323,portia99,5,3.5,393323
393331,NODAK,5,4.0,393331
393594,Mustard,5,3.0,393594
393593,clvand0,5,3.0,393593
393406,everetends,5,4.0,393406


In [10]:
df_users = df_ratings.groupby('userNm').size().reset_index()
df_users.columns = ["userNm", "review_num"]

# 임의로(스펠링 순으로) 유저번호 붙이기
df_users['userIdx'] = df_users.index

print(df_users.head())

df_users = df_users.sort_values(by="review_num", ascending=False)

df_users['review_num'] = pd.to_numeric(df_users['review_num'])
df_users.head(1000) #479명의 리뷰 데이터가 없음

#df_users_500 = df_users[df_users["review_num"] >= 500]
#df_users_500# 리뷰를 500개 이상 작성한 654명의 유저만 사용


        userNm  review_num  userIdx
0     0110x011         137        0
1     01Ryan10           1        1
2     02maxima           4        2
3   03SVTCobra           3        3
4  04101Brewer           3        4


Unnamed: 0,userNm,review_num,userIdx
26927,northyorksammy,5311,26927
2351,BuckeyeNation,4240,2351
25703,mikesgroove,4226,25703
12378,Thorpe429,3272,12378
32912,womencantsail,3155,32912
...,...,...,...
17357,cnally,363,17357
32708,whartontallboy,363,32708
8564,Morey,363,8564
23068,jondeelee,363,23068


In [11]:
df_ratings_500 = df_ratings.merge(df_users, left_on = 'userNm', right_on = 'userNm', how = 'left')

df_ratings_500.sort_values(by="userIdx", ascending=True)

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx
1459651,0110x011,7971,5.0,1134674,137,0
1465395,0110x011,2749,5.0,1088528,137,0
339278,0110x011,50331,3.5,224287,137,0
632169,0110x011,47922,4.0,132361,137,0
1407299,0110x011,35328,4.5,1520604,137,0
...,...,...,...,...,...,...
787326,zyzygy,28687,4.0,873579,5,33385
1293111,zyzygy,273,4.5,173551,5,33385
975916,zyzygy,1112,4.0,566468,5,33385
1146649,zyzygy,141,4.5,773245,5,33385


In [15]:
df_beers = df_ratings.groupby('beerId').size().reset_index()
df_beers.columns = ["beerId", "br_review_num"]
df_beers['br_review_num'] = pd.to_numeric(df_beers['br_review_num'])

#one_hot_movie = df_beers.copy()
# 임의로(스펠링 순으로) 유저번호 붙이기
#df_beers['userIdx'] = df_users.index

#print(df_beers.head())

df_beers = df_beers.sort_values(by="br_review_num", ascending=False)

df_beers.head() #479명의 리뷰 데이터가 없음

df_ratings_500 = df_ratings_500.merge(df_beers, left_on = 'beerId', right_on = 'beerId', how = 'left')
df_ratings_500.sort_values(by="beerId", ascending=True)
df_ratings_500 = df_ratings_500[df_ratings_500["br_review_num"] >= 500]
df_ratings_500# 리뷰를 500개 이상 작성한 654명의 유저만 사용

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx,br_review_num_x,br_review_num_y,br_review_num
12,Maestro,299,1.0,1463891,214,7953,1421,1421,1421
34,DogFood11,1054,1.0,207423,1135,3852,767,767,767
76,Spica66,449,1.0,1431865,235,11400,1281,1281,1281
79,CHickman,47364,1.0,308291,601,2497,658,658,658
81,hardy008,1013,1.0,1549606,652,21028,2392,2392,2392
...,...,...,...,...,...,...,...,...,...
1504031,pumarocks,29015,5.0,1047221,54,28153,893,893,893
1504032,billmiky,29015,5.0,1047217,3,15463,893,893,893
1504034,TheLightweight,29015,5.0,1047208,34,12217,893,893,893
1504036,punisher31673,29015,5.0,1046894,78,28157,893,893,893


In [17]:
beer_user_rating = df_ratings_500.pivot_table('rating', index='beerId',
                                         columns='userNm').fillna(0)
beer_user_rating

userNm,0110x011,02maxima,03SVTCobra,05Harley,0Naught0,0beerguy0,0runkp0s,0tt0,0xFF,1000Bottles,...,zulufactor,zumicroom,zutmin,zwalk8,zwoehr,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy
beerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
item_sim = cosine_similarity(beer_user_rating)
print(item_sim)

[[1.         0.37058184 0.27776857 ... 0.25020049 0.19925467 0.1853052 ]
 [0.37058184 1.         0.22879391 ... 0.2131871  0.15519846 0.13574433]
 [0.27776857 0.22879391 1.         ... 0.19046831 0.19331621 0.16316734]
 ...
 [0.25020049 0.2131871  0.19046831 ... 1.         0.33270456 0.32174753]
 [0.19925467 0.15519846 0.19331621 ... 0.33270456 1.         0.38750596]
 [0.1853052  0.13574433 0.16316734 ... 0.32174753 0.38750596 1.        ]]


In [19]:
item_based_col = pd.DataFrame(data = item_sim, index = beer_user_rating.index, columns = beer_user_rating.index)

In [20]:
item_based_col.head()

beerId,6,7,10,17,19,30,31,33,34,36,...,56761,56973,57252,57908,57912,59151,59369,60420,62645,72138
beerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1.0,0.370582,0.277769,0.280092,0.267416,0.303807,0.278871,0.31519,0.309237,0.218875,...,0.207671,0.313511,0.260945,0.164487,0.166411,0.16559,0.216801,0.2502,0.199255,0.185305
7,0.370582,1.0,0.228794,0.251184,0.223766,0.24829,0.255819,0.262634,0.266324,0.168324,...,0.166025,0.25626,0.210357,0.148055,0.138092,0.148979,0.170648,0.213187,0.155198,0.135744
10,0.277769,0.228794,1.0,0.228956,0.264054,0.349353,0.330542,0.362725,0.338563,0.251207,...,0.215721,0.350571,0.210642,0.154122,0.176275,0.171102,0.215857,0.190468,0.193316,0.163167
17,0.280092,0.251184,0.228956,1.0,0.22967,0.276572,0.26168,0.289431,0.279237,0.195722,...,0.158499,0.279109,0.221064,0.131321,0.15349,0.110701,0.170677,0.206959,0.16239,0.13731
19,0.267416,0.223766,0.264054,0.22967,1.0,0.293518,0.246417,0.278601,0.268166,0.2721,...,0.124242,0.314043,0.111392,0.082298,0.098718,0.086289,0.117043,0.13057,0.116547,0.080466


In [21]:
item_based_col.shape

(586, 586)

In [23]:
item_based_col[6].sort_values(ascending=False)[:5]

beerId
6       1.000000
7       0.370582
102     0.343134
1003    0.339629
131     0.339248
Name: 6, dtype: float64

In [24]:
item_based_col[33].sort_values(ascending=False)[:5]

beerId
33     1.000000
30     0.587986
34     0.568554
694    0.544264
31     0.474450
Name: 33, dtype: float64

In [28]:
item_based_col[56973].sort_values(ascending=False)[:5]

beerId
56973    1.000000
1118     0.553215
1013     0.484803
782      0.482648
90       0.467542
Name: 56973, dtype: float64

In [29]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = ratings_arr.dot(item_sim_arr) / np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

ratings_matrix = beer_user_rating.transpose()
ratings_matrix

beerId,6,7,10,17,19,30,31,33,34,36,...,56761,56973,57252,57908,57912,59151,59369,60420,62645,72138
userNm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02maxima,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
03SVTCobra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
05Harley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0Naught0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zymrgy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zymurgy4all,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zymurgywhiz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zythus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
ratings_pred = predict_rating(ratings_matrix.values, item_based_col.values)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index = ratings_matrix.index,
                                   columns = ratings_matrix.columns)
ratings_pred_matrix

beerId,6,7,10,17,19,30,31,33,34,36,...,56761,56973,57252,57908,57912,59151,59369,60420,62645,72138
userNm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,0.368897,0.367192,0.408616,0.365152,0.372764,0.429006,0.385363,0.420375,0.449804,0.417694,...,0.507909,0.440570,0.405698,0.470670,0.505383,0.524063,0.494356,0.418778,0.511989,0.505498
02maxima,0.017513,0.017590,0.018401,0.016684,0.016888,0.019638,0.017506,0.019629,0.020347,0.017500,...,0.019558,0.019790,0.018832,0.020949,0.021546,0.020844,0.019868,0.019091,0.020416,0.021972
03SVTCobra,0.016277,0.016579,0.014059,0.016309,0.016710,0.013333,0.014013,0.013616,0.013933,0.013537,...,0.011136,0.013851,0.016082,0.011496,0.010370,0.010937,0.011230,0.014908,0.011027,0.010925
05Harley,0.425716,0.416310,0.467185,0.410005,0.436825,0.485694,0.443798,0.476934,0.495848,0.477243,...,0.539460,0.487643,0.459506,0.515790,0.535659,0.553742,0.521198,0.465400,0.534546,0.535607
0Naught0,0.007845,0.007979,0.009244,0.007914,0.008983,0.010142,0.009876,0.009874,0.009785,0.010501,...,0.006979,0.008684,0.006190,0.007479,0.007332,0.006973,0.007379,0.007009,0.007008,0.007339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zymrgy,0.021549,0.020612,0.023224,0.019458,0.021611,0.024174,0.021727,0.023807,0.023887,0.022992,...,0.025262,0.024422,0.021189,0.025096,0.025323,0.027701,0.023921,0.021499,0.025411,0.025432
zymurgy4all,0.304361,0.308751,0.276327,0.312252,0.321928,0.280794,0.287663,0.283614,0.280390,0.273804,...,0.235118,0.283178,0.276493,0.240140,0.234251,0.228535,0.239881,0.272248,0.233300,0.231624
zymurgywhiz,0.016986,0.017201,0.015026,0.017433,0.015031,0.015811,0.014911,0.016126,0.016858,0.013920,...,0.015135,0.015866,0.017759,0.015696,0.016069,0.014403,0.016025,0.018165,0.014667,0.015625
zythus,0.033838,0.033552,0.030873,0.034962,0.035527,0.033322,0.037011,0.034505,0.033375,0.034513,...,0.025366,0.033646,0.029277,0.024638,0.024309,0.023276,0.026137,0.031247,0.025631,0.025035


In [33]:
def get_mse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

print(get_mse(ratings_pred, ratings_matrix.values))

9.370954624144714
