In [2]:
# Data analysis libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# import mplcursors # Use this is for creating a cursor-interactive plot with "%matplotlib notebook"

from sklearn.decomposition import NMF # Use this for training Non-negative Matrix Factorization
from sklearn.utils.extmath import randomized_svd # Use this for training Singular Value Decomposition
from sklearn.manifold import TSNE # Use this for training t-sne manifolding
from sklearn.model_selection import train_test_split
#### 원핫 인코딩
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error


from itertools import permutations # For making pairs

plt.style.use('ggplot') # You can also use different style

# just for plot checking, use this option
# %matplotlib inline

# for interactive plot
# If you use this option, plot will appear at first-drawn position
%matplotlib notebook

warnings.filterwarnings('ignore')


In [4]:
# Load flat file (Kaggle)
df_reviews_raw = pd.read_csv('beer_reviews.csv')

# Best Practice: Make a copy of the raw data to work on
df_reviews = df_reviews_raw.copy()

# Peep it
df_reviews.head()

# reviews = reviews.drop(["review_time", "brewery_name", "beer_name"], axis=1)
df_reviews = df_reviews.drop(["brewery_name", "beer_name"], axis=1) # timestamp 살려두기
#reviews_features = df_reviews.copy()

# Peep it
df_reviews.head()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_abv,beer_beerid
0,10325,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,5.0,47986
1,10325,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,6.2,48213
2,10325,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,6.5,48215
3,10325,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,5.0,47969
4,1075,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,7.7,64883


In [5]:
df_reviews_raw.head(5)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [6]:
# Convert 'object' to 'category' 
df_reviews[df_reviews.select_dtypes(['object']).columns] = df_reviews.select_dtypes(['object']).\
                                                         apply(lambda x: x.astype('category'))
# Examine structure 
df_reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   brewery_id          1586614 non-null  int64   
 1   review_time         1586614 non-null  int64   
 2   review_overall      1586614 non-null  float64 
 3   review_aroma        1586614 non-null  float64 
 4   review_appearance   1586614 non-null  float64 
 5   review_profilename  1586266 non-null  category
 6   beer_style          1586614 non-null  category
 7   review_palate       1586614 non-null  float64 
 8   review_taste        1586614 non-null  float64 
 9   beer_abv            1518829 non-null  float64 
 10  beer_beerid         1586614 non-null  int64   
dtypes: category(2), float64(6), int64(3)
memory usage: 117.8 MB


In [7]:
# Check for null values
#print(df_reviews.isnull().sum())

# Percent of data missing `beer_abv` values
print("Percent Null Values of `beer_abv` column:", round(67785 / 1586614 * 100, 2),"%")

# Drop null row values
df_reviews = df_reviews.dropna()
#df_reviews.info()

# Check for null values
#print(df_reviews.isnull().sum())

# Percent of data that are null
print("Percent of Null Values:", round((1586614 - 1518478)/ 1586614 * 100, 2),"%")

Percent Null Values of `beer_abv` column: 4.27 %
Percent of Null Values: 4.29 %


In [8]:
# Sort by user overall rating first
df_reviews = df_reviews.sort_values('review_overall', ascending=False)

# Peep it
df_reviews.head()

# Keep the highest rating from each user and drop the rest 
df_reviews = df_reviews.drop_duplicates(subset= ['review_profilename','beer_beerid'], keep='first')

# Peep structure
#df_reviews.info()

# Percent of data that are duplicates
print("Percent of Duplicate Values:", round((1518478 - 1504037)/ 1518478 * 100, 2),"%")

Percent of Duplicate Values: 0.95 %


In [9]:
# Histogram of all numeric features
#reviews.hist(figsize=(12,12))
#plt.show()

# Sort by user overall rating first
df_reviews = df_reviews.sort_values('review_overall', ascending=True)

# Peep it
print(1504052 - 1504045, "개의 데이터에 문제 발생")
df_reviews[(df_reviews['review_overall'] < 1) | (df_reviews['review_appearance'] < 1)]


# Review scores of >= 1 : 문제 데이터 제거
df_reviews = df_reviews[(df_reviews['review_overall'] >= 1)]
#df_reviews = df_reviews[(df_reviews['review_appearance'] >= 1)]
# Peep it
df_reviews.head(8)

7 개의 데이터에 문제 발생


Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_abv,beer_beerid
1074603,677,1223493846,1.0,1.0,3.0,Marcus6679,Chile Beer,1.0,1.0,4.2,38763
1074607,677,1214606400,1.0,2.5,4.0,mikedgt,Chile Beer,3.0,1.5,4.2,38763
1074612,677,1178150667,1.0,1.0,1.0,zeff80,Chile Beer,1.0,1.0,4.2,2213
1074616,677,1206162572,1.0,1.0,2.5,ChainGangGuy,Chile Beer,2.0,1.0,4.2,38763
1074617,677,1204878541,1.0,2.0,2.0,mattster,Chile Beer,1.0,1.5,4.2,38763
1074620,677,1325988783,1.0,1.0,2.5,B967ierhunter177,Chile Beer,1.0,1.0,4.2,38763
534388,401,1123591124,1.0,1.5,2.0,Vancer,Euro Strong Lager,1.5,1.5,8.0,1967
1074630,677,1317421512,1.0,1.0,1.5,scottfrie,Chile Beer,1.0,1.0,4.2,38763


In [10]:
# Create Pandas DF of ratings by user and item
df_ratings = df_reviews[['review_profilename', 'beer_beerid', 'review_overall']]

# Rename columns
df_ratings = df_ratings.rename(columns = {'review_profilename':'userNm', 
                                    'review_overall':'rating',
                                    'beer_beerid' : 'beerId' })
df_ratings['reviewIdx'] = df_ratings.index


df_ratings.head()

# 고유 사용자, 고유 맥주 갯수 확인
n_users = len(df_ratings['userNm'].unique())
n_beers = len(df_ratings['beerId'].unique())

n_users, n_beers # (32908, 49011)

# df_ratings 기술통계량 확인
df_ratings['rating'].describe()
df_ratings.sort_values(by="beerId", ascending=True).head()

Unnamed: 0,userNm,beerId,rating,reviewIdx
393323,portia99,5,3.5,393323
393331,NODAK,5,4.0,393331
393594,Mustard,5,3.0,393594
393593,clvand0,5,3.0,393593
393406,everetends,5,4.0,393406


In [11]:
df_users = df_ratings.groupby('userNm').size().reset_index()
df_users.columns = ["userNm", "review_num"]

# 임의로(스펠링 순으로) 유저번호 붙이기
df_users['userIdx'] = df_users.index

print(df_users.head())

df_users = df_users.sort_values(by="review_num", ascending=False)

df_users['review_num'] = pd.to_numeric(df_users['review_num'])
df_users.head(1000) #479명의 리뷰 데이터가 없음

#df_users_500 = df_users[df_users["review_num"] >= 500]
#df_users_500# 리뷰를 500개 이상 작성한 654명의 유저만 사용


        userNm  review_num  userIdx
0     0110x011         137        0
1     01Ryan10           1        1
2     02maxima           4        2
3   03SVTCobra           3        3
4  04101Brewer           3        4


Unnamed: 0,userNm,review_num,userIdx
26927,northyorksammy,5311,26927
2351,BuckeyeNation,4240,2351
25703,mikesgroove,4226,25703
12378,Thorpe429,3272,12378
32912,womencantsail,3155,32912
...,...,...,...
17357,cnally,363,17357
32708,whartontallboy,363,32708
8564,Morey,363,8564
23068,jondeelee,363,23068


In [13]:
df_ratings_500 = df_ratings.merge(df_users, left_on = 'userNm', right_on = 'userNm', how = 'left')

df_ratings_500.sort_values(by="userIdx", ascending=True)

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx
1459651,0110x011,7971,5.0,1134674,137,0
1465395,0110x011,2749,5.0,1088528,137,0
339278,0110x011,50331,3.5,224287,137,0
632169,0110x011,47922,4.0,132361,137,0
1407299,0110x011,35328,4.5,1520604,137,0
...,...,...,...,...,...,...
787326,zyzygy,28687,4.0,873579,5,33385
1293111,zyzygy,273,4.5,173551,5,33385
975916,zyzygy,1112,4.0,566468,5,33385
1146649,zyzygy,141,4.5,773245,5,33385


In [15]:
df_beers = df_ratings.groupby('beerId').size().reset_index()
df_beers.columns = ["beerId", "br_review_num"]
df_beers['br_review_num'] = pd.to_numeric(df_beers['br_review_num'])

#one_hot_movie = df_beers.copy()
# 임의로(스펠링 순으로) 유저번호 붙이기
#df_beers['userIdx'] = df_users.index

#print(df_beers.head())

df_beers = df_beers.sort_values(by="br_review_num", ascending=False)

df_beers.head() #479명의 리뷰 데이터가 없음

df_ratings_500 = df_ratings_500.merge(df_beers, left_on = 'beerId', right_on = 'beerId', how = 'left')
df_ratings_500.sort_values(by="beerId", ascending=True)
# df_ratings_500 = df_ratings_500[df_ratings_500["br_review_num"] >= 500]
# df_ratings_500# 리뷰를 500개 이상 작성한 654명의 유저만 사용

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx,br_review_num
89435,ppoitras,5,4.5,393578,2104,28028,420
26944,TMoney2591,5,3.5,393689,2068,11876,420
26984,BuckeyeNation,5,3.5,393558,4240,2351,420
7934,Gusler,5,3.0,393627,2082,5382,420
26991,Gavage,5,3.5,393584,2338,4990,420
...,...,...,...,...,...,...,...
82486,tempest,77206,4.0,1233040,2400,31193,1
27236,northyorksammy,77247,3.5,1324642,5311,26927,1
90517,zeff80,77291,4.5,784997,2412,33297,1
22851,Phyl21ca,77309,3.5,544276,3135,9697,1


In [16]:
df_ratings_500 = df_ratings_500[df_ratings_500["review_num"]>2000]
df_ratings_500 = df_ratings_500[df_ratings_500["br_review_num"]>500]

In [19]:
beer_user_rating = df_ratings_500.pivot_table('rating', index='beerId',
                                         columns='userNm').fillna(0)
beer_user_rating

userNm,BEERchitect,Bighuge,Billolick,BuckeyeNation,ChainGangGuy,DrJay,Gavage,Gueuzedude,Gusler,Halcyondays,...,oberon,ppoitras,rhoadsrage,russpowell,smcolw,tempest,weeare138,wl0307,womencantsail,zeff80
beerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,3.0,3.0,3.5,3.0,4.0,0.0,2.5,3.5,3.5,4.0,...,3.5,4.0,3.0,4.0,3.5,3.0,4.0,0.0,0.0,3.0
7,3.5,0.0,4.0,3.5,3.0,0.0,0.0,0.0,3.5,4.0,...,3.0,3.0,0.0,3.5,0.0,2.5,3.5,0.0,0.0,2.0
10,4.5,3.5,4.0,4.5,3.5,4.0,0.0,3.0,4.5,3.5,...,4.0,4.0,3.5,3.5,4.0,4.5,4.0,0.0,2.5,0.0
17,4.0,3.0,4.0,3.5,3.5,0.0,0.0,3.5,4.0,4.0,...,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,4.0
19,4.0,4.5,3.5,4.5,3.5,3.5,0.0,4.0,4.0,0.0,...,4.0,0.0,0.0,3.5,0.0,4.0,4.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59151,4.5,0.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,...,4.0,0.0,4.0,4.0,0.0,0.0,4.0,0.0,3.0,4.0
59369,4.0,0.0,4.5,4.0,3.5,3.5,3.0,4.0,0.0,3.0,...,4.0,0.0,4.5,0.0,4.0,0.0,0.0,0.0,3.5,3.5
60420,4.0,0.0,4.0,4.0,4.5,3.5,4.5,3.5,0.0,3.5,...,4.5,0.0,4.0,4.5,4.0,4.5,3.5,0.0,2.5,4.5
62645,4.5,0.0,4.5,4.5,3.0,4.0,4.0,3.5,0.0,5.0,...,4.0,2.0,3.5,0.0,4.5,4.5,4.0,0.0,4.0,0.0


In [20]:
item_sim = cosine_similarity(beer_user_rating)
print(item_sim)

[[1.         0.85454921 0.87790756 ... 0.78199402 0.80310466 0.76043784]
 [0.85454921 1.         0.76844153 ... 0.67784426 0.66529162 0.60104635]
 [0.87790756 0.76844153 1.         ... 0.7459112  0.782199   0.70316605]
 ...
 [0.78199402 0.67784426 0.7459112  ... 1.         0.83833398 0.71534825]
 [0.80310466 0.66529162 0.782199   ... 0.83833398 1.         0.7330356 ]
 [0.76043784 0.60104635 0.70316605 ... 0.71534825 0.7330356  1.        ]]


In [21]:
item_based_col = pd.DataFrame(data = item_sim, index = beer_user_rating.index, columns = beer_user_rating.index)

In [22]:
item_based_col.head()

beerId,6,7,10,17,19,30,31,33,34,36,...,56761,56973,57252,57908,57912,59151,59369,60420,62645,72138
beerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1.0,0.854549,0.877908,0.821035,0.77101,0.92371,0.886294,0.915085,0.900475,0.773028,...,0.833361,0.848296,0.817341,0.706598,0.767072,0.73008,0.79877,0.781994,0.803105,0.760438
7,0.854549,1.0,0.768442,0.738395,0.781007,0.78215,0.764632,0.771594,0.743312,0.707741,...,0.679122,0.727503,0.661734,0.585129,0.62235,0.677948,0.625937,0.677844,0.665292,0.601046
10,0.877908,0.768442,1.0,0.770553,0.796683,0.918477,0.894455,0.918191,0.901986,0.834339,...,0.837393,0.876436,0.800088,0.677053,0.700077,0.70243,0.779417,0.745911,0.782199,0.703166
17,0.821035,0.738395,0.770553,1.0,0.725627,0.801618,0.756935,0.813279,0.785496,0.677395,...,0.624617,0.783955,0.652406,0.592783,0.626601,0.578724,0.686879,0.622824,0.681224,0.599936
19,0.77101,0.781007,0.796683,0.725627,1.0,0.758844,0.720489,0.734357,0.727062,0.760267,...,0.693517,0.735414,0.643124,0.618981,0.647117,0.696087,0.621594,0.670626,0.652824,0.510465


In [23]:
item_based_col.shape

(584, 584)

In [24]:
item_based_col[6].sort_values(ascending=False)[:5]

beerId
6       1.000000
2264    0.938224
1658    0.933665
779     0.927475
101     0.926742
Name: 6, dtype: float64

In [25]:
item_based_col[33].sort_values(ascending=False)[:5]

beerId
33      1.000000
1009    0.973926
30      0.973637
1056    0.968838
131     0.966742
Name: 33, dtype: float64

In [26]:
item_based_col[56973].sort_values(ascending=False)[:5]

beerId
56973    1.000000
27804    0.971657
11922    0.969235
17060    0.967055
4083     0.966691
Name: 56973, dtype: float64

In [27]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = ratings_arr.dot(item_sim_arr) / np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

ratings_matrix = beer_user_rating.transpose()
ratings_matrix

beerId,6,7,10,17,19,30,31,33,34,36,...,56761,56973,57252,57908,57912,59151,59369,60420,62645,72138
userNm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BEERchitect,3.0,3.5,4.5,4.0,4.0,4.0,4.5,4.5,4.0,4.0,...,4.5,4.0,0.0,4.0,0.0,4.5,4.0,4.0,4.5,4.0
Bighuge,3.0,0.0,3.5,3.0,4.5,4.5,3.0,4.5,4.5,5.0,...,0.0,4.5,0.0,4.5,4.5,0.0,0.0,0.0,0.0,0.0
Billolick,3.5,4.0,4.0,4.0,3.5,3.5,4.5,3.5,4.0,0.0,...,4.5,4.5,3.5,4.0,4.5,4.0,4.5,4.0,4.5,4.5
BuckeyeNation,3.0,3.5,4.5,3.5,4.5,4.0,3.5,4.5,4.0,4.5,...,0.0,4.0,4.0,0.0,0.0,0.0,4.0,4.0,4.5,0.0
ChainGangGuy,4.0,3.0,3.5,3.5,3.5,4.0,3.5,3.5,3.5,4.0,...,4.0,4.0,4.5,3.5,4.0,0.0,3.5,4.5,3.0,0.0
DrJay,0.0,0.0,4.0,0.0,3.5,4.5,4.0,4.0,4.0,4.0,...,4.0,4.5,4.0,0.0,0.0,0.0,3.5,3.5,4.0,4.0
Gavage,2.5,0.0,0.0,0.0,0.0,4.0,3.5,4.5,4.5,0.0,...,4.5,4.5,4.5,4.5,0.0,4.0,3.0,4.5,4.0,4.0
Gueuzedude,3.5,0.0,3.0,3.5,4.0,4.0,4.0,4.0,4.5,4.0,...,4.5,3.5,3.5,0.0,4.5,0.0,4.0,3.5,3.5,0.0
Gusler,3.5,3.5,4.5,4.0,4.0,4.5,4.0,4.5,4.0,4.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Halcyondays,4.0,4.0,3.5,4.0,0.0,4.0,4.0,4.0,3.5,4.5,...,4.5,4.0,3.5,0.0,3.5,3.0,3.0,3.5,5.0,4.0


In [28]:
ratings_pred = predict_rating(ratings_matrix.values, item_based_col.values)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index = ratings_matrix.index,
                                   columns = ratings_matrix.columns)
ratings_pred_matrix

beerId,6,7,10,17,19,30,31,33,34,36,...,56761,56973,57252,57908,57912,59151,59369,60420,62645,72138
userNm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BEERchitect,3.800143,3.801861,3.809339,3.803379,3.808193,3.808401,3.807441,3.804877,3.811116,3.823288,...,3.813368,3.814514,3.786721,3.803491,3.793108,3.812176,3.804437,3.803852,3.808878,3.803527
Bighuge,3.182254,3.15708,3.1944,3.184693,3.215509,3.194421,3.182865,3.192708,3.189589,3.207738,...,3.160733,3.194333,3.148588,3.188683,3.19814,3.156552,3.143863,3.146438,3.143258,3.140027
Billolick,3.28638,3.278176,3.289119,3.285485,3.290042,3.285507,3.280934,3.281039,3.284167,3.266361,...,3.295754,3.290728,3.288675,3.306546,3.301394,3.289074,3.29773,3.286954,3.300905,3.290015
BuckeyeNation,3.886505,3.875024,3.896448,3.880945,3.89501,3.89477,3.893378,3.893813,3.894391,3.897595,...,3.885531,3.899459,3.887725,3.880106,3.883998,3.872498,3.880333,3.882243,3.88965,3.87414
ChainGangGuy,3.218391,3.22045,3.220101,3.217968,3.225105,3.217105,3.213464,3.212134,3.212365,3.223662,...,3.217442,3.216182,3.219897,3.217776,3.224593,3.196035,3.218463,3.2202,3.2134,3.193433
DrJay,3.256728,3.237356,3.289094,3.245073,3.281297,3.28792,3.279551,3.281077,3.289014,3.295554,...,3.289426,3.289686,3.263824,3.252764,3.240417,3.251975,3.273116,3.264948,3.284539,3.277405
Gavage,3.1437,3.103258,3.133768,3.106165,3.11684,3.152516,3.14814,3.15213,3.15664,3.127143,...,3.171968,3.160106,3.166539,3.18153,3.135751,3.164348,3.158227,3.162495,3.169221,3.162104
Gueuzedude,2.96358,2.92744,2.977591,2.956083,2.977179,2.975816,2.973374,2.969675,2.981448,2.998755,...,2.992343,2.984539,2.959034,2.944275,2.987922,2.964875,2.970597,2.964259,2.978302,2.936806
Gusler,2.839873,2.871287,2.838502,2.867621,2.864262,2.844573,2.833588,2.836529,2.83112,2.857293,...,2.78145,2.843155,2.790338,2.752763,2.797758,2.776289,2.794322,2.788056,2.769097,2.784303
Halcyondays,3.106029,3.127913,3.092996,3.124955,3.092917,3.104286,3.103919,3.098401,3.102055,3.132033,...,3.103301,3.110111,3.097052,3.054765,3.114227,3.104281,3.103077,3.105886,3.100748,3.090269


In [29]:
def get_mse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

print(get_mse(ratings_pred, ratings_matrix.values))

1.164880564301752


In [30]:
top_n_items = [np.argsort(item_based_col.values[:,3])[:-5:-1]]
top_n_items

[array([  3, 278, 472, 184])]

In [31]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n):
    # 사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
    pred = np.zeros(ratings_arr.shape)
    
    # 사용자-아이템 평점 행렬의 맥주 개수만큼 루프
    for col in range(ratings_arr.shape[1]):
        # 유사도 행렬에서 유사도가 큰 순으로 n개의 데이터 행렬의 인덱스 반환
        top_n_items = [np.argsort(item_sim_arr[:,col])[:-n-1:-1]]
        # 개인화된 예측 평점 계산 : 각 col 맥주별(1개), 3083 사용자들의 예측평점
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n_items].dot(
                ratings_arr[row, :][top_n_items].T)
            pred[row, col] /= np.sum(item_sim_arr[col, :][top_n_items])
    return pred

ratings_pred = predict_rating_topsim(ratings_matrix.values, item_based_col.values, n = 10)
print(get_mse(ratings_pred, ratings_matrix.values))


0.2748027143138868


In [32]:
ratings_pred_matrix = pd.DataFrame(data = ratings_pred, index=ratings_matrix.index,
                                  columns = ratings_matrix.columns)
ratings_pred_matrix

beerId,6,7,10,17,19,30,31,33,34,36,...,56761,56973,57252,57908,57912,59151,59369,60420,62645,72138
userNm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BEERchitect,4.041727,3.646622,4.252003,3.946778,3.951013,4.399399,4.300221,4.200763,4.449369,4.395231,...,4.154235,4.299401,3.596766,4.193512,2.827031,4.059571,4.099904,3.947159,4.205615,3.850239
Bighuge,3.841707,3.23114,4.141928,3.877648,4.160296,4.500429,4.192761,4.650007,4.450366,4.556429,...,2.924943,4.002325,2.896297,4.447021,4.403113,2.447618,2.803363,2.279056,2.067219,1.945099
Billolick,3.94528,3.99755,4.448261,3.998126,4.136071,4.0987,4.052483,3.997564,4.150337,3.302349,...,3.809942,4.350625,3.503578,4.198353,4.060198,3.662095,4.253771,3.999195,4.401777,3.863104
BuckeyeNation,4.241295,3.745148,4.303358,3.888953,4.35223,4.598881,4.24588,4.399596,4.449673,4.493198,...,3.466632,4.349179,4.044812,2.711215,3.167337,2.684478,3.950254,3.947622,4.400454,3.076525
ChainGangGuy,3.948973,3.489695,3.199973,3.643525,3.745673,3.95071,3.649623,3.997614,4.198542,3.951802,...,3.652238,3.85058,3.857743,3.554912,3.660125,3.037082,3.49807,3.814075,3.687491,2.39631
DrJay,3.276405,1.90065,4.148727,2.037998,3.553508,4.300463,4.248299,4.299772,4.199864,4.246252,...,4.099293,4.350441,3.593159,1.851828,2.243627,2.693925,3.842657,3.794964,4.09927,3.753478
Gavage,3.689968,2.002727,3.072989,2.633431,2.840069,4.449334,4.097614,4.051874,4.20171,3.200768,...,4.2026,4.449716,4.343589,4.056433,3.023264,3.806052,3.639779,4.310673,3.852106,3.900759
Gueuzedude,3.847928,2.746877,3.894826,3.535225,3.707727,4.101128,3.999425,4.248118,4.201055,3.999698,...,3.755722,3.948287,3.644807,1.065029,3.769804,2.685142,3.851715,3.310302,4.089009,1.895635
Gusler,4.144913,3.842929,4.450226,3.852047,4.0,4.500761,4.29815,4.450224,4.449081,4.15004,...,1.235223,4.299479,1.909766,0.393314,2.195174,1.131609,2.608035,1.8409,0.783309,1.462553
Halcyondays,3.902241,4.145887,3.551802,4.242359,3.032839,4.100143,4.199851,3.799269,3.850855,4.254037,...,4.155187,3.751514,3.947182,2.367735,3.986622,3.931666,3.640928,3.743209,3.327551,2.971436


In [33]:
username = 'Zorro'

user_rating_id = ratings_matrix.loc[username, :]
user_rating_id[user_rating_id > 0].sort_values(ascending=False)[:10]

beerId
19960    5.0
43491    5.0
1013     5.0
402      5.0
12770    5.0
354      5.0
11582    5.0
10672    5.0
35328    5.0
9478     5.0
Name: Zorro, dtype: float64

In [34]:
# 사용자가 안 먹어본 맥주를 추천
def get_not_tried_beer(ratings_matrix, userId):
    # userId로 입력받은 모든 맥주 정보를 추출해 Series로 반환
    # 반환된 user_rating은 맥주 이름을 인덱스로 가지는 series객체
    user_rating = ratings_matrix.loc[userId, :]
    
    # user_rating이 0보다 크면 먹어본 맥주
    # 대상 인덱스를 추출해 list 객체로 만든다
    tried = user_rating[user_rating>0].index.tolist()
    
    # 모든 맥주명을 list 객체로 만든다
    beer_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 tried에 해당하는 영화는 beer_list에서 제외
    not_tried = [beer for beer in beer_list if beer not in tried]
    
    return not_tried

# 예측 평점 Dataframe에서 사용자 id 인덱스와 not_tried로 들어온 맥주명 추출 후
# 예측 평점이 높은 순으로 정렬
def  recomm_beer_by_userid(pred_df, userId, not_tried, top_n):
    recomm_beer = pred_df.loc[userId, not_tried].sort_values(ascending=False)[:top_n]
    return recomm_beer

# 유저가 먹지 않은 맥주이름 추출
not_tried = get_not_tried_beer(ratings_matrix, username)
not_tried

[74,
 146,
 147,
 148,
 159,
 184,
 186,
 262,
 310,
 318,
 321,
 570,
 673,
 752,
 794,
 808,
 836,
 906,
 927,
 1015,
 1206,
 1252,
 1287,
 1351,
 1381,
 1576,
 1632,
 1856,
 1882,
 1912,
 1917,
 1924,
 2231,
 2557,
 2758,
 2894,
 2904,
 3338,
 3842,
 4108,
 4109,
 4318,
 5057,
 5385,
 5441,
 6076,
 6104,
 6368,
 7348,
 7463,
 7597,
 8322,
 8682,
 8998,
 9873,
 11819,
 18305,
 18721,
 19314,
 20168,
 20604,
 22227,
 22381,
 24905,
 25880,
 26233,
 27800,
 28176,
 28578,
 29602,
 30845,
 33127,
 34094,
 34146,
 38334,
 39639,
 40057,
 40058,
 40149,
 43687,
 44755,
 45073,
 45653,
 46070,
 46363,
 46385,
 46849,
 46987,
 47228,
 47658,
 47692,
 48139,
 48505,
 49286,
 52128,
 52361,
 53886,
 57908,
 72138]

In [35]:
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_based_col.values, n=5)

# 계산된 예측 평점 데이터는 dataframe으로 재생성
ratings_pred_matrix = pd.DataFrame(data = ratings_pred, index=ratings_matrix.index,
                                  columns = ratings_matrix.columns)

# 아이템 기반의 최근접 이웃 cf로 맥주 추천
recomm_beer = recomm_beer_by_userid(ratings_pred_matrix, username,
                                   not_tried, top_n=3)
recomm_beer_matrix = pd.DataFrame(data=recomm_beer.values,
                                 index = recomm_beer.index,
                                 columns=['예측평점'])
recomm_beer_matrix

Unnamed: 0_level_0,예측평점
beerId,Unnamed: 1_level_1
40058,3.478454
4318,3.455407
6368,3.416575
