# Based on Programming Assignment 2: Beer recommendation

- 과제 목표: 뉴럴 네트워크 모델을 설계한 후 모델을 학습하여 각 맥주들의 embedding 들을 생성하고, 맥주 embedding 을 활용하여 각 사용자에게 맞춤형 맥주를 추천

In [1]:
# Data analysis libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import mplcursors # Use this is for creating a cursor-interactive plot with "%matplotlib notebook"

from sklearn.decomposition import NMF # Use this for training Non-negative Matrix Factorization
from sklearn.utils.extmath import randomized_svd # Use this for training Singular Value Decomposition
from sklearn.manifold import TSNE # Use this for training t-sne manifolding
from sklearn.model_selection import train_test_split
#### 원핫 인코딩
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


from itertools import permutations # For making pairs

# Global plotting / warning configuration for this notebook.
plt.style.use('ggplot') # Any other matplotlib style name can be used here instead

# Static backend: enable this line instead if plots only need to be checked visually
# %matplotlib inline

# Interactive backend.
# With this option a plot appears at the position where it was first drawn.
%matplotlib notebook

# Silence every Python warning from here on, including those raised by the libraries imported above.
warnings.filterwarnings('ignore')


# Data loading

In [2]:
# Read the Kaggle beer-review CSV; the untouched frame stays available as df_reviews_raw
df_reviews_raw = pd.read_csv('beer_reviews.csv')

# Work on a separate copy without the brewery/beer name columns.
# review_time is deliberately kept so the timestamp remains available.
df_reviews = df_reviews_raw.copy().drop(columns=["brewery_name", "beer_name"])

# Preview the first rows of the working frame
df_reviews.head()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_abv,beer_beerid
0,10325,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,5.0,47986
1,10325,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,6.2,48213
2,10325,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,6.5,48215
3,10325,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,5.0,47969
4,1075,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,7.7,64883


# Preprocessing data

<br>

> 타입을 category로 바꿔주기

<br>

In [6]:
# Cast every object-dtype column of df_reviews to the pandas 'category' dtype, one column at a time
object_columns = df_reviews.select_dtypes(['object']).columns
for column_name in object_columns:
    df_reviews[column_name] = df_reviews[column_name].astype('category')

# Print dtypes and non-null counts to verify the conversion
df_reviews.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1518493 entries, 0 to 1586613
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   brewery_id          1518493 non-null  int64   
 1   review_time         1518493 non-null  int64   
 2   review_overall      1518493 non-null  float64 
 3   review_aroma        1518493 non-null  float64 
 4   review_appearance   1518493 non-null  float64 
 5   review_profilename  1518493 non-null  category
 6   beer_style          1518493 non-null  category
 7   review_palate       1518493 non-null  float64 
 8   review_taste        1518493 non-null  float64 
 9   beer_abv            1518493 non-null  float64 
 10  beer_beerid         1518493 non-null  int64   
dtypes: category(2), float64(6), int64(3)
memory usage: 124.4 MB



<br>

> 결측치 제거

<br>


In [8]:
# Remove rows with missing values and report how much data was affected.
# The percentages are computed from the frame itself rather than from
# hard-coded row counts, so they stay correct if the input data changes.
n_rows_total = len(df_reviews)
n_abv_missing = int(df_reviews['beer_abv'].isnull().sum())

# max(..., 1) only guards against division by zero on an empty frame
print("Percent Null Values of `beer_abv` column:", round(n_abv_missing / max(n_rows_total, 1) * 100, 2),"%")

# Drop every row that has a null in any column
df_reviews = df_reviews.dropna()

# Share of the rows that were removed because of nulls
n_rows_dropped = n_rows_total - len(df_reviews)
print("Percent of Null Values:", round(n_rows_dropped / max(n_rows_total, 1) * 100, 2),"%")

Percent Null Values of `beer_abv` column: 4.27 %
Percent of Null Values: 4.29 %



<br>

> 중복 데이터 제거

<br>


In [10]:
# Collapse repeated (user, beer) reviews to a single row and report the share removed.
# Row counts are measured before/after instead of being hard-coded.
n_rows_before = len(df_reviews)

# Highest overall rating first, so that keep='first' retains each user's best rating per beer
df_reviews = df_reviews.sort_values('review_overall', ascending=False)
df_reviews = df_reviews.drop_duplicates(subset=['review_profilename', 'beer_beerid'], keep='first')

# Share of rows that were duplicates (max(..., 1) only guards against an empty frame)
n_duplicates = n_rows_before - len(df_reviews)
print("Percent of Duplicate Values:", round(n_duplicates / max(n_rows_before, 1) * 100, 2),"%")

Percent of Duplicate Values: 0.95 %



<br>

> 0점(1점 미만)으로 기록된 문제 데이터 제거하기

<br>


In [16]:
# Remove reviews whose overall score is below 1 and report how many were affected.
# The count is derived from the filter itself instead of a hard-coded subtraction.

# Lowest overall ratings first, so the preview at the end shows the smallest remaining scores
df_reviews = df_reviews.sort_values('review_overall', ascending=True)

# Rows failing this mask are the problematic records (computed after sorting so the index lines up)
has_valid_overall = df_reviews['review_overall'] >= 1
print(int((~has_valid_overall).sum()), "개의 데이터에 문제 발생")

# Keep only rows with an overall score >= 1.
# The equivalent filter on review_appearance is intentionally left disabled, as in the original cell:
#df_reviews = df_reviews[(df_reviews['review_appearance'] >= 1)]
df_reviews = df_reviews[has_valid_overall]

# Preview the lowest-rated remaining rows
df_reviews.head(8)

7 개의 데이터에 문제 발생


Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_abv,beer_beerid
147509,3818,1266551782,1.0,1.5,4.0,brownbeer,Saison / Farmhouse Ale,3.0,1.0,10.1,43098
608977,4120,1144867689,1.0,1.5,2.0,stcules,Märzen / Oktoberfest,1.5,1.5,4.9,22271
349290,306,1322438321,1.0,1.5,1.0,UnderPressure,Light Lager,1.0,1.5,4.2,837
349285,306,1323299431,1.0,1.0,2.0,blacklabel67k,Light Lager,1.0,1.0,4.2,837
608962,493,1269443934,1.0,1.0,2.0,sprucetip,American Adjunct Lager,2.0,1.0,6.4,7609
608407,14967,1256600388,1.0,1.0,3.0,bigreduw64,American Pale Ale (APA),1.5,1.0,5.0,35850
1305481,29,1294126436,1.0,1.0,2.5,dangerpint,Light Lager,1.0,1.0,4.2,41821
1373112,12985,1296854026,1.0,1.5,2.0,breadnerd,American Black Ale,3.5,1.0,6.9,44085



<br>

> 유저와 맥주와 총점만 있는 자료로 만들기

<br>


In [30]:
# Reduce the reviews to (user, beer, overall score) and give the columns short names.
# The dict order fixes the resulting column order: userNm, beerId, rating.
column_aliases = {
    'review_profilename': 'userNm',
    'beer_beerid': 'beerId',
    'review_overall': 'rating',
}
df_ratings = df_reviews[list(column_aliases)].rename(columns=column_aliases)

# Remember the row label of the originating review
df_ratings['reviewIdx'] = df_ratings.index

# Number of distinct users and distinct beers (original note: (32908, 49011))
n_users, n_beers = (len(df_ratings[key].unique()) for key in ('userNm', 'beerId'))

# Preview the ratings ordered by beer id
df_ratings.sort_values("beerId").head()

Unnamed: 0,userNm,beerId,rating,reviewIdx
393499,TigerFan,5,4.5,393499
393520,scruffwhor,5,5.0,393520
393523,shbobdb,5,5.0,393523
393546,arizcards,5,4.0,393546
393634,NeroFiddled,5,4.0,393634



<br>

> 유저별 리뷰 수 집계 및 유저 번호(userIdx) 부여

<br>


In [46]:
# Count reviews per user; one row per userNm group with its size in review_num.
# NOTE(review): userNm appears to be a categorical column, so groupby may also emit
# zero-count rows for categories with no remaining reviews (presumably the
# "479 users without review data" from the original note) - consider observed=True.
df_users = df_ratings.groupby('userNm').size().reset_index(name="review_num")

# Arbitrary user number: simply the row position in name order
df_users['userIdx'] = df_users.index

print(df_users.head())

# Most active reviewers first
df_users = df_users.sort_values(by="review_num", ascending=False)
df_users['review_num'] = pd.to_numeric(df_users['review_num'])

# Earlier idea (disabled): keep only users with >= 500 reviews (654 users per the original note)
#df_users_500 = df_users[df_users["review_num"] >= 500]

# Show the 1000 most active users
df_users.head(1000)


        userNm  review_num  userIdx
0     0110x011         137        0
1     01Ryan10           1        1
2     02maxima           4        2
3   03SVTCobra           3        3
4  04101Brewer           3        4


Unnamed: 0,userNm,review_num,userIdx
26927,northyorksammy,5311,26927
2351,BuckeyeNation,4240,2351
25703,mikesgroove,4226,25703
12378,Thorpe429,3272,12378
32912,womencantsail,3155,32912
...,...,...,...
17357,cnally,363,17357
32708,whartontallboy,363,32708
8564,Morey,363,8564
23068,jondeelee,363,23068


In [47]:
# Attach each user's review_num and userIdx to every rating row (left join keeps all ratings)
df_ratings_500 = pd.merge(df_ratings, df_users, on='userNm', how='left')

# Display the merged ratings ordered by user number
df_ratings_500.sort_values("userIdx")

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx
1277207,0110x011,4307,4.5,720732,137,0
471406,0110x011,31484,3.5,183315,137,0
626667,0110x011,195,4.0,1445057,137,0
1465877,0110x011,3916,5.0,181791,137,0
936547,0110x011,42836,4.0,1437002,137,0
...,...,...,...,...,...,...
785654,zyzygy,27514,4.0,293561,5,33385
1269987,zyzygy,141,4.5,773245,5,33385
982438,zyzygy,28687,4.0,873579,5,33385
755634,zyzygy,1112,4.0,566468,5,33385


In [48]:
# Count reviews per beer; one row per beerId with its size in br_review_num
df_beers = df_ratings.groupby('beerId').size().reset_index(name="br_review_num")
df_beers['br_review_num'] = pd.to_numeric(df_beers['br_review_num'])

# Most reviewed beers first
df_beers = df_beers.sort_values(by="br_review_num", ascending=False)

# Attach each beer's review count to every rating row (left join keeps all ratings)
df_ratings_500 = pd.merge(df_ratings_500, df_beers, on='beerId', how='left')

# Display the merged ratings ordered by beer id
df_ratings_500.sort_values("beerId")

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx,br_review_num
1336922,TigerFan,5,4.5,393499,8,12412,420
1490643,scruffwhor,5,5.0,393520,1636,29489,420
1490642,shbobdb,5,5.0,393523,411,29732,420
921837,arizcards,5,4.0,393546,865,14226,420
921213,NeroFiddled,5,4.0,393634,3068,8949,420
...,...,...,...,...,...,...,...
772229,Radome,77312,4.0,529418,193,10055,1
155856,Radome,77313,3.0,684958,193,10055,1
292624,Radome,77314,3.5,529417,193,10055,1
299037,thepeter,77315,3.5,992958,231,31443,1


In [59]:
# Restrict the data to very active users and frequently reviewed beers.
# The thresholds are named once so the ratings, user and beer tables are
# always filtered with the same values.
MIN_USER_REVIEWS = 2000   # a user must have MORE than this many reviews
MIN_BEER_REVIEWS = 500    # a beer must have MORE than this many reviews

# Keep only ratings written by active users about popular beers
is_active_user = df_ratings_500["review_num"] > MIN_USER_REVIEWS
is_popular_beer = df_ratings_500["br_review_num"] > MIN_BEER_REVIEWS
df_ratings_500 = df_ratings_500[is_active_user & is_popular_beer]

# Apply the same thresholds to the lookup tables
df_beers = df_beers[df_beers["br_review_num"] > MIN_BEER_REVIEWS]  # 584 beers per the original notes
df_users = df_users[df_users["review_num"] > MIN_USER_REVIEWS]     # 44 users per the original notes

# Display the filtered ratings (19695 reviews in total per the original note)
df_ratings_500

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx,br_review_num
125,Gavage,837,1.0,349339,2338,4990,1129
669,smcolw,1790,1.0,607137,2038,30131,605
688,Gueuzedude,909,1.0,350861,2734,5348,823
696,feloniousmonk,1790,1.0,607131,2784,19869,605
721,Viggo,837,1.0,349970,2352,12904,1129
...,...,...,...,...,...,...,...
1503688,Gavage,21300,5.0,300682,2338,4990,1493
1503876,mikesgroove,3338,5.0,1226364,4226,25703,606
1503904,oberon,3338,5.0,1226371,2879,27053,606
1503916,Gavage,3338,5.0,1226453,2338,4990,606



<br>

> one-hot 인코딩 

<br>


In [68]:
# One-hot encode the beer ids that survived the filtering.
# Default OneHotEncoder settings are used; the dense variant (sparse=False) is left disabled.
onehot_encoder = OneHotEncoder()

# Column vector of beer ids. reshape(-1, 1) infers the row count from df_beers,
# so this keeps working if the filtering thresholds (and thus the number of beers) change.
df_one_hot_beerId = df_beers["beerId"].to_numpy(copy=True).reshape(-1, 1)

onehot_encoded = onehot_encoder.fit_transform(df_one_hot_beerId)

# Display the encoded matrix
onehot_encoded

<584x584 sparse matrix of type '<class 'numpy.float64'>'
	with 584 stored elements in Compressed Sparse Row format>

In [None]:
# One-hot encode the frequently rated items and attach the vectors to the ratings.
# NOTE(review): one_hot_movie, df_ratings_1012 and the 'movieId' column are not defined
# in the visible part of this notebook - this cell looks carried over from a
# movie-recommendation notebook; confirm, or port it to df_beers / df_ratings_500.

# Keep only items with more than 25 ratings. The filter is evaluated once, and the
# explicit copy makes the column assignment below act on an independent frame.
one_hot_movie_1 = one_hot_movie[one_hot_movie['count'] > 25].copy()

# First column as a column vector; -1 infers the row count instead of hard-coding it
one_hot = one_hot_movie_1.values[:, 0].reshape(-1, 1)

# Default encoder settings; the dense variant (sparse=False) is left disabled
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(one_hot)

# Store one integer one-hot row per item, in the same row order as one_hot
one_hot_movie_1["oneHot"] = (tuple(onehot_encoded.toarray().astype(int)))

# The count column was only needed for filtering
one_hot_movie_1 = one_hot_movie_1.drop(columns=["count"])

# Attach the one-hot vector to every rating row (left join keeps all ratings)
df_ratings_one_hot = df_ratings_1012.merge(one_hot_movie_1, on='movieId', how='left')

df_ratings_one_hot