In [7]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import nltk

from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

import requests
import re

In [105]:
beerReviews = pd.read_csv("./data/merged_data.csv")

In [106]:
beerReviews = beerReviews[["beer_beerid", "beer_name", "beer_style", "Description"]]
beerReviews = beerReviews.drop_duplicates()

In [107]:
def text_tokenize(text):
    doc = nltk.tokenize.word_tokenize(text)
    tokens = []
    
    for word in nltk.tag.pos_tag(doc):
        # 명사와 형용사로 구성
        if word[1] in ['NN', 'JJ', 'JJR', 'JJS', ]:
            tokens.append(word[0])
    
    return tokens

In [108]:
def token_cleaning(tokens):
    comp = re.compile('[^A-Za-z]') # 영어로 된 단어만 검출
    clean = []
    for t in tokens:
        t = comp.sub('', t)
        if len(t) > 4:
            clean.append(t.lower()) # 검출된 단어 소문자로 통합하여 저장
    return clean

In [109]:
beerReviews['tokens'] = beerReviews['Description'].apply(text_tokenize)
beerReviews['tokens'] = beerReviews['tokens'].apply(token_cleaning)

beerReviews.head()

Unnamed: 0,beer_beerid,beer_name,beer_style,Description,tokens
0,436,Amstel Light,Light Lager,ABV in the USA is 4.1%\t,[]
6,60990,Mogli,American Porter,'Oak Aged Imperial Porter Brewed with Chocolat...,"[first, release, waxed]"
263,20575,Hefeweizen,Hefeweizen,Formerly In-Heat Wheat\t,[]
264,20575,Hefeweizen,Hefeweizen,An authentic example of a Bavarian Hefeweizen....,"[authentic, example, bavarian, cloudy, yeasty,..."
265,20575,Hefeweizen,Hefeweizen,"Our Hefeweizen is a light, unfiltered wheat be...","[light, unfiltered, wheat, wedge, lemon, thirt..."


In [134]:
# 2개 이상 토큰이 있는 맥주만 사용
beerReviews = beerReviews[beerReviews['tokens'].map(lambda d: len(d)) > 2]

beerReviews.head()

Unnamed: 0,beer_beerid,beer_name,beer_style,Description,tokens
6,60990,Mogli,American Porter,'Oak Aged Imperial Porter Brewed with Chocolat...,"[first, release, waxed]"
264,20575,Hefeweizen,Hefeweizen,An authentic example of a Bavarian Hefeweizen....,"[authentic, example, bavarian, cloudy, yeasty,..."
265,20575,Hefeweizen,Hefeweizen,"Our Hefeweizen is a light, unfiltered wheat be...","[light, unfiltered, wheat, wedge, lemon, thirt..."
266,20575,Hefeweizen,Hefeweizen,"Hundreds of years ago in Germany, wheat beers ...","[wheat, church, testament, light, flavor, trad..."
479,27666,Holiday Ale,Herbed / Spiced Beer,Two Roads Holiday Ale is inspired by the littl...,"[little, known, style, subset, small, farmhous..."


In [186]:
# 특징에 해당하지 않는 단어들 제거
stop_words = ['taste', 'flavor', 'bottle', 'color', 'mouthfeel', 'glass', 'lacing', 'finger',
              'little', 'finish', 'interesting', 'aftertaste', 'overall', 'slight', 'colour',
              'flavour', 'style', 'aroma', 'smell', 'review', 'leave', 'decent', 'character', 
              'perfect', 'example','today', 'first', 'available', 'century', 'yeast', 'alcohol',
              'special', 'release', 'percent', 'version', 'wheat', 'effort', 'amazing', 'offer',
              'several'
             ]

cleared = []
for token in beerReviews['tokens']:
    for t in token:
        if t in stop_words:
            continue
        else:
            cleared.append(t)
res = Counter(cleared)
exp_tokens = sorted(res.items(), key=lambda x : x[1])
exp_tokens

[('waxed', 1),
 ('compliment', 1),
 ('complexion', 1),
 ('singlemalt', 1),
 ('piece', 1),
 ('partner', 1),
 ('finishoriginal', 1),
 ('twostroke', 1),
 ('wellintegrated', 1),
 ('centre', 1),
 ('ageing', 1),
 ('beermaking', 1),
 ('antiquity', 1),
 ('potation', 1),
 ('possess', 1),
 ('important', 1),
 ('flock', 1),
 ('rabbit', 1),
 ('distinctively', 1),
 ('finishwe', 1),
 ('nosewe', 1),
 ('unsmoked', 1),
 ('haziness', 1),
 ('strongbeerseason', 1),
 ('rockcellar', 1),
 ('oakwood', 1),
 ('ingenious', 1),
 ('painter', 1),
 ('builder', 1),
 ('maltstressed', 1),
 ('strongsweet', 1),
 ('rarelyseen', 1),
 ('boatload', 1),
 ('malting', 1),
 ('hence', 1),
 ('multifaceted', 1),
 ('passenger', 1),
 ('hopsbernstein', 1),
 ('blessed', 1),
 ('again', 1),
 ('uncomplicated', 1),
 ('sweetnessmicheal', 1),
 ('subtlety', 1),
 ('biscuitlike', 1),
 ('contribution', 1),
 ('ambiguous', 1),
 ('flandersstyle', 1),
 ('wheatwine', 1),
 ('exercise', 1),
 ('eveningabv', 1),
 ('tinge', 1),
 ('fruited', 1),
 ('border',

In [187]:
# 빈도수가 3개 이상인 토큰
exp_token_list = []
for token in exp_tokens:
    text, cnt = token
    if cnt >= 3:
        exp_token_list.append(text)
exp_token_list

['fragrance',
 'forth',
 'chocolaty',
 'prohibition',
 'wandering',
 'tribes',
 'terrible',
 'adrift',
 'hopsibu',
 'blackaroma',
 'coffeeflavor',
 'roastyavailability',
 'worldfamous',
 'remarkable',
 'generation',
 'expensive',
 'persistent',
 'dominant',
 'direct',
 'research',
 'consumer',
 'knowledge',
 'birth',
 'premier',
 'strict',
 'honour',
 'tangerine',
 'longlasting',
 'appealing',
 'design',
 'northern',
 'oldie',
 'goodie',
 'onslaught',
 'highgravity',
 'basis',
 'prize',
 'sunshine',
 'orchard',
 'crisper',
 'diversity',
 'availability',
 'description',
 'cucumber',
 'father',
 'secret',
 'simple',
 'recognition',
 'temptation',
 'quencher',
 'lasting',
 'purest',
 'sailing',
 'stainless',
 'sharp',
 'trappist',
 'memory',
 'potent',
 'incarnation',
 'force',
 'seafood',
 'lederhosen',
 'village',
 'mango',
 'tavern',
 'strike',
 'iconic',
 'excellence',
 'choicest',
 'lobster',
 'imagination',
 'behemoth',
 'coldpressed',
 'percentage',
 'historical',
 'monument',
 'an

In [188]:
token_df = beerReviews.groupby('beer_name')['tokens'].sum().reset_index()
new_beers_df = beerReviews.groupby('beer_name')['beer_style'].sum().reset_index()

new_beers_df = new_beers_df.merge(token_df, on='beer_name')
new_beers_df

Unnamed: 0,beer_name,beer_style,tokens
0,# 100,American Barleywine,"[batch, wheat, chocolate, yeast, local, water]"
1,#9,Fruit / Vegetable Beer,"[secrecy, mysterious, unusual, palate, tongue,..."
2,10 Commandments,Belgian Strong Dark Ale,"[stronger, version, contemplative, alcohol, co..."
3,12 Dogs Of Christmas Ale,Winter Warmer,"[seasonal, generous, toasted, caramel, mixed, ..."
4,120 Minute IPA,American Double / Imperial IPA,"[extreme, colossal, degree, plato, highalpha, ..."
...,...,...,...
1114,Zwickel,Keller Bier / Zwickel BierKeller Bier / Zwicke...,"[flagship, lager, zvickel, unfiltered, unpaste..."
1115,ZÔN,Witbier,"[summer, seasonal, interpretation, classic, be..."
1116,Éphémère (Apple),Fruit / Vegetable Beer,"[ephemeral, series, seasonal, fruit, refreshin..."
1117,Équinoxe Du Printemps,Scotch Ale / Wee Heavy,"[quinoxe, spring, equinox, quintessential, mar..."


In [189]:
# 맥주별 토큰 빈도수 확인
token_count = []
for i in range(len(new_beers_df)):
    exp_token_dict = {}
    exp_token_dict = {k : 0 for k in exp_token_list}
    tokens = list(new_beers_df['tokens'])[i]
    for token in tokens:
        if token in exp_token_list:
            exp_token_dict[token] += 1
    token_count.append(exp_token_dict)
token_count_df = pd.DataFrame(token_count, index=new_beers_df.beer_name)
token_count_df

Unnamed: 0_level_0,fragrance,forth,chocolaty,prohibition,wandering,tribes,terrible,adrift,hopsibu,blackaroma,...,lager,classic,american,traditional,german,malty,smooth,bitterness,chocolate,caramel
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
# 100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
#9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10 Commandments,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12 Dogs Of Christmas Ale,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
120 Minute IPA,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zwickel,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,3,0,0,0,0,0
ZÔN,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
Éphémère (Apple),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Équinoxe Du Printemps,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [190]:
# 가중치 변환을 위해 스케일러 적용
scaler = MinMaxScaler()
beers_df = pd.DataFrame(scaler.fit_transform(token_count_df),
                        columns=token_count_df.columns,
                        index = token_count_df.index)
beers_df.head()

Unnamed: 0_level_0,fragrance,forth,chocolaty,prohibition,wandering,tribes,terrible,adrift,hopsibu,blackaroma,...,lager,classic,american,traditional,german,malty,smooth,bitterness,chocolate,caramel
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
# 100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002506,0.0
#9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Commandments,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Dogs Of Christmas Ale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004132
120 Minute IPA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004505,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [191]:
last_beers_df = beers_df.reset_index().merge(new_beers_df, on='beer_name')
last_beers_df.drop('tokens', axis=1, inplace=True)
last_beers_df.drop('beer_style', axis=1, inplace=True)
last_beers_df.head()

Unnamed: 0,beer_name,fragrance,forth,chocolaty,prohibition,wandering,tribes,terrible,adrift,hopsibu,...,lager,classic,american,traditional,german,malty,smooth,bitterness,chocolate,caramel
0,# 100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002506,0.0
1,#9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10 Commandments,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12 Dogs Of Christmas Ale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004132
4,120 Minute IPA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004505,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
import pickle

last_beers_df.to_pickle('./review_cosine_similarity.pkl')

with open('./review_cosine_similarity.pkl', 'rb') as f:
    last_beers_df = pickle.load(f)
    
matrix = last_beers_df.set_index('beer_name')
cosine_sim = cosine_similarity(matrix, matrix)

indices = pd.Series(data=last_beers_df.index, index=last_beers_df.beer_name)
indices

beer_name
# 100                          0
#9                             1
10 Commandments                2
12 Dogs Of Christmas Ale       3
120 Minute IPA                 4
                            ... 
Zwickel                     1114
ZÔN                         1115
Éphémère (Apple)            1116
Équinoxe Du Printemps       1117
Über Alt                    1118
Length: 1119, dtype: int64

In [194]:
# 유사도 분석해 3개 추천
def get_recommendations(name, cosine_sim=cosine_sim):
    idx = indices[name]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
    sim_scores = sim_scores[1:4]

    beer_indices = [i[0] for i in sim_scores]

    return indices.iloc[beer_indices].index.tolist()

In [197]:
beers = new_beers_df.set_index('beer_name')
result = get_recommendations('10 Commandments')

print('10 Commandments : ',beers.loc['10 Commandments']['beer_style'], '\n')

print("<Recommendation Result>")
for item in result:
    print(item, " : ", beers.loc[item]['beer_style'])

10 Commandments :  Belgian Strong Dark Ale 

<Recommendation Result>
Headwall Alt  :  Altbier
Gerst Amber  :  American Amber / Red Ale
Blue Label Ale  :  English Pale Mild Ale
