In [166]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import nltk

from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

import requests
import re

In [141]:
beerReviews = pd.read_csv("./data/merged_data.csv")

In [142]:
beerReviews = beerReviews[["beer_beerid", "beer_name", "beer_style", "beer_abv", "Description"]]
beerReviews = beerReviews.drop_duplicates()

In [143]:
def text_tokenize(text):
    doc = nltk.tokenize.word_tokenize(text)
    tokens = []
    
    for word in nltk.tag.pos_tag(doc):
        if word[1] in ['NN', 'JJ', 'JJR', 'JJS', ]:
            tokens.append(word[0])
    
    return tokens

In [144]:
def token_cleaning(tokens):
    comp = re.compile('[^A-Za-z]') # 영어로 된 단어만 검출
    clean = []
    for t in tokens:
        t = comp.sub('', t)
        if len(t) > 4:
            clean.append(t.lower()) # 검출된 단어 소문자로 통합하여 저장
    return clean

In [145]:
beerReviews['tokens'] = beerReviews['Description'].apply(text_tokenize)
beerReviews['tokens'] = beerReviews['tokens'].apply(token_cleaning)

beerReviews.head()

Unnamed: 0,beer_beerid,beer_name,beer_style,beer_abv,Description,tokens
0,436,Amstel Light,Light Lager,3.5,ABV in the USA is 4.1%\t,[]
6,60990,Mogli,American Porter,8.5,'Oak Aged Imperial Porter Brewed with Chocolat...,"[first, release, waxed]"
263,20575,Hefeweizen,Hefeweizen,5.0,Formerly In-Heat Wheat\t,[]
264,20575,Hefeweizen,Hefeweizen,5.0,An authentic example of a Bavarian Hefeweizen....,"[authentic, example, bavarian, cloudy, yeasty,..."
265,20575,Hefeweizen,Hefeweizen,5.0,"Our Hefeweizen is a light, unfiltered wheat be...","[light, unfiltered, wheat, wedge, lemon, thirt..."


In [146]:
beerReviews = beerReviews[beerReviews['tokens'].map(lambda d: len(d)) > 0]

beerReviews.head()

Unnamed: 0,beer_beerid,beer_name,beer_style,beer_abv,Description,tokens
6,60990,Mogli,American Porter,8.5,'Oak Aged Imperial Porter Brewed with Chocolat...,"[first, release, waxed]"
264,20575,Hefeweizen,Hefeweizen,5.0,An authentic example of a Bavarian Hefeweizen....,"[authentic, example, bavarian, cloudy, yeasty,..."
265,20575,Hefeweizen,Hefeweizen,5.0,"Our Hefeweizen is a light, unfiltered wheat be...","[light, unfiltered, wheat, wedge, lemon, thirt..."
266,20575,Hefeweizen,Hefeweizen,5.0,"Hundreds of years ago in Germany, wheat beers ...","[wheat, church, testament, light, flavor, trad..."
479,27666,Holiday Ale,Herbed / Spiced Beer,7.5,Two Roads Holiday Ale is inspired by the littl...,"[little, known, style, subset, small, farmhous..."


In [147]:
# 특징에 해당하지 않는 단어들
stop_words = ['taste', 'flavor', 'bottle', 'color', 'mouthfeel', 'glass', 'lacing', 'finger',
              'little', 'finish', 'interesting', 'aftertaste', 'overall', 'slight', 'colour',
              'flavour', 'style', 'aroma', 'smell', 'review', 'leave', 'decent', 'character', 'perfect']

cleared = []
most_tokens = []
for token in beerReviews['tokens']:
    for t in token:
        if t in stop_words:
            continue
        else:
            cleared.append(t)
    cnt = Counter(cleared).most_common(20)
    for c in cnt:
        text, _ = c
        most_tokens.append(text)
res = Counter(most_tokens)
exp_tokens = sorted(res.items(), key=lambda x : x[1])
exp_tokens

[('lemon', 2),
 ('squash', 2),
 ('cinnamon', 2),
 ('wedge', 3),
 ('yeasty', 4),
 ('fruity', 4),
 ('early', 4),
 ('sweetness', 4),
 ('example', 5),
 ('today', 5),
 ('festival', 6),
 ('winter', 7),
 ('fullbodied', 7),
 ('first', 8),
 ('release', 8),
 ('waxed', 8),
 ('small', 8),
 ('available', 11),
 ('homage', 11),
 ('tradition', 12),
 ('authentic', 14),
 ('flagship', 14),
 ('banana', 17),
 ('cloudy', 19),
 ('unfiltered', 26),
 ('summer', 28),
 ('special', 42),
 ('brown', 42),
 ('different', 42),
 ('alcohol', 45),
 ('noble', 74),
 ('popular', 110),
 ('craft', 118),
 ('bavarian', 121),
 ('spicy', 131),
 ('seasonal', 241),
 ('pumpkin', 259),
 ('subtle', 1576),
 ('blend', 2134),
 ('amber', 3330),
 ('unique', 3333),
 ('golden', 5858),
 ('coffee', 6481),
 ('complex', 6641),
 ('chocolate', 7399),
 ('sweet', 7456),
 ('barley', 7490),
 ('smooth', 7520),
 ('wheat', 7528),
 ('caramel', 7539),
 ('bitterness', 7540),
 ('lager', 7549),
 ('traditional', 7549),
 ('light', 7550),
 ('malty', 7559),
 ('cl

In [148]:
# 빈도수가 3개 이상인 토큰
exp_token_list = []
for token in exp_tokens:
    text, cnt = token
    if cnt >= 3:
        exp_token_list.append(text)
exp_token_list

['wedge',
 'yeasty',
 'fruity',
 'early',
 'sweetness',
 'example',
 'today',
 'festival',
 'winter',
 'fullbodied',
 'first',
 'release',
 'waxed',
 'small',
 'available',
 'homage',
 'tradition',
 'authentic',
 'flagship',
 'banana',
 'cloudy',
 'unfiltered',
 'summer',
 'special',
 'brown',
 'different',
 'alcohol',
 'noble',
 'popular',
 'craft',
 'bavarian',
 'spicy',
 'seasonal',
 'pumpkin',
 'subtle',
 'blend',
 'amber',
 'unique',
 'golden',
 'coffee',
 'complex',
 'chocolate',
 'sweet',
 'barley',
 'smooth',
 'wheat',
 'caramel',
 'bitterness',
 'lager',
 'traditional',
 'light',
 'malty',
 'classic',
 'american',
 'german',
 'century',
 'yeast']

In [149]:
token_df = beerReviews.groupby('beer_name')['tokens'].sum().reset_index()
new_beers_df = beerReviews.groupby('beer_name')['beer_style'].sum().reset_index()

new_beers_df = new_beers_df.merge(token_df, on='beer_name')
new_beers_df

Unnamed: 0,beer_name,beer_style,tokens
0,# 100,American Barleywine,"[batch, wheat, chocolate, yeast, local, water]"
1,#9,Fruit / Vegetable Beer,"[secrecy, mysterious, unusual, palate, tongue,..."
2,'t Smisje BBBourgondier,Quadrupel (Quad),"[available, honor]"
3,10 Commandments,Belgian Strong Dark Ale,"[stronger, version, contemplative, alcohol, co..."
4,1100 Wheat Wine,Wheatwine,"[first, bourbon]"
...,...,...,...
1187,Zwickel,Keller Bier / Zwickel BierKeller Bier / Zwicke...,"[flagship, lager, zvickel, unfiltered, unpaste..."
1188,ZÔN,Witbier,"[summer, seasonal, interpretation, classic, be..."
1189,Éphémère (Apple),Fruit / Vegetable Beer,"[ephemeral, series, seasonal, fruit, refreshin..."
1190,Équinoxe Du Printemps,Scotch Ale / Wee Heavy,"[quinoxe, spring, equinox, quintessential, mar..."


In [150]:
token_count = []
for i in range(len(new_beers_df)):
    exp_token_dict = {}
    exp_token_dict = {k : 0 for k in exp_token_list}
    tokens = list(new_beers_df['tokens'])[i]
    for token in tokens:
        if token in exp_token_list:
            exp_token_dict[token] += 1
    token_count.append(exp_token_dict)
token_count_df = pd.DataFrame(token_count, index=new_beers_df.beer_name)
token_count_df

Unnamed: 0_level_0,wedge,yeasty,fruity,early,sweetness,example,today,festival,winter,fullbodied,...,bitterness,lager,traditional,light,malty,classic,american,german,century,yeast
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
# 100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
#9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
't Smisje BBBourgondier,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10 Commandments,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1100 Wheat Wine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zwickel,0,0,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,3,0,3
ZÔN,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
Éphémère (Apple),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Équinoxe Du Printemps,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [157]:
(token_count_df.sum(axis=1) == 0).sum()

scaler = MinMaxScaler()
beers_df = pd.DataFrame(scaler.fit_transform(token_count_df),
                        columns=token_count_df.columns,
                        index = token_count_df.index)
beers_df.head()

Unnamed: 0_level_0,wedge,yeasty,fruity,early,sweetness,example,today,festival,winter,fullbodied,...,bitterness,lager,traditional,light,malty,classic,american,german,century,yeast
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
# 100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003378
#9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
't Smisje BBBourgondier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Commandments,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1100 Wheat Wine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
last_beers_df = beers_df.reset_index().merge(new_beers_df, on='beer_name')
last_beers_df.drop('tokens', axis=1, inplace=True)
last_beers_df.head()

Unnamed: 0,beer_name,wedge,yeasty,fruity,early,sweetness,example,today,festival,winter,...,lager,traditional,light,malty,classic,american,german,century,yeast,beer_style
0,# 100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003378,American Barleywine
1,#9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fruit / Vegetable Beer
2,'t Smisje BBBourgondier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Quadrupel (Quad)
3,10 Commandments,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Belgian Strong Dark Ale
4,1100 Wheat Wine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Wheatwine


In [174]:
import pickle

last_beers_df.to_pickle('./Beers_TFIDF_ended.pkl')

with open('./Beers_TFIDF_ended.pkl', 'rb') as f:
    last_beers_df = pickle.load(f)
    
matrix = last_beers_df.set_index('beer_name').drop(['beer_style'], axis=1)
cosine_sim = cosine_similarity(matrix, matrix)

indices = pd.Series(data=last_beers_df.index, index=last_beers_df.beer_name)
indices.head()

beer_name
# 100                      0
#9                         1
't Smisje BBBourgondier    2
10 Commandments            3
1100 Wheat Wine            4
dtype: int64

In [175]:
def get_recommendations(name, cosine_sim=cosine_sim):
    idx = indices[name]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = sim_scores[1:4]

    beer_indices = [i[0] for i in sim_scores]

    return indices.iloc[beer_indices].index.tolist()

In [176]:
get_recommendations('10 Commandments')

['Bracia', 'Labatt Blue Non-Alcoholic', 'Little Dog']