In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Import Dataset
Dataset use of a different encoding (forware slashes), therefore I use latin1 encoding

In [50]:
beer_recipe = pd.read_csv('./data/recipeData.csv', index_col='BeerID', encoding='latin1')
beer_recipe.head()

Unnamed: 0_level_0,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
BeerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Vanilla Cream Ale,/homebrew/recipe/view/1633/vanilla-cream-ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,4.83,...,1.038,70.0,,Specific Gravity,All Grain,,17.78,corn sugar,4.5 oz,116.0
2,Southern Tier Pumking clone,/homebrew/recipe/view/16367/southern-tier-pumk...,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,15.64,...,1.07,70.0,,Specific Gravity,All Grain,,,,,955.0
3,Zombie Dust Clone - EXTRACT,/homebrew/recipe/view/5920/zombie-dust-clone-e...,American IPA,7,18.93,1.063,1.018,5.91,59.25,8.98,...,,70.0,,Specific Gravity,extract,,,,,
4,Zombie Dust Clone - ALL GRAIN,/homebrew/recipe/view/5916/zombie-dust-clone-a...,American IPA,7,22.71,1.061,1.017,5.8,54.48,8.5,...,,70.0,,Specific Gravity,All Grain,,,,,
5,Bakke Brygg Belgisk Blonde 50 l,/homebrew/recipe/view/89534/bakke-brygg-belgis...,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,4.57,...,1.05,72.0,,Specific Gravity,All Grain,,19.0,Sukkerlake,6-7 g sukker/l,18325.0


### Preprocessing

In [51]:
## Delete URL column mannually
del_columns = ['URL', 'Name']

for column in beer_recipe.columns:
    nullCount = beer_recipe[column].isnull()
    print('{} is null {} % of the time'.format(column, round((nullCount.sum() / len(beer_recipe)) * 100), 2))
    
    if (nullCount.sum() / len(beer_recipe)) > 0.3:
        del_columns.append(column)
        
beer_recipe = beer_recipe.drop(columns = del_columns)

Name is null 0.0 % of the time
URL is null 0.0 % of the time
Style is null 1.0 % of the time
StyleID is null 0.0 % of the time
Size(L) is null 0.0 % of the time
OG is null 0.0 % of the time
FG is null 0.0 % of the time
ABV is null 0.0 % of the time
IBU is null 0.0 % of the time
Color is null 0.0 % of the time
BoilSize is null 0.0 % of the time
BoilTime is null 0.0 % of the time
BoilGravity is null 4.0 % of the time
Efficiency is null 0.0 % of the time
MashThickness is null 40.0 % of the time
SugarScale is null 0.0 % of the time
BrewMethod is null 0.0 % of the time
PitchRate is null 53.0 % of the time
PrimaryTemp is null 31.0 % of the time
PrimingMethod is null 91.0 % of the time
PrimingAmount is null 94.0 % of the time
UserId is null 68.0 % of the time


In [52]:
boilGravity_median = beer_recipe['BoilGravity'].median()
beer_recipe['BoilGravity'].fillna(boilGravity_median, inplace = True)

In [53]:
from math import isnan

style_count = {}

for style in beer_recipe.Style.unique():
    if not isinstance(style, str) and isnan(style):
        continue
    style_count[style] = len(beer_recipe[beer_recipe['Style'] == style])
    
style_count_sorted = sorted(style_count.items(), key=lambda x: (-x[1], x[0]))
style_count_sorted_top30 = [style for (style, count) in style_count_sorted[:100]]

In [54]:
drop_index = []

for index, row in beer_recipe.iterrows():
    if row['Style'] not in style_count_sorted_top30:
        drop_index.append(index)
        
beer_recipe.drop(drop_index, inplace = True)

In [55]:
beer_recipe = pd.get_dummies(beer_recipe, columns = ['SugarScale', 'BrewMethod'])

In [56]:
beer_recipe_X = beer_recipe.drop(columns = ['Style', 'StyleID'])

In [57]:
beer_receipe_norm = (beer_recipe_X - beer_recipe_X.mean()) / (beer_recipe_X.max() - beer_recipe_X.min())

In [58]:
beer_recipe_X.max()

Size(L)                        9200.0000
OG                               32.5008
FG                               10.3414
ABV                              54.7200
IBU                            3409.3000
Color                           186.0000
BoilSize                       9700.0000
BoilTime                        240.0000
BoilGravity                      52.6000
Efficiency                      100.0000
SugarScale_Plato                  1.0000
SugarScale_Specific Gravity       1.0000
BrewMethod_All Grain              1.0000
BrewMethod_BIAB                   1.0000
BrewMethod_Partial Mash           1.0000
BrewMethod_extract                1.0000
dtype: float64

In [59]:
beer_recipe[beer_receipe_norm.columns] = beer_receipe_norm

In [60]:
beer_recipe = beer_recipe.groupby('Style', as_index=False).mean()

In [61]:
beer_recipe

Unnamed: 0,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,SugarScale_Plato,SugarScale_Specific Gravity,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
0,Altbier,1,-0.001453,0.002037,0.000927,-0.018529,-0.001820,0.003032,-0.001463,0.021325,0.000144,-0.001772,0.010877,-0.010877,0.041837,0.003805,-0.040043,-0.005599
1,American Amber Ale,4,-0.000288,-0.003397,-0.001884,-0.008103,-0.001716,0.004011,-0.000352,-0.009748,-0.001478,-0.017547,-0.006483,0.006483,-0.024817,-0.024235,0.003688,0.045363
2,American Barleywine,5,0.000320,0.013745,0.009385,0.081906,0.013941,0.019173,0.000595,0.069505,0.005960,0.003624,0.006638,-0.006638,0.074309,-0.023315,0.002511,-0.053505
3,American Brown Ale,6,-0.001043,-0.003690,-0.001764,-0.005430,-0.002723,0.054726,-0.000872,-0.008680,-0.001901,-0.001963,-0.008259,0.008259,-0.008561,0.006061,0.003873,-0.001373
4,American IPA,7,-0.000195,-0.000438,-0.000585,0.005962,0.007800,-0.026929,-0.000220,-0.006976,-0.000302,-0.005301,-0.002337,0.002337,-0.004899,-0.002353,-0.002870,0.010121
5,American Lager,8,-0.000160,0.003182,0.000802,-0.016364,-0.006329,-0.045014,-0.000060,-0.001082,0.002576,0.017665,0.015289,-0.015289,0.012872,0.026831,-0.010978,-0.028724
6,American Light Lager,9,0.003736,0.000996,0.000364,-0.008585,-0.004284,-0.017367,0.003583,-0.018171,0.000968,-0.021230,0.004683,-0.004683,-0.014730,-0.015637,-0.006060,0.036427
7,American Pale Ale,10,-0.000211,-0.001441,-0.001262,-0.011964,-0.000496,-0.033414,-0.000227,-0.012237,-0.000561,0.002829,-0.000293,0.000293,-0.022918,0.019935,-0.001438,0.004421
8,American Porter,11,0.000454,0.000236,0.001018,0.001494,-0.001247,0.117825,0.000456,-0.003799,0.000002,0.002650,-0.000207,0.000207,0.011387,0.010001,-0.011764,-0.009624
9,American Stout,12,-0.000238,0.000689,0.001804,0.005740,0.001098,0.137136,-0.000203,-0.006141,0.000100,-0.006826,-0.001960,0.001960,-0.027593,0.001537,0.007863,0.018193


### Apply t-SNE

In [62]:
from sklearn.manifold import TSNE

In [63]:
beer_recipe_X, beer_recipe_y = beer_recipe.drop(columns = ['Style', 'StyleID']), beer_recipe['Style']

In [64]:
beer_recipe_X.columns

Index(['Size(L)', 'OG', 'FG', 'ABV', 'IBU', 'Color', 'BoilSize', 'BoilTime',
       'BoilGravity', 'Efficiency', 'SugarScale_Plato',
       'SugarScale_Specific Gravity', 'BrewMethod_All Grain',
       'BrewMethod_BIAB', 'BrewMethod_Partial Mash', 'BrewMethod_extract'],
      dtype='object')

In [65]:
tsne = TSNE(n_components=2, n_iter=500, verbose=1, random_state=42)
Z = tsne.fit_transform(beer_recipe_X)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.001s...
[t-SNE] Computed neighbors for 100 samples in 0.005s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 0.050580



invalid value encountered in sqrt



[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.249466
[t-SNE] Error after 500 iterations: 0.259696


In [66]:
Z

array([[-3.687309  ,  0.31392932],
       [-0.8904093 , -4.641138  ],
       [-0.5437613 ,  3.421788  ],
       [ 1.49228   , -1.8889406 ],
       [-3.1260386 , -2.9191785 ],
       [-6.160279  , -1.2546194 ],
       [-2.1388295 , -4.339795  ],
       [-4.7036824 , -3.6756642 ],
       [ 4.2546535 , -1.7286264 ],
       [ 4.557111  , -2.7483552 ],
       [-0.3131212 ,  0.05714021],
       [-4.8173876 , -4.2120824 ],
       [-1.221966  , -6.2056174 ],
       [ 4.4820204 , -0.61859006],
       [-4.2678847 ,  2.0665867 ],
       [-0.20021561,  2.2741625 ],
       [ 0.2794783 , -0.99089694],
       [-2.6089332 , -0.22164519],
       [-4.2353406 , -2.8294673 ],
       [-1.2329682 , -3.8672059 ],
       [-2.3341677 , -0.83796734],
       [-8.264446  , -1.039133  ],
       [-7.329354  , -2.0149455 ],
       [-1.642259  , -2.750493  ],
       [-4.401847  , -2.9017122 ],
       [-4.56748   ,  3.0941317 ],
       [ 0.9476708 , -1.3296547 ],
       [-7.8285093 , -2.7559829 ],
       [-0.31666297,

In [67]:
import plotly.plotly as py
import plotly.graph_objs as go

trace0 = go.Scatter(
    x=Z[:, 0],
    y=Z[:, 1],
    mode='markers+text',
    text=list(beer_recipe_y),
    textposition='top center'
)

data = [trace0]

py.plot(data, filename = 'basic', auto_open=True)

'https://plot.ly/~Jinheon/2'