In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Import Dataset
Dataset use of a different encoding (forware slashes), therefore I use latin1 encoding

In [44]:
beer_recipe = pd.read_csv('./data/recipeData.csv', index_col='BeerID', encoding='latin1')
beer_recipe.head()

Unnamed: 0_level_0,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
BeerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Vanilla Cream Ale,/homebrew/recipe/view/1633/vanilla-cream-ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,4.83,...,1.038,70.0,,Specific Gravity,All Grain,,17.78,corn sugar,4.5 oz,116.0
2,Southern Tier Pumking clone,/homebrew/recipe/view/16367/southern-tier-pumk...,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,15.64,...,1.07,70.0,,Specific Gravity,All Grain,,,,,955.0
3,Zombie Dust Clone - EXTRACT,/homebrew/recipe/view/5920/zombie-dust-clone-e...,American IPA,7,18.93,1.063,1.018,5.91,59.25,8.98,...,,70.0,,Specific Gravity,extract,,,,,
4,Zombie Dust Clone - ALL GRAIN,/homebrew/recipe/view/5916/zombie-dust-clone-a...,American IPA,7,22.71,1.061,1.017,5.8,54.48,8.5,...,,70.0,,Specific Gravity,All Grain,,,,,
5,Bakke Brygg Belgisk Blonde 50 l,/homebrew/recipe/view/89534/bakke-brygg-belgis...,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,4.57,...,1.05,72.0,,Specific Gravity,All Grain,,19.0,Sukkerlake,6-7 g sukker/l,18325.0


### Preprocessing

In [45]:
## Delete URL column mannually
del_columns = ['URL', 'Name']

for column in beer_recipe.columns:
    nullCount = beer_recipe[column].isnull()
    print('{} is null {} % of the time'.format(column, round((nullCount.sum() / len(beer_recipe)) * 100), 2))
    
    if (nullCount.sum() / len(beer_recipe)) > 0.3:
        del_columns.append(column)
        
beer_recipe = beer_recipe.drop(columns = del_columns)

Name is null 0.0 % of the time
URL is null 0.0 % of the time
Style is null 1.0 % of the time
StyleID is null 0.0 % of the time
Size(L) is null 0.0 % of the time
OG is null 0.0 % of the time
FG is null 0.0 % of the time
ABV is null 0.0 % of the time
IBU is null 0.0 % of the time
Color is null 0.0 % of the time
BoilSize is null 0.0 % of the time
BoilTime is null 0.0 % of the time
BoilGravity is null 4.0 % of the time
Efficiency is null 0.0 % of the time
MashThickness is null 40.0 % of the time
SugarScale is null 0.0 % of the time
BrewMethod is null 0.0 % of the time
PitchRate is null 53.0 % of the time
PrimaryTemp is null 31.0 % of the time
PrimingMethod is null 91.0 % of the time
PrimingAmount is null 94.0 % of the time
UserId is null 68.0 % of the time


In [46]:
boilGravity_median = beer_recipe['BoilGravity'].median()
beer_recipe['BoilGravity'].fillna(boilGravity_median, inplace = True)

In [47]:
from math import isnan

style_count = {}

for style in beer_recipe.Style.unique():
    if not isinstance(style, str) and isnan(style):
        continue
    style_count[style] = len(beer_recipe[beer_recipe['Style'] == style])
    
style_count_sorted = sorted(style_count.items(), key=lambda x: (-x[1], x[0]))
style_count_sorted_top30 = [style for (style, count) in style_count_sorted[:30]]

In [48]:
drop_index = []

for index, row in beer_recipe.iterrows():
    if row['Style'] not in style_count_sorted_top30:
        drop_index.append(index)
        
beer_recipe.drop(drop_index, inplace = True)

In [49]:
beer_recipe = pd.get_dummies(beer_recipe, columns = ['SugarScale', 'BrewMethod'])

In [50]:
beer_recipe_X = beer_recipe.drop(columns = ['Style'])

In [51]:
beer_receipe_norm = (beer_recipe_X - beer_recipe_X.mean()) / (beer_recipe_X.max() - beer_recipe_X.min())

In [52]:
beer_recipe[beer_receipe_norm.columns] = beer_receipe_norm

In [53]:
beer_recipe = beer_recipe.groupby('Style', as_index=False).mean()

In [55]:
beer_recipe

Unnamed: 0,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,SugarScale_Plato,SugarScale_Specific Gravity,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
0,American Amber Ale,-0.256515,-0.000455,-0.002684,-0.001647,-0.008156,-0.002893,0.006983,-0.000538,-0.006385,-0.001162,-0.014906,-0.004614,0.004614,-0.020063,-0.023447,0.00363,0.039881
1,American Brown Ale,-0.244819,-0.001592,-0.002978,-0.001511,-0.005482,-0.0039,0.057698,-0.001319,-0.005317,-0.001585,0.000678,-0.00639,0.00639,-0.003808,0.006849,0.003814,-0.006855
2,American IPA,-0.238972,-0.000315,0.000274,-0.00017,0.005909,0.006623,-0.023958,-0.000338,-0.003613,1.3e-05,-0.00266,-0.000468,0.000468,-0.000145,-0.001565,-0.002929,0.004639
3,American Light Lager,-0.227276,0.005613,0.001709,0.000909,-0.008638,-0.00546,-0.014396,0.005377,-0.014808,0.001283,-0.018589,0.006552,-0.006552,-0.009977,-0.014849,-0.006119,0.030944
4,American Pale Ale,-0.221428,-0.000339,-0.000728,-0.00094,-0.012016,-0.001673,-0.030442,-0.00035,-0.008875,-0.000246,0.00547,0.001576,-0.001576,-0.018165,0.020723,-0.001497,-0.001061
5,American Porter,-0.21558,0.000663,0.000948,0.001652,0.001442,-0.002423,0.120797,0.000677,-0.000436,0.000317,0.005291,0.001662,-0.001662,0.016141,0.010789,-0.011823,-0.015107
6,American Stout,-0.209732,-0.000379,0.001402,0.002546,0.005687,-7.8e-05,0.140107,-0.000314,-0.002779,0.000415,-0.004185,-9.2e-05,9.2e-05,-0.022839,0.002325,0.007804,0.01271
7,American Wheat Beer,-0.198036,0.000258,0.001445,-0.001127,-0.014798,-0.006889,-0.042736,0.000375,-0.010531,0.00154,-0.007584,0.009888,-0.009888,-0.017362,0.027841,-0.01682,0.006341
8,American Wheat or Rye Beer,-0.192188,-0.001889,-0.004345,-0.002997,-0.016847,-0.007156,-0.038393,-0.002138,-0.006915,-0.002259,-0.012225,-0.006238,0.006238,-0.061033,-0.038947,0.026154,0.073825
9,Belgian Pale Ale,-0.139556,0.000403,-0.001342,-0.000876,-0.008889,-0.005695,-0.025063,0.000492,0.012669,0.000786,0.001282,0.000249,-0.000249,-0.003138,0.01591,-0.007401,-0.005371


### Apply t-SNE

In [56]:
from sklearn.manifold import TSNE

In [57]:
beer_recipe_X, beer_recipe_y = beer_recipe.drop(columns = ['Style']), beer_recipe['Style']

In [58]:
tsne = TSNE(n_components=2, n_iter=10000, verbose=1, random_state=42)
Z = tsne.fit_transform(beer_recipe_X)

[t-SNE] Computing 29 nearest neighbors...
[t-SNE] Indexed 30 samples in 0.000s...
[t-SNE] Computed neighbors for 30 samples in 0.034s...
[t-SNE] Computed conditional probabilities for sample 30 / 30
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 40.451550
[t-SNE] Error after 1350 iterations: 0.498864


In [88]:
import plotly
plotly.tools.set_credentials_file(username='Jinheon', api_key='Olp7Ss5A2VZVyLMzqwZr')

In [90]:
import plotly.plotly as py
import plotly.graph_objs as go

trace0 = go.Scatter(
    x=Z[:, 0],
    y=Z[:, 1],
    mode='markers+text',
    text=list(beer_recipe_y),
    textposition='top center'
)

data = [trace0]

py.plot(data, filename = 'basic', auto_open=True)

'https://plot.ly/~Jinheon/2'