In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

### Import Dataset
Dataset use of a different encoding (forware slashes), therefore I use latin1 encoding

In [2]:
beer_recipe = pd.read_csv('./data/recipeData.csv', index_col='BeerID', encoding='latin1')
beer_recipe.head()

Unnamed: 0_level_0,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
BeerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Vanilla Cream Ale,/homebrew/recipe/view/1633/vanilla-cream-ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,4.83,...,1.038,70.0,,Specific Gravity,All Grain,,17.78,corn sugar,4.5 oz,116.0
2,Southern Tier Pumking clone,/homebrew/recipe/view/16367/southern-tier-pumk...,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,15.64,...,1.07,70.0,,Specific Gravity,All Grain,,,,,955.0
3,Zombie Dust Clone - EXTRACT,/homebrew/recipe/view/5920/zombie-dust-clone-e...,American IPA,7,18.93,1.063,1.018,5.91,59.25,8.98,...,,70.0,,Specific Gravity,extract,,,,,
4,Zombie Dust Clone - ALL GRAIN,/homebrew/recipe/view/5916/zombie-dust-clone-a...,American IPA,7,22.71,1.061,1.017,5.8,54.48,8.5,...,,70.0,,Specific Gravity,All Grain,,,,,
5,Bakke Brygg Belgisk Blonde 50 l,/homebrew/recipe/view/89534/bakke-brygg-belgis...,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,4.57,...,1.05,72.0,,Specific Gravity,All Grain,,19.0,Sukkerlake,6-7 g sukker/l,18325.0


### Preprocessing

In [3]:
## Delete URL column mannually
del_columns = ['URL', 'Name']

for column in beer_recipe.columns:
    nullCount = beer_recipe[column].isnull()
    print('{} is null {} % of the time'.format(column, round((nullCount.sum() / len(beer_recipe)) * 100), 2))
    
    if (nullCount.sum() / len(beer_recipe)) > 0.3:
        del_columns.append(column)
        
beer_recipe = beer_recipe.drop(columns = del_columns)

Name is null 0.0 % of the time
URL is null 0.0 % of the time
Style is null 1.0 % of the time
StyleID is null 0.0 % of the time
Size(L) is null 0.0 % of the time
OG is null 0.0 % of the time
FG is null 0.0 % of the time
ABV is null 0.0 % of the time
IBU is null 0.0 % of the time
Color is null 0.0 % of the time
BoilSize is null 0.0 % of the time
BoilTime is null 0.0 % of the time
BoilGravity is null 4.0 % of the time
Efficiency is null 0.0 % of the time
MashThickness is null 40.0 % of the time
SugarScale is null 0.0 % of the time
BrewMethod is null 0.0 % of the time
PitchRate is null 53.0 % of the time
PrimaryTemp is null 31.0 % of the time
PrimingMethod is null 91.0 % of the time
PrimingAmount is null 94.0 % of the time
UserId is null 68.0 % of the time


In [4]:
boilGravity_median = beer_recipe['BoilGravity'].median()
beer_recipe['BoilGravity'].fillna(boilGravity_median, inplace = True)

In [5]:
from math import isnan

style_count = {}

for style in beer_recipe.Style.unique():
    if not isinstance(style, str) and isnan(style):
        continue
    style_count[style] = len(beer_recipe[beer_recipe['Style'] == style])
    
style_count_sorted = sorted(style_count.items(), key=lambda x: (-x[1], x[0]))
style_count_sorted_top100 = [style for (style, count) in style_count_sorted[:100]]

In [6]:
drop_index = []

for index, row in beer_recipe.iterrows():
    if row['Style'] not in style_count_sorted_top100:
        drop_index.append(index)
        
beer_recipe.drop(drop_index, inplace = True)

In [7]:
beer_recipe = pd.get_dummies(beer_recipe, columns = ['SugarScale', 'BrewMethod'])

In [8]:
beer_recipe_X = beer_recipe.drop(columns = ['Style', 'StyleID'])

In [9]:
beer_receipe_norm = (beer_recipe_X - beer_recipe_X.mean()) / (beer_recipe_X.max() - beer_recipe_X.min())

In [10]:
beer_recipe[beer_receipe_norm.columns] = beer_receipe_norm

In [11]:
beer_recipe = beer_recipe.groupby('Style').apply(lambda x: x.sample(10)).reset_index(drop=True)

In [12]:
beer_recipe

Unnamed: 0,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,SugarScale_Plato,SugarScale_Specific Gravity,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
0,Altbier,1,-0.004374,-0.011335,-0.006161,-0.026098,-0.002821,-0.000602,-0.003970,-0.021954,-0.006163,0.036542,-0.02562,0.02562,0.326508,-0.164078,-0.047342,-0.115088
1,Altbier,1,0.038697,-0.011431,-0.006451,-0.024088,-0.005939,-0.009472,0.036100,-0.021954,-0.005612,0.136542,-0.02562,0.02562,0.326508,-0.164078,-0.047342,-0.115088
2,Altbier,1,-0.002286,-0.011177,-0.006451,-0.005630,0.002162,0.001979,-0.002255,0.103046,-0.005631,0.086542,-0.02562,0.02562,0.326508,-0.164078,-0.047342,-0.115088
3,Altbier,1,0.006520,-0.011272,-0.006161,-0.022260,-0.002006,0.066603,0.005994,0.103046,-0.005555,0.086542,-0.02562,0.02562,-0.673492,0.835922,-0.047342,-0.115088
4,Altbier,1,-0.002286,-0.011272,-0.006547,-0.012392,0.000757,-0.000655,-0.002976,-0.021954,-0.005745,-0.262458,-0.02562,0.02562,-0.673492,-0.164078,0.952658,-0.115088
5,Altbier,1,-0.002523,-0.011208,-0.005677,-0.025550,0.000223,0.002355,-0.002214,-0.021954,-0.005745,0.086542,-0.02562,0.02562,0.326508,-0.164078,-0.047342,-0.115088
6,Altbier,1,-0.002626,-0.011208,-0.006161,-0.014767,-0.000419,0.003431,-0.002897,0.040546,-0.005555,0.086542,-0.02562,0.02562,-0.673492,0.835922,-0.047342,-0.115088
7,Altbier,1,-0.002523,-0.011272,-0.006257,-0.019153,-0.001202,0.013915,-0.002409,-0.021954,-0.005726,0.036542,-0.02562,0.02562,0.326508,-0.164078,-0.047342,-0.115088
8,Altbier,1,-0.002523,-0.011018,-0.005774,-0.010930,-0.003367,0.009183,-0.002214,0.103046,-0.005669,0.116542,-0.02562,0.02562,-0.673492,0.835922,-0.047342,-0.115088
9,Altbier,1,-0.000259,-0.011335,-0.006064,-0.029022,-0.001387,0.016442,0.000127,0.061380,-0.005726,0.086542,-0.02562,0.02562,0.326508,-0.164078,-0.047342,-0.115088


In [13]:
beer_recipe = beer_recipe.groupby('Style', as_index=False).mean()

### Apply t-SNE

In [14]:
from sklearn.manifold import TSNE

In [15]:
beer_recipe_X, beer_recipe_y = beer_recipe.drop(columns = ['Style', 'StyleID']), beer_recipe['Style']

In [16]:
beer_recipe_X.columns

Index(['Size(L)', 'OG', 'FG', 'ABV', 'IBU', 'Color', 'BoilSize', 'BoilTime',
       'BoilGravity', 'Efficiency', 'SugarScale_Plato',
       'SugarScale_Specific Gravity', 'BrewMethod_All Grain',
       'BrewMethod_BIAB', 'BrewMethod_Partial Mash', 'BrewMethod_extract'],
      dtype='object')

In [17]:
tsne = TSNE(n_components=2, n_iter=2000, verbose=1, random_state=42)
Z = tsne.fit_transform(beer_recipe_X)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.001s...
[t-SNE] Computed neighbors for 100 samples in 0.006s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 0.141465


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


[t-SNE] KL divergence after 250 iterations with early exaggeration: 55.063694
[t-SNE] Error after 1050 iterations: 0.269177


In [19]:
import plotly.plotly as py
import plotly.graph_objs as go

trace0 = go.Scatter(
    x=Z[:, 0],
    y=Z[:, 1],
    mode='markers+text',
    text=list(beer_recipe_y),
    textposition='top center'
)

data = [trace0]

py.plot(data, filename = 'basic', auto_open=True)

'https://plot.ly/~Jinheon/2'