In [1]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt

### Import Dataset
The dataset uses a different encoding (forward slashes), so I read it with the latin1 encoding.

In [2]:
# Read the recipe table, decoding the file as latin1 and using the BeerID
# column as the row index, then preview the first five rows.
beer_recipe = pd.read_csv(
    './data/recipeData.csv',
    encoding='latin1',
    index_col='BeerID',
)
beer_recipe.head(n=5)

Unnamed: 0_level_0,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
BeerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Vanilla Cream Ale,/homebrew/recipe/view/1633/vanilla-cream-ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,4.83,...,1.038,70.0,,Specific Gravity,All Grain,,17.78,corn sugar,4.5 oz,116.0
2,Southern Tier Pumking clone,/homebrew/recipe/view/16367/southern-tier-pumk...,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,15.64,...,1.07,70.0,,Specific Gravity,All Grain,,,,,955.0
3,Zombie Dust Clone - EXTRACT,/homebrew/recipe/view/5920/zombie-dust-clone-e...,American IPA,7,18.93,1.063,1.018,5.91,59.25,8.98,...,,70.0,,Specific Gravity,extract,,,,,
4,Zombie Dust Clone - ALL GRAIN,/homebrew/recipe/view/5916/zombie-dust-clone-a...,American IPA,7,22.71,1.061,1.017,5.8,54.48,8.5,...,,70.0,,Specific Gravity,All Grain,,,,,
5,Bakke Brygg Belgisk Blonde 50 l,/homebrew/recipe/view/89534/bakke-brygg-belgis...,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,4.57,...,1.05,72.0,,Specific Gravity,All Grain,,19.0,Sukkerlake,6-7 g sukker/l,18325.0


In [3]:
# List every column label of the freshly loaded frame.
print(beer_recipe.columns)

Index(['Name', 'URL', 'Style', 'StyleID', 'Size(L)', 'OG', 'FG', 'ABV', 'IBU',
       'Color', 'BoilSize', 'BoilTime', 'BoilGravity', 'Efficiency',
       'MashThickness', 'SugarScale', 'BrewMethod', 'PitchRate', 'PrimaryTemp',
       'PrimingMethod', 'PrimingAmount', 'UserId'],
      dtype='object')


In [4]:
# Summarise the frame (row count, column count, dtypes, memory usage).
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() -- doing so emits a stray "None" after the report.
beer_recipe.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73861 entries, 1 to 73861
Columns: 22 entries, Name to UserId
dtypes: float64(13), int64(2), object(7)
memory usage: 13.0+ MB
None


### Drop Large Missing Value Columns

In [5]:
## Columns dropped manually, whatever their share of missing values
del_columns = [
    'URL',
    'Name',
]

In [6]:
# Report the share of missing values per column and schedule every column
# that is more than 30 % empty for removal.
null_fraction = beer_recipe.isnull().mean()

for column, fraction in null_fraction.items():
    # round(..., 2) keeps two decimals; the 2 belongs to round(), not format().
    print('{} is null {} % of the time'.format(column, round(fraction * 100, 2)))

    # The membership test keeps the list free of duplicates when the cell is
    # executed more than once.
    if fraction > 0.3 and column not in del_columns:
        del_columns.append(column)

Name is null 0.0 % of the time
URL is null 0.0 % of the time
Style is null 1.0 % of the time
StyleID is null 0.0 % of the time
Size(L) is null 0.0 % of the time
OG is null 0.0 % of the time
FG is null 0.0 % of the time
ABV is null 0.0 % of the time
IBU is null 0.0 % of the time
Color is null 0.0 % of the time
BoilSize is null 0.0 % of the time
BoilTime is null 0.0 % of the time
BoilGravity is null 4.0 % of the time
Efficiency is null 0.0 % of the time
MashThickness is null 40.0 % of the time
SugarScale is null 0.0 % of the time
BrewMethod is null 0.0 % of the time
PitchRate is null 53.0 % of the time
PrimaryTemp is null 31.0 % of the time
PrimingMethod is null 91.0 % of the time
PrimingAmount is null 94.0 % of the time
UserId is null 68.0 % of the time


In [7]:
# Show the complete list of columns scheduled for removal.
del_columns

['URL',
 'Name',
 'MashThickness',
 'PitchRate',
 'PrimaryTemp',
 'PrimingMethod',
 'PrimingAmount',
 'UserId']

In [8]:
# Remove every column collected in del_columns and rebind the frame.
beer_recipe = beer_recipe.drop(labels=del_columns, axis='columns')

In [9]:
# Preview the first rows of the frame after the column removal.
beer_recipe.head()

Unnamed: 0_level_0,Style,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,SugarScale,BrewMethod
BeerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,4.83,28.39,75,1.038,70.0,Specific Gravity,All Grain
2,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,15.64,24.61,60,1.07,70.0,Specific Gravity,All Grain
3,American IPA,7,18.93,1.063,1.018,5.91,59.25,8.98,22.71,60,,70.0,Specific Gravity,extract
4,American IPA,7,22.71,1.061,1.017,5.8,54.48,8.5,26.5,60,,70.0,Specific Gravity,All Grain
5,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,4.57,60.0,90,1.05,72.0,Specific Gravity,All Grain


### Imputation of Missing Value Columns
* BoilGravity Column

In [10]:
# Fill missing BoilGravity values with the column median.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selected column: an in-place fill on a column selection is chained
# assignment, which is deprecated in pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
boilGravity_median = beer_recipe['BoilGravity'].median()
beer_recipe['BoilGravity'] = beer_recipe['BoilGravity'].fillna(boilGravity_median)

### Class Imbalance

#### Style
On the Brewer's Friend website, each style is described by two columns
(Style name and Category).

For example, the style name American IPA has the categories IPA and India Pale Ale (IPA). <br>
Likewise, the style name Cream Ale has the categories Light Hybrid Beer and Standard American Beer.

However, neither style nor category is a subset of the other.<br>
The category above (IPA) also contains other styles, such as Specialty IPA: Belgian IPA, ...

* This dataset contains only the style information.

In [11]:
# Compare the number of distinct style names with the number of distinct IDs.
n_style_names = beer_recipe['Style'].nunique()
n_style_ids = beer_recipe['StyleID'].nunique()

print('There are {} different styles of beer'.format(n_style_names))
print('There are {} different styles ID of beer'.format(n_style_ids))

There are 175 different styles of beer
There are 176 different styles ID of beer


* The Style and StyleID counts do not match because some rows have a StyleID but NaN in the Style field.

In [12]:
from math import isnan

In [13]:
# Number of recipes per style name, as a plain {style: count} dict.
# value_counts() leaves out NaN styles by default -- the same rows the
# per-style loop used to skip -- and counts everything in a single pass
# instead of re-filtering the whole frame once for every style.
style_count = {
    style: int(count)
    for style, count in beer_recipe['Style'].value_counts().items()
}

In [14]:
# Order the (style, count) pairs by descending count; ties are broken
# alphabetically by style name.
style_count_sorted = sorted(
    style_count.items(),
    key=lambda pair: (-pair[1], pair[0]),
)

In [15]:
# Show the (style, count) pairs, most frequent first.
style_count_sorted

[('American IPA', 11940),
 ('American Pale Ale', 7581),
 ('Saison', 2617),
 ('American Light Lager', 2277),
 ('American Amber Ale', 2038),
 ('Blonde Ale', 1753),
 ('Imperial IPA', 1478),
 ('American Stout', 1268),
 ('Irish Red Ale', 1204),
 ('American Brown Ale', 1152),
 ('Witbier', 1072),
 ('California Common Beer', 1044),
 ('Weissbier', 988),
 ('Oatmeal Stout', 961),
 ('Russian Imperial Stout', 929),
 ('Sweet Stout', 919),
 ('Weizen/Weissbier', 919),
 ('Robust Porter', 897),
 ('Kölsch', 869),
 ('Double IPA', 864),
 ('Cream Ale', 830),
 ('American Porter', 787),
 ('English IPA', 784),
 ('Imperial Stout', 675),
 ('Extra Special/Strong Bitter (ESB)', 659),
 ('American Wheat Beer', 654),
 ('Specialty IPA: Black IPA', 638),
 ('Belgian Pale Ale', 625),
 ('American Wheat or Rye Beer', 571),
 ('Belgian Tripel', 563),
 ('Belgian Dark Strong Ale', 538),
 ('Belgian Dubbel', 530),
 ('Spice  Herb  or Vegetable Beer', 514),
 ('Fruit Beer', 502),
 ('Belgian Blond Ale', 496),
 ('Dry Stout', 484),
 (

In [16]:
# Names of the most frequent styles. NOTE: despite the "top10" suffix, the
# slice keeps the first 100 entries of the sorted list.
style_count_sorted_top10 = [name for name, _ in style_count_sorted[:100]]

### Drop the rows whose style is not among the 100 most frequent

In [17]:
# Show the style names that will be kept.
style_count_sorted_top10

['American IPA',
 'American Pale Ale',
 'Saison',
 'American Light Lager',
 'American Amber Ale',
 'Blonde Ale',
 'Imperial IPA',
 'American Stout',
 'Irish Red Ale',
 'American Brown Ale',
 'Witbier',
 'California Common Beer',
 'Weissbier',
 'Oatmeal Stout',
 'Russian Imperial Stout',
 'Sweet Stout',
 'Weizen/Weissbier',
 'Robust Porter',
 'Kölsch',
 'Double IPA',
 'Cream Ale',
 'American Porter',
 'English IPA',
 'Imperial Stout',
 'Extra Special/Strong Bitter (ESB)',
 'American Wheat Beer',
 'Specialty IPA: Black IPA',
 'Belgian Pale Ale',
 'American Wheat or Rye Beer',
 'Belgian Tripel',
 'Belgian Dark Strong Ale',
 'Belgian Dubbel',
 'Spice  Herb  or Vegetable Beer',
 'Fruit Beer',
 'Belgian Blond Ale',
 'Dry Stout',
 'Strong Bitter',
 'German Pilsner (Pils)',
 'Specialty IPA: Red IPA',
 'Brown Porter',
 'Experimental Beer',
 'Specialty Beer',
 'Belgian Specialty Ale',
 'Berliner Weisse',
 'Vienna Lager',
 'Best Bitter',
 'British Brown Ale',
 'Oktoberfest/Märzen',
 'Foreign Extr

In [18]:
# Index labels of every recipe whose style is not in the kept list.
# isin() is vectorised and, like the row-by-row `not in` test it replaces,
# treats a NaN style as "not kept", so those rows are selected for removal too.
keep_mask = beer_recipe['Style'].isin(style_count_sorted_top10)
drop_index = beer_recipe.index[~keep_mask].tolist()

In [19]:
# Remove the collected rows from the frame in place.
beer_recipe.drop(index=drop_index, inplace=True)

### Random Forest

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [21]:
## Split dev & test
train, test = train_test_split(beer_recipe, test_size = 0.3, random_state=42)

train_X, train_y = train.drop(columns = ['Style']), train['Style']
test_X, test_y = test.drop(columns = ['Style']), test['Style']

## One-hot
train_X = pd.get_dummies(train_X)
test_X = pd.get_dummies(test_X)

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(48853, 17) (48853,) (20937, 17) (20937,)


In [22]:
# Preview the encoded training features.
train_X.head()

Unnamed: 0_level_0,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,SugarScale_Plato,SugarScale_Specific Gravity,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
BeerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5248,65,22.71,1.059,1.016,5.56,53.83,11.77,28.39,60,1.047,75.0,0,1,1,0,0,0
38041,10,18.93,1.058,1.015,5.75,45.07,6.52,24.61,60,1.045,35.0,0,1,0,0,0,1
44670,10,22.0,1.054,1.01,5.77,33.7,6.45,28.4,60,1.042,75.0,0,1,1,0,0,0
15132,85,9.5,1.102,1.026,10.05,29.43,44.34,11.0,60,1.088,65.0,0,1,1,0,0,0
33018,134,18.93,1.062,1.011,6.64,28.55,3.59,24.61,60,1.048,70.0,0,1,1,0,0,0


In [23]:
# List the feature columns produced by the one-hot encoding.
train_X.columns

Index(['StyleID', 'Size(L)', 'OG', 'FG', 'ABV', 'IBU', 'Color', 'BoilSize',
       'BoilTime', 'BoilGravity', 'Efficiency', 'SugarScale_Plato',
       'SugarScale_Specific Gravity', 'BrewMethod_All Grain',
       'BrewMethod_BIAB', 'BrewMethod_Partial Mash', 'BrewMethod_extract'],
      dtype='object')

In [24]:
# Random forest with 100 trees and a fixed seed.
classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [25]:
# Train the forest on the training features and their style labels.
classifier.fit(X=train_X, y=train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
# Predict a style for every held-out recipe, then compute the fraction of
# predictions that match the true style.
predicted = classifier.predict(X=test_X)
accuracy = accuracy_score(y_true=test_y, y_pred=predicted)

In [28]:
# Class-probability vector for the first test recipe. Only that one row is
# passed to the model; scoring the whole test set just to display a single
# row of the result is wasted work.
classifier.predict_proba(test_X.iloc[:1])[0]

array([0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.03, 0.  ,
       0.  , 0.  , 0.02, 0.  , 0.04, 0.03, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.02, 0.  , 0.  , 0.  ,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.03, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.02, 0.  , 0.01, 0.04, 0.  , 0.  , 0.  , 0.  , 0.07,
       0.01, 0.01, 0.  , 0.01, 0.01, 0.  , 0.  , 0.02, 0.  , 0.01, 0.  ,
       0.  , 0.  , 0.01, 0.25, 0.  , 0.01, 0.  , 0.05, 0.12, 0.06, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.02, 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.02, 0.03, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01,
       0.  ])

In [29]:
# Report the test-set accuracy computed earlier in the notebook.
print(accuracy)

0.8801643024311029


In [30]:
from sklearn.metrics import f1_score

In [31]:
# Macro-averaged F1 over all styles. scikit-learn metrics take
# (y_true, y_pred) in that order -- the same order used for accuracy_score.
f1_score(test_y, predicted, average='macro')

0.7389565810980387