In [63]:
import numpy as np, pandas as pd, tensorflow as tf, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

***
## Análisis exploratorio de datos

In [41]:
data_set = pd.read_csv("recipeData.csv", encoding = "latin-1")
data_set.head(2)

Unnamed: 0,BeerID,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
0,1,Vanilla Cream Ale,/homebrew/recipe/view/1633/vanilla-cream-ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,...,1.038,70.0,,Specific Gravity,All Grain,,17.78,corn sugar,4.5 oz,116.0
1,2,Southern Tier Pumking clone,/homebrew/recipe/view/16367/southern-tier-pumk...,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,...,1.07,70.0,,Specific Gravity,All Grain,,,,,955.0


In [42]:
data_shape = data_set.shape
data_shape

(73861, 23)

In [43]:
col_name = data_set.columns
col_name

Index(['BeerID', 'Name', 'URL', 'Style', 'StyleID', 'Size(L)', 'OG', 'FG',
       'ABV', 'IBU', 'Color', 'BoilSize', 'BoilTime', 'BoilGravity',
       'Efficiency', 'MashThickness', 'SugarScale', 'BrewMethod', 'PitchRate',
       'PrimaryTemp', 'PrimingMethod', 'PrimingAmount', 'UserId'],
      dtype='object')

In [44]:
data_set.isnull().sum()

BeerID               0
Name                 1
URL                  0
Style              596
StyleID              0
Size(L)              0
OG                   0
FG                   0
ABV                  0
IBU                  0
Color                0
BoilSize             0
BoilTime             0
BoilGravity       2990
Efficiency           0
MashThickness    29864
SugarScale           0
BrewMethod           0
PitchRate        39252
PrimaryTemp      22662
PrimingMethod    67095
PrimingAmount    69087
UserId           50490
dtype: int64

***
Se eliminan las siguientes features (no son relevantes o tienen más del 50% de la data perdida)
* BeerID
* Name
* URL
* Style
* PrimingMethod
* PrimingAmount
* UserId

In [45]:
col_name = col_name.drop(["BeerID","Name", "URL", "Style", "PrimingMethod", "PrimingAmount", "UserId"])
col_name

Index(['StyleID', 'Size(L)', 'OG', 'FG', 'ABV', 'IBU', 'Color', 'BoilSize',
       'BoilTime', 'BoilGravity', 'Efficiency', 'MashThickness', 'SugarScale',
       'BrewMethod', 'PitchRate', 'PrimaryTemp'],
      dtype='object')

In [46]:
data_set = data_set[col_name]
data_set.head(2)

Unnamed: 0,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp
0,45,21.77,1.055,1.013,5.48,17.65,4.83,28.39,75,1.038,70.0,,Specific Gravity,All Grain,,17.78
1,85,20.82,1.083,1.021,8.16,60.65,15.64,24.61,60,1.07,70.0,,Specific Gravity,All Grain,,


In [47]:
data_set.isnull().sum()

StyleID              0
Size(L)              0
OG                   0
FG                   0
ABV                  0
IBU                  0
Color                0
BoilSize             0
BoilTime             0
BoilGravity       2990
Efficiency           0
MashThickness    29864
SugarScale           0
BrewMethod           0
PitchRate        39252
PrimaryTemp      22662
dtype: int64

In [48]:
data_y = to_categorical(data_set.StyleID)
data_y.shape

(73861, 177)

***
### Datos categóricos

In [49]:
data_set.SugarScale.value_counts()

Specific Gravity    71959
Plato                1902
Name: SugarScale, dtype: int64

In [50]:
data_set.BrewMethod.value_counts()

All Grain       49692
BIAB            12016
extract          8626
Partial Mash     3527
Name: BrewMethod, dtype: int64

In [51]:
col_x = col_name.drop(["BoilGravity","MashThickness", "PitchRate", "PrimaryTemp", "SugarScale", "BrewMethod"])
col_x

Index(['StyleID', 'Size(L)', 'OG', 'FG', 'ABV', 'IBU', 'Color', 'BoilSize',
       'BoilTime', 'Efficiency'],
      dtype='object')

#### One Hot Encoding SugarScale

In [52]:
data_set.SugarScale = pd.Categorical(data_set.SugarScale)
data_set.SugarScale = data_set.SugarScale.cat.codes 
data_encoded_SugarScale = to_categorical(data_set.SugarScale)
data_encoded_SugarScale.shape

(73861, 2)

#### One Hot Encoding BrewMethod

In [53]:
data_set.BrewMethod = pd.Categorical(data_set.BrewMethod)
data_set.BrewMethod = data_set.BrewMethod.cat.codes 
data_encoded_BrewMethod = to_categorical(data_set.BrewMethod)
data_encoded_BrewMethod.shape

(73861, 4)

In [56]:
data_x = data_set[col_x]
data_x.shape

(73861, 10)

In [57]:
data_x = pd.concat([data_x, pd.DataFrame(data_encoded_SugarScale, columns = ["SugarScale_SG", "SugarScale_Plato"])], axis=1)
data_x.shape

(73861, 12)

In [58]:
data_x = pd.concat([data_x, pd.DataFrame(data_encoded_BrewMethod, columns = ["BrewMethod_AG", "BrewMethod_B", "BrewMethod_E", "BrewMethod_PM"])], axis=1)
data_x.shape

(73861, 16)

In [59]:
data_x.head()

Unnamed: 0,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,Efficiency,SugarScale_SG,SugarScale_Plato,BrewMethod_AG,BrewMethod_B,BrewMethod_E,BrewMethod_PM
0,45,21.77,1.055,1.013,5.48,17.65,4.83,28.39,75,70.0,0.0,1.0,1.0,0.0,0.0,0.0
1,85,20.82,1.083,1.021,8.16,60.65,15.64,24.61,60,70.0,0.0,1.0,1.0,0.0,0.0,0.0
2,7,18.93,1.063,1.018,5.91,59.25,8.98,22.71,60,70.0,0.0,1.0,0.0,0.0,0.0,1.0
3,7,22.71,1.061,1.017,5.8,54.48,8.5,26.5,60,70.0,0.0,1.0,1.0,0.0,0.0,0.0
4,20,50.0,1.06,1.01,6.48,17.84,4.57,60.0,90,72.0,0.0,1.0,1.0,0.0,0.0,0.0


#### Split data_test y data_train

In [64]:
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, random_state = 0)

In [65]:
model = Sequential()
model.add(Dense(128, input_shape = (16, ), activation = "relu"))
model.add(Dense(64, activation = "relu"))
model.add(Dense(32, activation = "relu"))
model.add(Dense(177, activation = "softmax"))
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [67]:
monitor_val_acc = EarlyStopping(monitor = "val_acc", patience = 5)
modelCheckpoint = ModelCheckpoint("beer_model.hdf5", save_best_only = True)

model.fit(x_train, y_train, epochs = 100, validation_split = 0.2, callbacks = [monitor_val_acc, modelCheckpoint])
accuracy = model.evaluate(x_test, y_test)[1]
print('Accuracy:', accuracy)

Train on 47270 samples, validate on 11818 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Accuracy: 0.8058620456318425
