In [71]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [61]:
import pandas as pd
pd.set_option('display.max_columns', 23)

df = pd.read_csv('./recipeData.csv', encoding='latin-1')

In [62]:
df.shape

(73861, 23)

In [63]:
# Get style of beer if has more than 1000 samples
df = df[df['Style'].isin(df['Style'].value_counts()[df['Style'].value_counts() > 1000].index)]

In [65]:
df.shape

(35424, 23)

In [14]:
df = df.drop(['BeerID', 'Name', 'URL', 'StyleID', 'UserId'], axis=1)

In [17]:
df.dtypes

Style             object
Size(L)          float64
OG               float64
FG               float64
ABV              float64
IBU              float64
Color            float64
BoilSize         float64
BoilTime           int64
BoilGravity      float64
Efficiency       float64
MashThickness    float64
SugarScale        object
BrewMethod        object
PitchRate        float64
PrimaryTemp      float64
PrimingMethod     object
PrimingAmount     object
dtype: object

In [35]:
df.isna().sum()

Style                0
Size(L)              0
OG                   0
FG                   0
ABV                  0
IBU                  0
Color                0
BoilSize             0
BoilTime             0
BoilGravity          0
Efficiency           0
MashThickness        0
SugarScale           0
BrewMethod           0
PitchRate            0
PrimaryTemp          0
PrimingMethod    32534
PrimingAmount    33369
dtype: int64

In [41]:
df = df.drop(['PrimingMethod', 'PrimingAmount'], axis=1)

In [46]:
df['SugarScale'] = df['SugarScale'].map({'Specific Gravity': 0, 'Plato': 1})

In [52]:
df = pd.get_dummies(df, columns = ['BrewMethod'])

In [54]:
df['Style'].unique()

array(['American IPA', 'American Pale Ale', 'Imperial IPA', 'Saison',
       'Blonde Ale', 'American Brown Ale', 'American Amber Ale',
       'Witbier', 'American Stout', 'Irish Red Ale',
       'American Light Lager', 'California Common Beer'], dtype=object)

In [57]:
df['Style'] = pd.Categorical(df['Style']).codes

In [68]:


le = LabelEncoder()
df['Style_encoded'] = le.fit_transform(df['Style'])

In [72]:
scaler = MinMaxScaler()
X = df.drop(['Style', 'Style_encoded'], axis=1)
y = df['Style_encoded']

In [73]:
X_scaled = scaler.fit_transform(X)

In [78]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = [
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    SVC()
]

for model in models: 
    print(model)
    print(cross_val_score(model, X_scaled, y, cv=5, n_jobs=4).mean())
    print('------------------')

RandomForestClassifier()
0.6158533727939393
------------------
GradientBoostingClassifier()
0.6188738505212379
------------------
AdaBoostClassifier()
0.5450258397366495
------------------
BaggingClassifier()
0.5784211440733179
------------------
ExtraTreesClassifier()
0.5991698409289155
------------------
LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.4326446349820901
------------------
DecisionTreeClassifier()
0.4735771049851625
------------------
KNeighborsClassifier()


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fc823cc2de0>
Traceback (most recent call last):
  File "/home/leo/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/leo/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe3fd5abec0>
Traceback (most recent call last):
  File "/home/leo/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
^^^^^^^^^    self._make_module_from_path(filepath)
^^  File "/home/leo/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", lin

0.4868167412961988
------------------
SVC()
0.46756446983411487
------------------


In [80]:
from sklearn.model_selection import GridSearchCV
import numpy as np

param_grid = {
  'learning_rate': [0.1, 0.05, 0.01],
  'max_depth': [3, 4, 6],
  'min_samples_leaf': [1, 3, 5]
}

model = GradientBoostingClassifier(random_state=28064212)

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', n_jobs=5, cv=5, verbose=1)
grid.fit(X_scaled, y)

print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
