In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/winequalityN.csv')
df = df.fillna(df.mean())

In [3]:
X = df.drop(columns=['type', 'quality'])
y = df['quality']

In [4]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE(k_neighbors=4)
# transform the dataset
X, y = oversample.fit_resample(X, y)

In [5]:
y.value_counts()

3    2836
4    2836
5    2836
6    2836
7    2836
8    2836
9    2836
Name: quality, dtype: int64

In [6]:
cd ..

/home/hrach2003/Projects/HTI_ML/WineQuality/src


In [7]:
from utils import classify

In [8]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
classify(rf, X, y)

Accuracy: 88.9196675900277
CV Score: 82.61137960541382
              precision    recall  f1-score   support

           3       1.00      0.98      0.99       586
           4       0.97      0.91      0.94       605
           5       0.76      0.80      0.78       542
           6       0.65      0.74      0.69       504
           7       0.87      0.83      0.85       591
           8       0.98      0.95      0.96       618
           9       1.00      1.00      1.00       525

    accuracy                           0.89      3971
   macro avg       0.89      0.89      0.89      3971
weighted avg       0.90      0.89      0.89      3971



(88.9196675900277, 82.61137960541382)

In [9]:
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier()
classify(et, X, y)

Accuracy: 89.34777134223117
CV Score: 83.56340014221495
              precision    recall  f1-score   support

           3       1.00      0.99      1.00       580
           4       0.98      0.92      0.95       601
           5       0.78      0.79      0.78       561
           6       0.64      0.75      0.69       493
           7       0.89      0.83      0.85       602
           8       0.97      0.96      0.97       608
           9       1.00      1.00      1.00       526

    accuracy                           0.89      3971
   macro avg       0.89      0.89      0.89      3971
weighted avg       0.90      0.89      0.90      3971



(89.34777134223117, 83.56340014221495)

In [10]:
from models.RandomForestClassifier import getRandomizedSearchParams, searchBestParamsRandomizedSearch, getGridSearchParams, gridSearch
random_grid = getRandomizedSearchParams()

{'bootstrap': [True, False],
 'max_depth': [10, 15, 20, 25, 30, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000]}


In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
searchBestParamsRandomizedSearch(rf, x_train, y_train, random_grid)
#  result
# {
#  'n_estimators': 600,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 'auto',
#  'max_depth': None,
#  'bootstrap': False
# }

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   2.5s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   2.5s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   2.5s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.1s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.1s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.1s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estima

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False}

In [12]:
best_params_rf = {
 'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False
}
rf_rand = RandomForestClassifier(**best_params_rf)
classify(rf_rand, X, y)

Accuracy: 89.97733568370687
CV Score: 83.74978670931
              precision    recall  f1-score   support

           3       1.00      0.99      0.99       584
           4       0.98      0.93      0.95       597
           5       0.78      0.80      0.79       555
           6       0.67      0.76      0.71       506
           7       0.89      0.84      0.87       598
           8       0.98      0.97      0.97       606
           9       1.00      1.00      1.00       525

    accuracy                           0.90      3971
   macro avg       0.90      0.90      0.90      3971
weighted avg       0.91      0.90      0.90      3971



(89.97733568370687, 83.74978670931)

In [14]:
grid_params = getGridSearchParams()
grid_params

{'n_estimators': [500, 550, 600, 650, 700],
 'min_samples_split': [2, 3, 4],
 'min_samples_leaf': [1],
 'max_features': ['auto'],
 'max_depth': [None, 5, 8],
 'bootstrap': [False]}

In [15]:
gridSearch(rf, x_train, y_train, grid_params)

RandomForestClassifier(bootstrap=False, n_estimators=600)

## Increasing the accuracy from 88.9196675900277 to 89.97733568370687
## by 1%

In [35]:
best_params_et = {
    'n_estimators': [100, 300, 500, 600, 700],
    'min_samples_split': [1,4,7,10],
    'min_samples_leaf': [1,2,3,5],
    'max_features': ['auto'],
    'max_depth': [None, 1, 2],
    'bootstrap': [False],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto','sqrt','log2']
}

In [36]:
res = searchBestParamsRandomizedSearch(et, x_train, y_train, best_params_et)
res

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=False, criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=600; total time=   4.2s
[CV] END bootstrap=False, criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=600; total time=   4.2s
[CV] END bootstrap=False, criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=600; total time=   4.2s
[CV] END bootstrap=False, criterion=gini, max_depth=2, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, criterion=gini, max_depth=2, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, criterion=gini, max_depth=2, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=100; total time=   0.2s
[CV] END bootstra

[CV] END bootstrap=False, criterion=entropy, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   1.8s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=300; total time=   0.5s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=300; total time=   0.5s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=300; total time=   0.5s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=sqrt, min_sam

[CV] END bootstrap=False, criterion=entropy, max_depth=None, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=500; total time=   3.0s
[CV] END bootstrap=False, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=700; total time=   3.9s
[CV] END bootstrap=False, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=700; total time=   4.0s
[CV] END bootstrap=False, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=700; total time=   4.0s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=7, n_estimators=700; total time=   1.1s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=7, n_estimators=700; total time=   1.1s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=auto,

{'n_estimators': 600,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': False}

In [39]:
grid_params = {
    'n_estimators': [550, 600, 650],
    'min_samples_split': [3, 4, 5, 6],
    'min_samples_leaf': [1],
    'max_features': ["auto", "sqrt", "log2"],
    'max_depth': [None],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [False]
}
res = gridSearch(rf, x_train, y_train, grid_params)
res

KeyboardInterrupt: 

In [37]:
et_rand = ExtraTreesClassifier(random_state=42, **res,)
classify(et_rand, X, y)

Accuracy: 89.82624024175271
CV Score: 83.7447362394996
              precision    recall  f1-score   support

           3       1.00      0.99      1.00       580
           4       0.98      0.92      0.95       604
           5       0.78      0.82      0.79       541
           6       0.67      0.76      0.71       504
           7       0.89      0.82      0.85       610
           8       0.98      0.97      0.97       606
           9       1.00      1.00      1.00       526

    accuracy                           0.90      3971
   macro avg       0.90      0.90      0.90      3971
weighted avg       0.90      0.90      0.90      3971



(89.82624024175271, 83.7447362394996)