In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, classification_report

from joblib import dump

In [2]:
# Current working directory of the kernel process; the dataset path is derived from it.
cwd = os.getcwd()

In [3]:
# WineQT.csv is expected in the parent of the working directory.
# os.path.join picks the platform's separator, so the path resolves on
# Linux/macOS as well as Windows (the hard-coded '\\' only worked on Windows).
ip_filepath = os.path.join(os.path.dirname(cwd), 'WineQT.csv')

In [4]:
# Load the raw table; it is left untouched and later steps work on a copy.
wq_df = pd.read_csv(ip_filepath)

In [5]:
# Preview the first rows as a quick sanity check of the load.
print(wq_df.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  Id  
0      9.4        5   0  
1      9.8        5   1  
2      9

In [6]:
# Work on a copy so wq_df keeps the data exactly as it was read from disk.
wq = wq_df.copy()

In [7]:
# The notebook treats 'Id' as a redundant index column, so it is removed
# before modelling. Reassigning (rather than inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after 'Id' is
# already gone no longer raises KeyError.
wq = wq.drop(columns='Id', errors='ignore')
print("Dropped the Id column as it is an index column, and the df has already possess an index column")

Dropped the Id column as it is an index column, and the df has already possess an index column


In [8]:
def replace_spaces(df):
    """Return df's column labels with every space replaced by an underscore.

    The frame itself is not modified; the caller decides whether to assign
    the returned labels back to ``df.columns``.
    """
    labels = df.columns
    return labels.str.replace(" ", "_")
def title_name(df):
    """Return df's column labels converted to title case.

    Uses ``str.title`` semantics, so a label such as ``pH`` becomes ``Ph``.
    The frame itself is not modified.
    """
    labels = df.columns
    return labels.str.title()

In [9]:
# Normalise the column labels in two passes: underscores first, then title case.
for relabel in (replace_spaces, title_name):
    wq.columns = relabel(wq)

In [10]:
# Confirm the renamed labels (underscored, title-cased) and that 'Id' is gone.
print(wq.columns)

Index(['Fixed_Acidity', 'Volatile_Acidity', 'Citric_Acid', 'Residual_Sugar',
       'Chlorides', 'Free_Sulfur_Dioxide', 'Total_Sulfur_Dioxide', 'Density',
       'Ph', 'Sulphates', 'Alcohol', 'Quality'],
      dtype='object')


In [11]:
# Separate the label column from the predictors.
# NOTE: despite its name, `classifiers` holds the feature columns, not models.
target = wq['Quality']
classifiers = wq.drop(columns='Quality')

#### Split train and test

In [12]:
# Hold out 20 % of the rows for evaluation; the fixed seed makes the split repeatable.
f_train, f_test, t_train, t_test = train_test_split(
    classifiers,
    target,
    test_size=0.2,
    random_state=101,
)
print(f'Shape of the f_train: {f_train.shape}')
print(f'Shape of the f_test: {f_test.shape}')
print(f'Shape of the t_train: {t_train.shape}')
print(f'Shape of the t_test: {t_test.shape}')

Shape of the f_train: (914, 11)
Shape of the f_test: (229, 11)
Shape of the t_train: (914,)
Shape of the t_test: (229,)


#### Model Building and Training

In [13]:
# Baseline: a random forest with default hyperparameters and a pinned seed.
rfc = RandomForestClassifier(random_state=101)

# Fit on the training split; the fitted estimator is kept under rfc_mod.
rfc_mod = rfc.fit(f_train, t_train)

# Predict quality labels for the held-out rows.
rfc_pred = rfc_mod.predict(f_test)

In [14]:
# The predictions come from the RandomForestClassifier, so the report is
# labelled accordingly (the earlier "Decision Tree" wording was a mislabel).
# confusion_matrix is called as (true labels, predicted labels).
print("Confusion Matrix of the Random Forest Model: \n {}".format(confusion_matrix(t_test, rfc_pred)))
print("Accuracy score of the Random Forest Model: \n{} %".format(round(accuracy_score(t_test, rfc_pred)*100,2)))

Confusion Matrix of the Decision Tree Model: 
 [[ 0  0  1  0  0  0]
 [ 0  0  4  2  0  0]
 [ 0  0 79 23  0  0]
 [ 0  0 18 66  6  1]
 [ 0  0  1 14 12  0]
 [ 0  0  0  0  1  1]]
Accuracy score of the Decision Tree Model: 
69.0 %


#### Hyperparameter Tuning

In [15]:
# Fresh, unfitted forest that serves as the template estimator for the grid search.
rfc = RandomForestClassifier(random_state = 101)

In [16]:
# Search space for the grid search: 1 * 4 * 5 * 3 * 3 * 4 = 720 combinations.
params = {
    'bootstrap': [True],
    'max_depth': [5, 8, 10, 15],
    'max_features': [2, 3, 4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10, 20, 30, 50]
}

In [17]:
# Exhaustive search over `params`, scored by accuracy with 4-fold
# cross-validation; n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=2,
)

In [18]:
%%time
# Run the search on the training split only; the test split stays unseen.
grid_search.fit(f_train, t_train)

Fitting 4 folds for each of 720 candidates, totalling 2880 fits
Wall time: 1min 56s


GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=101),
             n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [5, 8, 10, 15],
                         'max_features': [2, 3, 4, 5, 6],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [10, 20, 30, 50]},
             scoring='accuracy', verbose=2)

In [19]:
# Display the best-scoring configuration found by the search.
grid_search.best_estimator_

RandomForestClassifier(max_depth=15, max_features=3, min_samples_leaf=3,
                       min_samples_split=8, n_estimators=30, random_state=101)

In [20]:
# Take the winning model straight from the search instead of retyping its
# hyperparameters by hand, which silently goes stale whenever the grid, the
# data or the seed changes. With the default refit=True, GridSearchCV has
# already refit this estimator on the whole training split, so no second
# fit is needed.
rfc = grid_search.best_estimator_
rfc_mod = rfc

# Prediction
rfc_pred = rfc_mod.predict(f_test)

In [21]:
# The predictions come from the tuned RandomForestClassifier, so the report
# is labelled accordingly (the earlier "Decision Tree" wording was a mislabel).
print("Confusion Matrix of the Random Forest Model: \n {}".format(confusion_matrix(t_test, rfc_pred)))
print("Accuracy score of the Random Forest Model: \n{} %".format(round(accuracy_score(t_test, rfc_pred)*100,2)))

Confusion Matrix of the Decision Tree Model: 
 [[ 0  0  1  0  0  0]
 [ 0  0  4  2  0  0]
 [ 0  0 77 25  0  0]
 [ 0  0 17 68  6  0]
 [ 0  0  2 12 13  0]
 [ 0  0  0  2  0  0]]
Accuracy score of the Decision Tree Model: 
69.0 %


#### There is not much improvement in accuracy after hyperparameter tuning (69.0 % both before and after); however, the model performs well when compared to the decision tree classifier

### Binary Classification

In [22]:
# Collapse the quality score into two classes: 1 when quality >= 6, else 0.
target_binary = target.ge(6).astype('int64')

In [23]:
# Same 80/20 split and seed as before, now paired with the binary target.
f_train, f_test, t_train, t_test = train_test_split(
    classifiers,
    target_binary,
    test_size=0.2,
    random_state=101,
)

In [24]:
# Baseline for the binary task: default hyperparameters, pinned seed.
rfc = RandomForestClassifier(random_state=101)

# Fit on the binary training split; the fitted estimator is kept under rfc_mod.
rfc_mod = rfc.fit(f_train, t_train)

# Predict the binary label for the held-out rows.
rfc_pred = rfc_mod.predict(f_test)

In [25]:
# The predictions come from the RandomForestClassifier, so the report is
# labelled accordingly (the earlier "Decision Tree" wording was a mislabel).
print("Confusion Matrix of the Random Forest Model: \n {}".format(confusion_matrix(t_test, rfc_pred)))
print("Accuracy score of the Random Forest Model: \n{} %".format(round(accuracy_score(t_test, rfc_pred)*100,2)))

Confusion Matrix of the Decision Tree Model: 
 [[85 24]
 [22 98]]
Accuracy score of the Decision Tree Model: 
79.91 %


In [26]:
# Template estimator, search space and search object for the binary task;
# the grid mirrors the one used for the multi-class run.
rfc = RandomForestClassifier(random_state=101)
params = {
    'bootstrap': [True],
    'max_depth': [5, 8, 10, 15],
    'max_features': [2, 3, 4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10, 20, 30, 50],
}
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=2,
)

In [27]:
%%time
# Run the search on the binary training split only; the test split stays unseen.
grid_search.fit(f_train, t_train)

Fitting 4 folds for each of 720 candidates, totalling 2880 fits
Wall time: 2min 17s


GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=101),
             n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [5, 8, 10, 15],
                         'max_features': [2, 3, 4, 5, 6],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [10, 20, 30, 50]},
             scoring='accuracy', verbose=2)

In [28]:
# Display the best-scoring configuration found by the binary search.
grid_search.best_estimator_

RandomForestClassifier(max_depth=15, max_features=6, min_samples_leaf=5,
                       min_samples_split=8, n_estimators=20, random_state=101)

In [29]:
# Take the winning model straight from the search instead of retyping its
# hyperparameters by hand, which silently goes stale whenever the grid, the
# data or the seed changes. With the default refit=True, GridSearchCV has
# already refit this estimator on the whole training split, so no second
# fit is needed.
rfc = grid_search.best_estimator_
rfc_mod = rfc

# Prediction
rfc_pred = rfc_mod.predict(f_test)

In [30]:
# The predictions come from the tuned RandomForestClassifier, so the report
# is labelled accordingly (the earlier "Decision Tree" wording was a mislabel).
print("Confusion Matrix of the Random Forest Model: \n {}".format(confusion_matrix(t_test, rfc_pred)))
print("Accuracy score of the Random Forest Model: \n{} %".format(round(accuracy_score(t_test, rfc_pred)*100,2)))

Confusion Matrix of the Decision Tree Model: 
 [[ 83  26]
 [ 19 101]]
Accuracy score of the Decision Tree Model: 
80.35 %


#### The model performance improved slightly after hyperparameter tuning (accuracy 79.91 % → 80.35 %)

In [32]:
# Persist the fitted model currently bound to rfc_mod under a relative
# filename, so the file is written to the working directory.
dump(rfc_mod, 'wineclassification.joblib')

['wineclassification.joblib']