In [166]:
import pandas as pd
import numpy as np
from pandas import read_csv
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, confusion_matrix, accuracy_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [167]:
data=pd.read_csv('WineQT.csv')

In [168]:
data=data.drop('Id', axis=1)

In [169]:
data.skew().sort_values()

density                 0.102395
pH                      0.221138
quality                 0.286792
citric acid             0.371561
volatile acidity        0.681547
alcohol                 0.863313
fixed acidity           1.044930
free sulfur dioxide     1.231261
total sulfur dioxide    1.665766
sulphates               2.497266
residual sugar          4.361096
chlorides               6.026360
dtype: float64

In [170]:
data['fixed acidity']=data['fixed acidity'].apply(np.log)
data['total sulfur dioxide']=data['total sulfur dioxide'].apply(np.log)
data['sulphates']=data['sulphates'].apply(np.log)
data['free sulfur dioxide']=data['free sulfur dioxide'].apply(np.log)
data['chlorides']=data['chlorides'].apply(np.log)
data['residual sugar']=data['residual sugar'].apply(np.log)

In [171]:
data.quality.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [172]:
X=data.drop('quality', axis=1)
y=data['quality']

In [173]:
sm = SMOTE(sampling_strategy='auto', random_state=42)
X,y=sm.fit_resample(X,y)

In [182]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.20, random_state=42)

In [183]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))

In [184]:
results_c = []
names_c = []

for name, model in models:
    # define how to split off validation data ('kfold' how many folds)
    cv = KFold(n_splits=10, shuffle=True, random_state=42)  
    # train the model
    cv_results = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')    
    results_c.append(cv_results)
    names_c.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.577653 (0.014329)
KNN: 0.782143 (0.029010)
CART: 0.767467 (0.032226)
NB: 0.519408 (0.026993)
RF: 0.848151 (0.020087)


In [185]:
RF = RandomForestClassifier(n_estimators=200,max_depth=15,random_state=42)
RF.fit(X_train, y_train)

RandomForestClassifier(max_depth=15, n_estimators=200, random_state=42)

In [186]:
y_pred = RF.predict(X_test)

In [187]:
accuracy_score(y_test, y_pred)

0.8793103448275862

In [188]:
confusion_matrix(y_test,y_pred)

array([[97,  0,  0,  0,  0,  0],
       [ 0, 90,  2,  0,  0,  0],
       [ 0,  2, 77, 25,  4,  0],
       [ 0,  4, 15, 63, 13,  0],
       [ 0,  0,  1,  4, 91,  0],
       [ 0,  0,  0,  0,  0, 92]], dtype=int64)

In [189]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       1.00      1.00      1.00        97
           4       0.94      0.98      0.96        92
           5       0.81      0.71      0.76       108
           6       0.68      0.66      0.67        95
           7       0.84      0.95      0.89        96
           8       1.00      1.00      1.00        92

    accuracy                           0.88       580
   macro avg       0.88      0.88      0.88       580
weighted avg       0.88      0.88      0.88       580

