# Red wine quality project: analyzed models
## Jakub Kosterna, Bartosz Siński, Jan Smoleń

### Packages load & generation seed declaration

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

import pickle

np.random.seed = 42

import warnings
warnings.filterwarnings('ignore')

### Data and train-test split

In [2]:
df_wines = pd.read_csv('../data/winequality-red.csv')
df_wines["is_good"] = df_wines.apply(lambda row: 1 if row.quality > 5 else 0, axis = 1)

X = df_wines.drop(["is_good", 'quality'], axis=1)
y = df_wines["is_good"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

### X & y data saves

In [3]:
X_train.to_csv("../data/X_train.csv")
X_test.to_csv("../data/X_test.csv")
y_train.to_csv("../data/y_train.csv")
y_test.to_csv("../data/y_test.csv")

### 1. XGBoost

In [4]:
gbm_param_grid = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30] ,
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2 , 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

gbm = xgb.XGBClassifier(objective = "binary:logistic", eval_metric = "logloss", use_label_encoder = False, seed = 42)

rs_gbm = RandomizedSearchCV(param_distributions = gbm_param_grid,
    estimator = gbm,
    cv = 4,
    n_iter = 2000)

rs_gbm.fit(X_train, y_train)
rs_gbm.best_params_

{'min_child_weight': 1,
 'max_depth': 12,
 'learning_rate': 0.05,
 'gamma': 0.2,
 'colsample_bytree': 0.7}

In [5]:
y_pred_gbm = rs_gbm.predict(X_test)
accuracy_score(y_pred_gbm, y_test)

0.805

In [6]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_gbm).ravel()
pd.DataFrame({"Actual positives": [tp, fp], "Actual negatives": [fn, tn]}, index = ["Positive predictions",
                                                                                    "Negative predictions"])

Unnamed: 0,Actual positives,Actual negatives
Positive predictions,179,35
Negative predictions,43,143


In [7]:
pickle.dump(rs_gbm, open("xgb.pickle", 'wb'))

### 2. Support Vector Machine

In [8]:
cpar = []
gpar = []
for i in range(-4, 5):
    cpar.append(10**i)
for i in range(-4, 5):
    gpar.append(10**i)
gpar.append("auto")
gpar.append("scale")
params = [{'C': cpar,
        'kernel': ["rbf"], #"poly", "linear"], ## >24 hours waiting time
        'gamma': gpar}]
svm_tuned = SVC(random_state = 42)

gs_svm = GridSearchCV(svm_tuned, param_grid = params, scoring = 'accuracy', cv = 4, n_jobs = 2)
gs_svm.fit(X_train, y_train)
gs_svm.best_params_

{'C': 10000, 'gamma': 0.0001, 'kernel': 'rbf'}

In [9]:
y_pred_svm = gs_svm.predict(X_test)
accuracy_score(y_pred_svm, y_test)

0.7475

In [10]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_svm).ravel()
pd.DataFrame({"Actual positives": [tp, fp], "Actual negatives": [fn, tn]}, index = ["Positive predictions",
                                                                                    "Negative predictions"])

Unnamed: 0,Actual positives,Actual negatives
Positive predictions,151,63
Negative predictions,38,148


In [11]:
pickle.dump(gs_svm, open("svm.pickle", 'wb'))

### 3. Random Forest

In [12]:
rfc = RandomForestClassifier(random_state = 42)
grid = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
rs_rfc = RandomizedSearchCV(estimator = rfc, param_distributions = grid, cv = 4, n_iter = 20, random_state = 1613)
rs_rfc.fit(X_train, y_train)
rs_rfc.best_params_

{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [13]:
y_pred_rfc = rs_rfc.predict(X_test)
accuracy_score(y_pred_rfc, y_test)

0.785

In [14]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rfc).ravel()
pd.DataFrame({"Actual positives": [tp, fp], "Actual negatives": [fn, tn]}, index = ["Positive predictions",
                                                                                    "Negative predictions"])

Unnamed: 0,Actual positives,Actual negatives
Positive predictions,170,44
Negative predictions,42,144


In [15]:
pickle.dump(rs_rfc, open("rfc.pickle", 'wb'))

### 4. Gradient Boosting

In [16]:
gbc = GradientBoostingClassifier(random_state = 42)
gbc_param_grid = {
    "n_estimators": [5, 50, 250, 500],
    "max_depth": [1, 3, 5, 7, 9],
    "learning_rate": [0.01, 0.1, 1, 10, 100]
}
gs_gbc = GridSearchCV(gbc, gbc_param_grid, cv = 5)
gs_gbc.fit(X_train, y_train)
gs_gbc.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50}

In [17]:
y_pred_gbc = gs_gbc.predict(X_test)
accuracy_score(y_pred_gbc, y_test)

0.8075

In [18]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_gbc).ravel()
pd.DataFrame({"Actual positives": [tp, fp], "Actual negatives": [fn, tn]}, index = ["Positive predictions",
                                                                                    "Negative predictions"])

Unnamed: 0,Actual positives,Actual negatives
Positive predictions,176,38
Negative predictions,39,147


In [19]:
pickle.dump(gs_gbc, open("gbc.pickle", 'wb'))

### 5. Model results summary

In [20]:
results = {
    "algorithm" : ['XGBoost', 'Support Vector Machine', 'Random Forest', 'Gradient Boosting'],
    "accuracy" : [accuracy_score(y_test, y_pred_gbm), accuracy_score(y_test, y_pred_svm),
                  accuracy_score(y_test, y_pred_rfc), accuracy_score(y_test, y_pred_gbc)],
    "precision" : [precision_score(y_test, y_pred_gbm), precision_score(y_test, y_pred_svm),
                   precision_score(y_test, y_pred_rfc), precision_score(y_test, y_pred_gbc)],
    "recall" : [recall_score(y_test, y_pred_gbm), recall_score(y_test, y_pred_svm),
                recall_score(y_test, y_pred_rfc), recall_score(y_test, y_pred_gbc)],
    'ROC AUC' : [roc_auc_score(y_test, y_pred_gbm), roc_auc_score(y_test, y_pred_svm),
                 roc_auc_score(y_test, y_pred_rfc), roc_auc_score(y_test, y_pred_gbc)]
}
pd.DataFrame(results)

Unnamed: 0,algorithm,accuracy,precision,recall,ROC AUC
0,XGBoost,0.805,0.806306,0.836449,0.802633
1,Support Vector Machine,0.7475,0.798942,0.705607,0.750653
2,Random Forest,0.785,0.801887,0.794393,0.784293
3,Gradient Boosting,0.8075,0.818605,0.82243,0.806376
