# Prebuilt Model Analysis

## Evaluating Model

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn import metrics

#Load dataset
df = pd.read_csv('WineQuality.csv')

# Drop redundant columns
df.drop(columns=['total sulfur dioxide'])

# Convert type to numeric
type_to_numeric = {'white': 0, 'red': 1}
df['type'] = df['type'].map(type_to_numeric)

# Create binary classification target
df['best quality'] = [1 if x >= 6 else 0 for x in df['quality']]

#Split dataset into features and target
feature = df.drop(columns=['quality', 'best quality'])
target = df['best quality']

xtrain, xtest, ytrain, ytest = train_test_split(feature, target, test_size=0.2, random_state=27)

# Imputation of missing values
imputer = SimpleImputer(strategy='mean')
xtrain = imputer.fit_transform(xtrain)
xtest = imputer.transform(xtest)

# Normalization of data
norm = MinMaxScaler()
xtrain = norm.fit_transform(xtrain)
xtest = norm.transform(xtest)

# Model training and evaluation
models = [LogisticRegression(), XGBClassifier(), SVC(kernel='rbf')]

for i in range(3):
    models[i].fit(xtrain, ytrain)

    print(f'{models[i]} : ')
    print('Training Accuracy : ', metrics.roc_auc_score(ytrain, models[i].predict(xtrain)))
    print('Validation Accuracy : ', metrics.roc_auc_score(
        ytest, models[i].predict(xtest)))
    print()


LogisticRegression() : 
Training Accuracy :  0.7086603183794197
Validation Accuracy :  0.6789752079463781

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...) : 
Training Accuracy :  0.9853672394377359
Validation Accuracy :  0.7892423080029072

SVC() : 
Training Accuracy :  0.73748772907327

XGB has a very high accuracy in training but then low in validation - possibly overfitting? 
Logistic Regression has the least drop in accuracy between training and validation

In [None]:
print(metrics.classification_report(ytest,models[1].predict(xtest)))

              precision    recall  f1-score   support

           0       0.77      0.70      0.74       488
           1       0.83      0.88      0.85       812

    accuracy                           0.81      1300
   macro avg       0.80      0.79      0.79      1300
weighted avg       0.81      0.81      0.81      1300

