In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
import ssl

# SECURITY: this does not configure a CA bundle. It replaces the factory that
# builds the default HTTPS context with the *unverified* one, so HTTPS requests
# relying on that default context skip certificate verification.
# NOTE(review): both names are private `ssl` attributes; presumably a workaround
# for a local certificate error when downloading the CSV — confirm it is still
# needed and remove it if the download works without it.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Location of the wine-quality CSV used throughout this notebook.
WINE_QUALITY_URL = 'https://raw.githubusercontent.com/Explore-AI/Public-Data/master/Data/classification_sprint/winequality.csv'

df = pd.read_csv(WINE_QUALITY_URL)
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


### Data Processing
Convert the quality for lower quality wines (quality less than or equal to 4) to 0

Convert the quality for higher quality wines (quality greater than or equal to 5) to 1

Split the data into 75% training and 25% testing data

Set `random_state=42` in the call to `train_test_split` so that the split is reproducible.


In [3]:
def data_preprocess(df, quality_threshold=5, test_size=0.25, random_state=42):
    """Turn the raw wine table into standardized train/test arrays.

    Steps: replace missing values with 0, binarize ``quality`` (1 when
    ``quality >= quality_threshold``, otherwise 0), standardize every
    remaining column with ``StandardScaler`` and split the rows into a
    training and a testing part. The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``quality`` column; all other columns are used as
        features and must be numeric for the scaler to accept them.
    quality_threshold : int or float, default 5
        Lowest quality score that is labelled 1.
    test_size : float, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test))``.
    """
    filled = df.fillna(0)

    # Vectorized binarization; equivalent to mapping each row through
    # ``1 if x >= threshold else 0``.
    y = (filled['quality'] >= quality_threshold).astype('int64').to_numpy()
    features = filled.drop(columns='quality')

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the mean/variance applied to the training rows.
    # Kept as-is so the outputs recorded in this notebook stay valid; consider
    # fitting on the training rows only.
    X = preprocessing.StandardScaler().fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return (X_train, y_train), (X_test, y_test)

In [4]:
# Build the train/test arrays once; later cells reuse these four names.
(X_train, y_train), (X_test, y_test) = data_preprocess(df)

# Preview the first two entries of each array, one array per line group.
print(X_train[:2], y_train[:2], X_test[:2], y_test[:2], sep="\n")

[[-0.57136659  0.07127869 -0.48054096  1.17914161 -0.09303318 -0.79974133
   0.0830898  -0.15472329 -0.36573452  0.13010447  0.06101473  0.25842195]
 [-0.57136659  1.50396711 -0.72301571  0.56008035 -0.63948302 -0.05776881
  -0.70572997  0.62379657  0.16787589 -0.86828773 -0.47467813 -0.99931317]]
[1 0]
[[-0.57136659 -0.15493527 -0.54115965  0.90400327 -0.66050032 -0.31460545
   0.53384396  0.03990667 -1.35291379 -0.26925241 -0.34075491  1.18076103]
 [-0.57136659  0.29749266 -1.20796522  2.8987562  -0.80762143 -0.45729248
  -0.19863155 -0.22549783 -1.03274754 -0.7185289  -0.87644778  0.25842195]]
[1 1]


In [5]:
# Show the first two rows of the training features again
# (same expression as the first print in the previous cell).
print(X_train[:2])

[[-0.57136659  0.07127869 -0.48054096  1.17914161 -0.09303318 -0.79974133
   0.0830898  -0.15472329 -0.36573452  0.13010447  0.06101473  0.25842195]
 [-0.57136659  1.50396711 -0.72301571  0.56008035 -0.63948302 -0.05776881
  -0.70572997  0.62379657  0.16787589 -0.86828773 -0.47467813 -0.99931317]]


In [6]:
# Model Training
def train_SVC_model(X_train,y_train):
    """Fit and return an ``SVC`` built with ``gamma='auto'`` and ``random_state=40``.

    Parameters
    ----------
    X_train : training features passed straight to ``SVC.fit``.
    y_train : training labels passed straight to ``SVC.fit``.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly from the chained call.
    return SVC(gamma='auto', random_state=40).fit(X_train, y_train)

In [7]:
# Fit the baseline classifier on the training split, then display the class
# labels stored on the fitted model.
svc = train_SVC_model(X_train,y_train)
svc.classes_

array([0, 1])

In [8]:
# Model Testing
def custom_scoring_function(y_true, y_pred):
    """Binary log loss between ``y_true`` and ``y_pred``, rounded to 7 decimals.

    Predictions are first clamped to ``[1e-15, 1 - 1e-15]`` so that neither
    logarithm is evaluated at zero. The summed log-likelihood is negated and
    divided by ``len(y_true)``.

    Parameters
    ----------
    y_true : array-like of 0/1 targets.
    y_pred : array-like of predictions, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        The averaged loss; lower is better.
    """
    tiny = 1e-15
    # Clamp from below, then from above.
    clamped = np.minimum(1 - tiny, np.maximum(tiny, y_pred))

    # Contribution of the positive class and of the negative class.
    positive_term = y_true * np.log(clamped)
    negative_term = np.subtract(1, y_true) * np.log(np.subtract(1, clamped))

    total = np.sum(positive_term + negative_term)
    mean_loss = total * -1.0 / len(y_true)
    return np.float64(round(mean_loss, 7))

In [9]:
# Evaluate the baseline model on the held-out split.
# NOTE(review): custom_scoring_function is given the output of predict(), not
# probabilities; with hard 0/1 labels the value is presumably just a rescaled
# error rate — confirm this is the intended metric.
y_pred = svc.predict(X_test)
print('Log Loss value: ', custom_scoring_function(y_test, y_pred))
print('Accuracy: ',round(accuracy_score(y_test,y_pred),4))

Log Loss value:  1.2540518
Accuracy:  0.9637


### Hyperparameter Optimization

In [10]:
# Getting Model Parameters
def get_model_hyperparams(model):
    """Return the names of the parameters reported by ``model.get_params()``.

    Parameters
    ----------
    model : any object exposing ``get_params()`` that returns a mapping.

    Returns
    -------
    list
        The mapping's keys, in the mapping's own iteration order.
    """
    return [name for name in model.get_params()]

In [11]:
# List the parameter names of a default-constructed SVC.
get_model_hyperparams(SVC())

['C',
 'break_ties',
 'cache_size',
 'class_weight',
 'coef0',
 'decision_function_shape',
 'degree',
 'gamma',
 'kernel',
 'max_iter',
 'probability',
 'random_state',
 'shrinking',
 'tol',
 'verbose']

In [12]:
# Hyperparameter Search
def tune_SVC_model(X_train, y_train, param_grid=None, cv=5):
    """Grid-search an ``SVC`` using ``custom_scoring_function`` as the criterion.

    Parameters
    ----------
    X_train : training features passed to ``GridSearchCV.fit``.
    y_train : training labels passed to ``GridSearchCV.fit``.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. When omitted, ``C`` is searched over
        ``[0.1, 1, 10]`` and ``gamma`` over ``[0.01, 0.1, 1]``.
    cv : int, default 5
        Cross-validation setting handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if param_grid is None:
        # Built per call so the default is never a shared mutable object.
        param_grid = {
            'C': [0.1, 1, 10],
            'gamma': [0.01, 0.1, 1],
        }

    # custom_scoring_function is a loss (lower is better), hence
    # greater_is_better=False.
    scorer = make_scorer(custom_scoring_function, greater_is_better=False)

    search = GridSearchCV(SVC(), param_grid, scoring=scorer, cv=cv)
    search.fit(X_train, y_train)
    return search

In [13]:
# Run the hyperparameter search on the training split and keep the object
# returned by tune_SVC_model for the evaluation cells below.
svc_tuned = tune_SVC_model(X_train, y_train)

In [14]:
# Evaluate the tuned model on the held-out split with the same two metrics
# used for the baseline. Note that this rebinds `y_pred`, replacing the
# baseline predictions computed in the earlier evaluation cell.
y_pred = svc_tuned.predict(X_test)
print('Log Loss value: ',custom_scoring_function(y_test,y_pred))
print('Accuracy: ',round(accuracy_score(y_test,y_pred),4))

Log Loss value:  1.2115421
Accuracy:  0.9649


In [15]:
# Optimal model parameters
def get_best_params(model):
    """Return the ``best_params_`` attribute of ``model``.

    Parameters
    ----------
    model : object exposing ``best_params_``.

    Returns
    -------
    Whatever object is stored in ``model.best_params_`` (not a copy).
    """
    best = model.best_params_
    return best

In [16]:
# Display the parameter combination stored on the tuned search object.
get_best_params(svc_tuned)

{'C': 1, 'gamma': 1}