# Classification - hyperparameter tuning


## Imports



In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

### Reading in the data




In [143]:
df = pd.read_csv('https://raw.githubusercontent.com/Explore-AI/Public-Data/master/Data/classification_sprint/winequality.csv')
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


##  Data preprocessing



In [144]:
### START FUNCTION
from sklearn.preprocessing import StandardScaler

def data_preprocess(df):
    """
    Function to preprocess the data for modeling.

    Parameters:
    df (DataFrame): The input DataFrame containing the dataset.

    Returns:
    tuple: A tuple containing train and test data in the format (X_train, y_train), (X_test, y_test).
    """
    # Make a copy of the dataframe to avoid modifying the original data
    df1 = df.copy()
    
    # Convert 'quality' column to binary labels (0 for quality <= 4, 1 for quality >= 5)
    df1.loc[df1['quality'] <= 4, 'quality'] = 0
    df1.loc[df1['quality'] >= 5, 'quality'] = 1
    
    # Fill missing values with 0
    df1 = df1.fillna(0)

    # Separate features and target variable
    X = df1.drop('quality', axis=1)
    y = df1['quality']
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
    
    # Convert target variables to arrays
    y_train = y_train.values
    y_test = y_test.values

    return (X_train, y_train), (X_test, y_test)

### END FUNCTION

In [145]:
(X_train, y_train), (X_test, y_test) = data_preprocess(df)

In [146]:
print(X_train[:2])
print(y_train[:2])
print(X_test[:2])
print(y_test[:2])

[[-0.57136659  0.07127869 -0.48054096  1.17914161 -0.09303318 -0.79974133
   0.0830898  -0.15472329 -0.36573452  0.13010447  0.06101473  0.25842195]
 [-0.57136659  1.50396711 -0.72301571  0.56008035 -0.63948302 -0.05776881
  -0.70572997  0.62379657  0.16787589 -0.86828773 -0.47467813 -0.99931317]]
[1 0]
[[-0.57136659 -0.15493527 -0.54115965  0.90400327 -0.66050032 -0.31460545
   0.53384396  0.03990667 -1.35291379 -0.26925241 -0.34075491  1.18076103]
 [-0.57136659  0.29749266 -1.20796522  2.8987562  -0.80762143 -0.45729248
  -0.19863155 -0.22549783 -1.03274754 -0.7185289  -0.87644778  0.25842195]]
[1 1]


##  Model training


In [147]:
### START FUNCTION

def train_SVC_model(X_train, y_train):
    """
    Function to train a Support Vector Classifier (SVC) model.

    Parameters:
    X_train (array-like): Training features.
    y_train (array-like): Training target variable.

    Returns:
    SVC: Trained SVC model.
    """
    # Initialize SVC model with specified parameters
    model = SVC(random_state=40, gamma='auto')
    
    # Fit the model to the training data
    model.fit(X_train, y_train)
    
    return model


### END FUNCTION

In [148]:
svc = train_SVC_model(X_train,y_train)
svc.classes_

array([0, 1], dtype=int64)

In [149]:
### START FUNCTION

def custom_scoring_function(y_true, y_pred):
    """
    Custom scoring function to calculate log loss.

    Parameters:
    y_true (array-like): True labels.
    y_pred (array-like): Predicted probabilities.

    Returns:
    float: Log loss value rounded to 7 decimal places.
    """
    # Set a small value to prevent log(0) error
    epsilon = 1e-15
    
    # Clip predicted probabilities to avoid extreme values
    y_pred = np.maximum(epsilon, y_pred)
    y_pred = np.minimum(1 - epsilon, y_pred)
    
    # Calculate log loss
    loss = - np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    
    return round(loss, 7)


### END FUNCTION

In [150]:
y_pred = svc.predict(X_test)
print('Log Loss value: ', custom_scoring_function(y_test, y_pred))
print('Accuracy: ',round(accuracy_score(y_test,y_pred),4))

Log Loss value:  1.2540518
Accuracy:  0.9637


## Hyperparameter optimisation

###  Getting model parameters


In [151]:
### START FUNCTION
def get_model_hyperparams(model):
    """
    Function to retrieve hyperparameters of a given model.

    Parameters:
    model (estimator): A scikit-learn model object.

    Returns:
    list: List of hyperparameters for the given model.
    """
    # Retrieve hyperparameters of the model
    hyperparams = model.get_params().keys()
    
    # Convert to a list and return
    return list(hyperparams)


### END FUNCTION

In [152]:
get_model_hyperparams(svc)

['C',
 'break_ties',
 'cache_size',
 'class_weight',
 'coef0',
 'decision_function_shape',
 'degree',
 'gamma',
 'kernel',
 'max_iter',
 'probability',
 'random_state',
 'shrinking',
 'tol',
 'verbose']

###  Hyperparameter search


In [153]:
### START FUNCTION
def tune_SVC_model(X_train, y_train):
    """
    Tune the parameters of a Support Vector Classifier (SVC) using grid search.
    
    Parameters:
        X_train (array-like): Training data features.
        y_train (array-like): Training data labels.
        
    Returns:
        GridSearchCV: A GridSearchCV object containing the tuned SVC model.
    """
    # Define the grid of hyperparameters to search over
    param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
    
    # Define a custom scorer for GridSearchCV
    custom_scorer = make_scorer(custom_scoring_function, greater_is_better=False)
    
    # Perform grid search with cross-validation
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring=custom_scorer)
    grid_search.fit(X_train, y_train)

    return grid_search
    
# Tune the SVC model using the provided training data
svc_tuned = tune_SVC_model(X_train, y_train)

### END FUNCTION

In [154]:
y_pred = svc.predict(X_test)
print('Log Loss value: ',custom_scoring_function(y_test,y_pred))
print('Accuracy: ',round(accuracy_score(y_test,y_pred),4))

Log Loss value:  1.2540518
Accuracy:  0.9637


### Optimal model parameters


In [155]:
### START FUNCTION
def get_best_params(model):
    """
    Function to extract the best hyperparameters from a GridSearchCV object.

    Parameters:
    model (GridSearchCV): A GridSearchCV object containing the results of hyperparameter tuning.

    Returns:
    dict: A dictionary containing the optimal hyperparameters found by the grid search.
    """
    # Retrieve the best estimator from the grid search
    best_model = model.best_estimator_
    
    # Get the parameters of the best estimator
    best_params = best_model.get_params()
    
    return best_params

### END FUNCTION

In [156]:
get_best_params(svc_tuned)

{'C': 1,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 1,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}