ML model fitting with various hyperparameter tuning methods

Data taken from https://archive.ics.uci.edu/ml/datasets/Raisin+Dataset

In [27]:
# import packages needed
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from tpot import TPOTClassifier

In [28]:
# load and inspect data
data = pd.read_excel('raisin_dataset.xlsx')
print(data.head())

# set up input and output (Kecimen=0, Besni=1) variables, and training/test data
features = data.drop('Class', axis = 1)
y = data['Class'].map({'Kecimen':0, 'Besni':1})
mm_scaler = MinMaxScaler()
X = mm_scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 99)

    Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0  87524       442.246011       253.291155      0.819738       90546   
1  75166       406.690687       243.032436      0.801805       78789   
2  90856       442.267048       266.328318      0.798354       93717   
3  45928       286.540559       208.760042      0.684989       47336   
4  79408       352.190770       290.827533      0.564011       81463   

     Extent  Perimeter    Class  
0  0.758651   1184.040  Kecimen  
1  0.684130   1121.786  Kecimen  
2  0.637613   1208.575  Kecimen  
3  0.699599    844.162  Kecimen  
4  0.792772   1073.251  Kecimen  


In [29]:
# create support vector machine classifier model and grid search for hyperparameters
svm = SVC()
parameters = {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [1, 10, 100]}
grid = GridSearchCV(svm, parameters)
grid.fit(X_train, y_train)
print(grid.best_estimator_)

# print grid of search results 
results = pd.concat([pd.DataFrame(grid.cv_results_['params']), pd.DataFrame(grid.cv_results_['mean_test_score'], columns=['Score'])], axis=1)
cv_table = results.pivot(index='kernel', columns='C')
print(cv_table)

# performance score on test data
print(f"SVM test score: {grid.score(X_test, y_test)}")

SVC(C=1, kernel='linear')
            Score                    
C             1         10        100
kernel                               
linear   0.869841  0.869841  0.866667
rbf      0.869841  0.865079  0.869841
sigmoid  0.311111  0.277778  0.273016
Test score: 0.8555555555555555


In [31]:
# use Bayesian optimization to tune hyperparameters
search_spaces = {'kernel': Categorical(['linear', 'rbf', 'sigmoid']), 'C': Real(1, 100, prior='uniform')}
bayes = BayesSearchCV(svm, search_spaces, n_iter = 10)
bayes.fit(X_train, y_train)
print(bayes.best_estimator_)
print(f"Bayes test score: {bayes.score(X_test, y_test)}")

# use tree-based pipeline optimization tool
tpot = TPOTClassifier(generations = 2, population_size = 20)
tpot.fit(X_train, y_train)
print(f"TPOT test score: {tpot.score(X_test, y_test)}")
tpot.export('tpot_pipeline.py')

SVC(C=2.34797934003766)
Bayes test score: 0.8629629629629629
TPOT test score: 0.8777777777777778


