# Imoport the libraries

In [146]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# ignore warnings
import warnings

warnings.filterwarnings("ignore")



# Import data

In [147]:
#  load data

iris_df = pd.read_csv('../data/iris.csv')

#  Feature engineering

There isn't much to do to transfrom the data, what we can do is to remove the highly correlated features.

# Model training

We should first choose our model, We will try three different models: SVM, Logistic Regression and KNN.

In [148]:
# train test split

X = iris_df.drop(['species'], axis=1)

y = iris_df['species']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(120, 4) (30, 4) (120,) (30,)


In [149]:
def print_class_scores(y_test, y_pred):
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
    print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
    print('F1: ', f1_score(y_test, y_pred, average='weighted'))

## 1. SVM

In [150]:
# Model

model = SVC(kernel='linear')

steps = [('scaler', StandardScaler()), ('model', model)]

pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

# evaluate on train set

y_pred = pipeline.predict(X_train)

print('Train set scores:')

print_class_scores(y_train, y_pred)

# evaluate on test set

y_pred = pipeline.predict(X_test)

print('Test set scores:')

print_class_scores(y_test, y_pred)

Train set scores:
Accuracy:  0.975
Precision:  0.9751928288513655
Recall:  0.975
F1:  0.9749960931395532
Test set scores:
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1:  1.0


Those are great results, let's see if we can improve them with a randomized search.

In [151]:
# randomized search

from sklearn.model_selection import RandomizedSearchCV

# define parameter grid

# params to tune for svm

param_grid = {
    'model__C': [0.1, 1, 10],            # Regularization parameter
    'model__kernel': ['linear', 'rbf'],  # Kernel type
    'model__gamma': [0.001, 0.01, 0.1],  # Kernel coefficient (for 'rbf' and 'poly')
    'model__degree': [2, 3, 4],          # Polynomial degree (for 'poly')
    'model__coef0': [0, 1, 2],           # Coefficient 0 (for 'poly' and 'sigmoid')
    'model__class_weight': [None, 'balanced'],  # Class weights
}

# define randomized search

random_search = RandomizedSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy', n_iter=30)

# fit model

random_search.fit(X_train, y_train)

# train set scores

y_pred = random_search.predict(X_train)

print('Train set scores:')

print_class_scores(y_train, y_pred)

# test set scores

y_pred = random_search.predict(X_test)

print('Test set scores:')

print_class_scores(y_test, y_pred)

# best params

random_search.best_params_


Fitting 3 folds for each of 30 candidates, totalling 90 fits


2988.35s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2988.58s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2988.81s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2989.04s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2989.28s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2989.50s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2989.73s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2989.96s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2990.20s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2990.43s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
2990.67s - pydevd: Sending message related to proc

Train set scores:
Accuracy:  0.975
Precision:  0.9751928288513655
Recall:  0.975
F1:  0.9749960931395532
Test set scores:
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1:  1.0


{'model__kernel': 'linear',
 'model__gamma': 0.001,
 'model__degree': 2,
 'model__coef0': 2,
 'model__class_weight': None,
 'model__C': 1}

We still have the same results, so probably that's the best we can do with a SVM.

## 2.Logistic regression

In [152]:
#  Logistic Regression

from sklearn.linear_model import LogisticRegression

# define model

model = LogisticRegression()

steps = [('scaler', StandardScaler()), ('model', model)]

pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

# evaluate on train set

y_pred = pipeline.predict(X_train)

print('Train set scores:')

print_class_scores(y_train, y_pred)

# evaluate on test set

y_pred = pipeline.predict(X_test)

print('Test set scores:')

print_class_scores(y_test, y_pred)



Train set scores:
Accuracy:  0.9583333333333334
Precision:  0.9585157390035438
Recall:  0.9583333333333334
F1:  0.9583268218992551
Test set scores:
Accuracy:  0.9333333333333333
Precision:  0.9333333333333333
Recall:  0.9333333333333333
F1:  0.9333333333333333


We have good results even with a simple logistic regression, let's see if we can improve them with a randomized search.

In [153]:
# randomized search

from sklearn.model_selection import RandomizedSearchCV

# define parameter grid logistic regression

param_grid = {
    'model__penalty': ['l1', 'l2'],  # Regularization type
    'model__C': [0.1, 1, 10],        # Regularization parameter
    'model__class_weight': [None, 'balanced'],  # Class weights
}

# define randomized search

random_search = RandomizedSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy', n_iter=10)

# fit model

random_search.fit(X_train, y_train)

# train set scores

y_pred = random_search.predict(X_train)

print('Train set scores:')

print_class_scores(y_train, y_pred)

# test set scores

y_pred = random_search.predict(X_test)

print('Test set scores:')

print_class_scores(y_test, y_pred)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Train set scores:
Accuracy:  0.975
Precision:  0.9751928288513655
Recall:  0.975
F1:  0.9749960931395532
Test set scores:
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1:  1.0


Yes we could improve the results.

## 2.KNN

In [154]:
# Knn

from sklearn.neighbors import KNeighborsClassifier

# define model

model = KNeighborsClassifier(n_neighbors=3)

steps = [('scaler', StandardScaler()), ('model', model)]

pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

# evaluate on train set

y_pred = pipeline.predict(X_train)

print('Train set scores:')

print_class_scores(y_train, y_pred)

# evaluate on test set

y_pred = pipeline.predict(X_test)

print('Test set scores:')

print_class_scores(y_test, y_pred)


Train set scores:
Accuracy:  0.9583333333333334
Precision:  0.9599832390530064
Recall:  0.9583333333333334
F1:  0.9582746570698378
Test set scores:
Accuracy:  0.9333333333333333
Precision:  0.9444444444444445
Recall:  0.9333333333333333
F1:  0.9326599326599326


We have good results even with a KNN, let's see if we can improve them with a randomized search.

In [155]:
# randomized search

from sklearn.model_selection import RandomizedSearchCV

# define parameter grid knn

param_grid = {
    'model__n_neighbors': [3, 5, 7],  # Number of neighbors
    'model__weights': ['uniform', 'distance'],  # Weight function
    'model__p': [1, 2],  # Power parameter for the Minkowski metric
}

# define randomized search

random_search = RandomizedSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy', n_iter=10)

# fit model

random_search.fit(X_train, y_train)

# train set scores

y_pred = random_search.predict(X_train)

print('Train set scores:')

print_class_scores(y_train, y_pred)

# test set scores

y_pred = random_search.predict(X_test)

print('Test set scores:')

print_class_scores(y_test, y_pred)



Fitting 3 folds for each of 10 candidates, totalling 30 fits
Train set scores:
Accuracy:  0.9583333333333334
Precision:  0.9585157390035438
Recall:  0.9583333333333334
F1:  0.9583268218992551
Test set scores:
Accuracy:  0.9666666666666667
Precision:  0.9696969696969696
Recall:  0.9666666666666667
F1:  0.9665831244778613


We improved the results a little bit.