# Classification - Example
In this script we demonstrate what a Machine Learning workflow can look like when you use a train, validation and test set.

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Loading Data

In [2]:
# Load the full dataset object once, purely for inspection: the cells below read its
# description and target names. The modelling data is loaded separately further down.
iris = load_iris()

In [None]:
# Print the description text bundled with the dataset.
print(iris.DESCR)

In [None]:
# Class names of the target; these are reused as display labels later in the notebook.
iris.target_names

## Storing/Loading the data in the way it will be used

In [5]:
# Fetch features and target as pandas objects instead of plain arrays.
X, y = load_iris(return_X_y=True, as_frame=True)

# Restrict the model to the two sepal measurements.
selected_features = ['sepal length (cm)', 'sepal width (cm)']
X = X[selected_features]

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit an extra "None" line).
X.info()

In [None]:
# Series.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit an extra "None" line).
y.info()

# Train, Validation, and Test Set

In [8]:
# Step 1: hold out 20% of the data as the test set; it is only used for the final evaluation.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

# Step 2: split the remaining data again, keeping 30% of it as the validation set
# used to choose between models.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.3,
    random_state=36,
)

# EDA
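We are only allowed to explore and learn from the training data when creating our model. Looking at the validation or test data at this stage would leak information into model selection and evaluation.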

In [None]:
# Legend labels, listed in the order of the integer class codes (0, 1, 2).
classes = ['setosa', 'versicolor', 'virginica']

# Use an explicit Figure/Axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots()

# Colour each training point by its class code.
scatter = ax.scatter(
    X_train['sepal length (cm)'],
    X_train['sepal width (cm)'],
    c=y_train,
    cmap="viridis",
)
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Sepal Width (cm)')
ax.set_title('Scatter Plot of Sepal Length vs. Sepal Width (Train Data)')

# legend_elements() returns (handles, labels); only the handles are used and
# paired positionally with the class names above.
ax.legend(handles=scatter.legend_elements()[0], labels=classes)

# Render the figure explicitly so the cell does not echo the Legend object's repr.
plt.show()

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the training target.
y_train.head()

In [None]:
# Count the training samples per class to check how balanced the classes are.
# Class codes: 0 = 'setosa', 1 = 'versicolor', 2 = 'virginica'
y_train.value_counts()

# Training 2 different models

In [24]:
# Model 1: logistic regression with default settings, fitted on the training split only.
log_reg = LogisticRegression().fit(X_train, y_train)

# Predictions on the validation split, used below to compare the two models.
log_reg_pred = log_reg.predict(X_val)

In [25]:
# Model 2: decision tree. The seed is fixed because the tree permutes the features
# at every split; without it, equally good splits can be chosen differently from
# run to run, making the grid-search result non-reproducible.
tree_clf = DecisionTreeClassifier(random_state=42)

# Finding the best hyperparameter through GridSearch.
# Candidate depths; None puts no limit on the depth of the tree.
hyper_params = {'max_depth': [None, 1, 2, 5, 10]}
clf = GridSearchCV(tree_clf, hyper_params)

# The search cross-validates on the training split only; the validation split stays unseen.
clf.fit(X_train, y_train)

# Predictions on the validation split, used below to compare the two models.
clf_pred = clf.predict(X_val)

In [None]:
# Report the hyperparameters selected by the grid search ...
print(clf.best_params_)

# ... and show the cross-validation results for every candidate as a table.
cv_results = pd.DataFrame(clf.cv_results_)
cv_results

## Choosing the best model through validation set

In [None]:
# These names were obtained above
target_names_iris = ['setosa', 'versicolor', 'virginica']

# Confusion matrices of both candidate models on the validation set.
cm1 = confusion_matrix(y_val, log_reg_pred)
cm2 = confusion_matrix(y_val, clf_pred)

# Draw the two matrices side by side, each labelled with the model it belongs to.
fig, axs = plt.subplots(1, 2, figsize = (8, 4), layout='constrained')
panels = (
    ('Logistic Regression', cm1),
    ('Decision Tree (Grid Search)', cm2),
)

# A plain loop is used instead of a list comprehension: the calls are made only for
# their side effects, and a comprehension would echo [None, None] as the cell output.
for ax, (title, cm) in zip(axs, panels):
    ConfusionMatrixDisplay(cm, display_labels = target_names_iris).plot(ax=ax)
    ax.set_title(title)
    # Turn off the grid lines, which would otherwise cut through the matrix cells.
    ax.grid(False)

plt.show()

In [None]:
# Classification report of the logistic regression on the validation set.
print(classification_report(y_val, log_reg_pred, target_names=target_names_iris))

In [None]:
# Classification report of the grid-searched decision tree on the validation set.
print(classification_report(y_val, clf_pred, target_names=target_names_iris))

# Evaluating chosen model through test set

The results are extremely good since this is a "toy dataset". In reality we do not expect numbers that are as perfect as those below. 

In [39]:
# The chosen model is refitted on train + validation data combined,
# so that it learns from all the data that is not part of the test set.
log_reg_final = LogisticRegression()
log_reg_final.fit(X_train_full, y_train_full)

# Predictions on the held-out test set, used for the final evaluation.
pred_test = log_reg_final.predict(X_test)

In [None]:
# Confusion matrix of the final model on the test set.
cm_test = confusion_matrix(y_test, pred_test)
test_display = ConfusionMatrixDisplay(cm_test, display_labels=target_names_iris)

# Draw it on a dedicated, slightly larger figure without grid lines.
fig, ax = plt.subplots(figsize=(7, 6))
test_display.plot(ax=ax)
ax.grid(False)

In [None]:
# Classification report of the final model on the test set.
print(classification_report(y_test, pred_test, target_names=target_names_iris))