In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split


## Data

In [None]:
# Source data: the Iris flower dataset
# (https://en.wikipedia.org/wiki/Iris_flower_data_set).
# Standard-normal noise is optionally added to every feature column so the
# classes are harder to separate.

FTS = [
    'sepal.length',
    'sepal.width',
    'petal.length',
    'petal.width',
]
TARGET = 'variety'
ADD_NOISE = True  # set to False to work with the unmodified measurements

data = pd.read_csv(os.path.join('data', 'iris.csv'))
if ADD_NOISE:
    # Seed the global NumPy generator so the perturbation is reproducible.
    np.random.seed(42)
    data[FTS] = data[FTS] + np.random.normal(loc=0, scale=1, size=data[FTS].shape)

data.head()

In [None]:
# Class balance: number of rows per target label.
data[TARGET].value_counts()

## Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[FTS],
    data[TARGET],
    test_size=0.2,
    random_state=42,
)

## Logistic Regression

In [None]:
# Logistic regression baseline with a generous iteration cap for the solver.
logreg = LogisticRegression(random_state=42, max_iter=10000)
logreg.fit(X_train, y_train)

In [None]:
def get_cm_scores(model, X_train, y_train, X_test, y_test):
    """Plot train and test confusion matrices side by side with summary scores.

    For each split the predictions are computed once and reused for both the
    macro-averaged classification report and the confusion matrix. Each
    heatmap title shows the split name followed by the scores rounded to three
    decimals.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``score``.
    X_train, y_train : features and labels of the training split.
    X_test, y_test : features and labels of the held-out split.

    Returns
    -------
    None. The heatmaps are drawn on a newly created matplotlib figure.
    """
    # Use the model's own class ordering when available so both matrices have
    # the same shape and labelled ticks, even if a split is missing a class.
    classes = getattr(model, "classes_", None)
    tick_labels = "auto" if classes is None else list(classes)

    _, axs = plt.subplots(1, 2, figsize=(20, 5))
    splits = [("train", X_train, y_train), ("test", X_test, y_test)]
    for ax, (name, X, y) in zip(axs, splits):
        y_pred = model.predict(X)
        report = classification_report(y, y_pred, output_dict=True)["macro avg"]
        report["accuracy"] = model.score(X, y)
        report = {k: round(v, 3) for k, v in report.items()}
        # Rows of the matrix are the true labels, columns the predicted ones.
        cm = confusion_matrix(y, y_pred, labels=classes)
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            xticklabels=tick_labels,
            yticklabels=tick_labels,
            ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title(f"{name}: {report}")

In [None]:
# Train/test confusion matrices and scores for the logistic regression.
get_cm_scores(
    model=logreg,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Decision Tree

In [None]:
# Decision tree with a depth cap of 100 and a fixed seed.
tree = DecisionTreeClassifier(max_depth=100, random_state=42)
tree.fit(X_train, y_train)

In [None]:
# Train/test confusion matrices and scores for the decision tree.
get_cm_scores(
    model=tree,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Explainability

In [None]:
# Fitted coefficients and intercepts of the logistic regression.
(logreg.coef_, logreg.intercept_)

In [None]:
# Importance proxy: mean absolute coefficient per feature, averaged over the
# rows of the coefficient matrix.
importances = pd.DataFrame(
    {
        "feature": FTS,
        "importance": np.abs(logreg.coef_).mean(axis=0),
    }
)
plt.barh(y=importances["feature"], width=importances["importance"])
plt.title("Feature importances Logistic Regression")

In [None]:
# Feature importances as reported by the fitted decision tree.
importances = pd.DataFrame(
    {
        "feature": FTS,
        "importance": tree.feature_importances_,
    }
)
plt.barh(y=importances["feature"], width=importances["importance"])
plt.title("Feature importances Decision Tree")

In [None]:
# Draw the fitted decision tree on a large canvas so the node text stays legible.
_, ax = plt.subplots(figsize=(40, 20))
plot_tree(
    tree,
    feature_names=FTS,
    # Take the labels from the fitted model: plot_tree expects the class names
    # in the same order as tree.classes_, which the order of first appearance
    # in the data frame does not guarantee. A plain list of str is also
    # accepted by every scikit-learn release, unlike a NumPy array.
    class_names=[str(label) for label in tree.classes_],
    filled=True,
    rounded=True,
    fontsize=14,
    ax=ax,
)
plt.show()