In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, preprocessing
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.pipeline import Pipeline

# Cross-entropy loss

In [None]:
# Candidate predicted probabilities; the endpoints 0 and 1 are excluded so
# that log(0) is never evaluated.
preds = np.linspace(0.001, 0.999, 200)

# Loss incurred by each prediction when the true label is 1
loss_ce = -np.log(preds)     # cross-entropy (log loss)
loss_mse = (1 - preds)**2    # squared error

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(preds, loss_ce, label="CE")
ax.plot(preds, loss_mse, label="MSE")

ax.set_xlabel("Predicted Probability")
# Both curves share this axis, so it is labelled generically: only the CE
# curve is a log loss.
ax.set_ylabel("Loss")
ax.set_title("Cross-entropy vs. squared-error loss")
ax.legend()
plt.show()

# Load iris dataset

In [None]:
iris = datasets.load_iris()

# Lookup table: integer class code -> species name
label_map = dict(enumerate(map(str, iris.target_names)))

# Feature columns plus the numeric target and its readable counterpart
df_iris = pd.DataFrame(iris["data"], columns=iris["feature_names"]).assign(
    target=iris.target,
    target_name=lambda frame: frame["target"].replace(label_map),
)
df_iris

We'll only select `versicolor` and `virginica` for binary classification

In [None]:
# Keep only the two species used for the binary problem. Exact membership is
# tested (rather than a regex substring search) so that only rows whose label
# equals one of these names are selected.
mask = df_iris["target_name"].isin(["versicolor", "virginica"])
df_iris_subset = df_iris.loc[mask]

# Logistic regression

In [None]:
# Features are the first four columns; the species name is the label.
X = df_iris_subset.iloc[:, :4]
y = df_iris_subset["target_name"]

# 80/20 shuffled split with a fixed seed and no stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=None, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression model without a penalty term, fitted on the scaled
# training data
logregr = LogisticRegression(penalty=None).fit(X_train_scaled, y_train)

# Score the fitted model on each split
train_accuracy = logregr.score(X_train_scaled, y_train)
test_accuracy = logregr.score(X_test_scaled, y_test)

print(f"Accuracy of logistic regression classifier on training set: {train_accuracy:.4f}")
print(f"Accuracy of logistic regression classifier on test set: {test_accuracy:.4f}")

Alternatively, we can assess model performance using cross-validation. (Note that we are not fine-tuning hyperparameters here.)

In [None]:
# Bundle the scaler and the classifier into a single estimator for
# cross-validation
pipeline = Pipeline(steps=[("scaler", scaler), ("fit", logregr)])

# Five random 80/20 partitions, seeded for repeatability
shuffle_split = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# One score per ShuffleSplit partition
scores = cross_val_score(
    estimator=pipeline,
    X=df_iris_subset.iloc[:, :4],
    y=df_iris_subset["target"],
    cv=shuffle_split,
)
scores

In [None]:
# Average of the cross-validation scores
np.mean(scores)

## Cross-validation for hyperparameter tuning

In [None]:
# Candidate values of C: ten points spaced logarithmically from 1e-3 to 1e3
Cs = np.logspace(-3, 3, 10)

# Same splitting scheme as before: five seeded random 80/20 partitions
shuffle_split = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Cross-validated search over Cs on the scaled training data
clf = LogisticRegressionCV(Cs=Cs, cv=shuffle_split)
clf.fit(X_train_scaled, y_train)

print(f"Best regularization strength (C): {clf.C_}")
print(f"Accuracy on test set: {clf.score(X_test_scaled, y_test):.4f}")

In [None]:
# Coefficient paths recorded during cross-validation; coefs_paths_ is a
# dictionary keyed by class label
coefs_paths = clf.coefs_paths_["virginica"]

# Names of the first four columns (the features)
feature_names = df_iris_subset.iloc[:, :4].columns

# Plot the coefficient paths for one of the cross-validation splits (index 0)
fig, ax = plt.subplots(figsize=(5, 4))
for i, name in enumerate(feature_names):
    ax.plot(Cs, coefs_paths[0, :, i], label=f"{name}")

ax.set_xlabel("C")
ax.set_ylabel("Coefficient")
ax.set_title("Logistic Regression Coefficient Paths")
ax.set_xscale("log")
# Legend is anchored outside the axes, to the right of the plot
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
ax.grid(True)
plt.show()

In [None]:
# Cross-validation scores for every candidate C.
# NOTE: this rebinds `scores`, which earlier held the cross_val_score result.
scores = clf.scores_["virginica"]

# Summary statistics along axis 0, computed once and reused below
mean_scores = scores.mean(axis=0)
std_scores = scores.std(axis=0)

plt.figure(figsize=(5, 3))
plt.plot(Cs, mean_scores, marker="o", label="Mean CV score")
# Shaded band of one standard deviation around the mean
plt.fill_between(
    Cs,
    mean_scores - std_scores,
    mean_scores + std_scores,
    color="b",
    alpha=0.1,
    label="± 1 std. dev.",
)
plt.xscale("log")
plt.xlabel("C")
plt.ylabel("Mean CV Accuracy")
plt.title("Cross-Validation accuracy for different regularization strengths")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Coefficients stored on the fitted clf, one row per feature
df_coefs = pd.DataFrame(
    clf.coef_[0],
    columns=["Coefficients"],
    index=feature_names,
)
df_coefs.plot.barh(figsize=(5, 4))
plt.axvline(x=0, color=".5")  # grey reference line at zero
plt.xlabel("coefficient values")
plt.title("Logistic Regression Coefficients")
# Render explicitly, like the other plotting cells, so the cell does not end
# on an expression whose Text(...) repr would be echoed as output.
plt.show()

In [None]:

# Locate the selected C on the candidate grid. np.isclose is used instead of
# == so the lookup does not depend on bit-exact floating-point equality.
idx = np.flatnonzero(np.isclose(Cs, clf.C_))
assert idx.size > 0, "selected C was not found among the candidate Cs"

# Coefficients from every cross-validation split at the selected C. The last
# axis is cut to len(feature_names) so that only the feature columns are kept
# (scikit-learn documents a trailing intercept column in coefs_paths_).
df_coefs = pd.DataFrame(
    coefs_paths[:, idx[0], :len(feature_names)],
    columns=feature_names,
)

# Horizontal box plot: spread of each coefficient across the splits
df_coefs.boxplot(figsize=(5, 4), vert=False)
plt.axvline(x=0, color=".5")  # grey reference line at zero
plt.xlabel("Coefficient values")
plt.show()