# Logistic Regression

In [1]:
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.metrics import cohen_kappa_score, confusion_matrix, brier_score_loss
from sklearn.metrics import matthews_corrcoef, fowlkes_mallows_score, r2_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve, roc_curve, top_k_accuracy_score

# Load the forest cover type dataset
data = fetch_covtype()
X, y = data.data, data.target

# Adjust labels to be in the range 0 to 6
y = y - 1
num_classes = len(set(y))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Logistic Regression model
lr_model = LogisticRegression(max_iter=200, random_state=42)
lr_model.fit(X_train, y_train)

# Make predictions with both models
y_pred_lr = lr_model.predict(X_test)
y_prob_lr = lr_model.predict_proba(X_test)

# Define a function to calculate and print metrics
def print_metrics(y_test, y_pred, y_prob, model_name):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=1)
    roc_auc = roc_auc_score(label_binarize(y_test, classes=range(num_classes)), y_prob, multi_class='ovr')
    pr_auc = average_precision_score(label_binarize(y_test, classes=range(num_classes)), y_prob, average='macro')
    log_loss_value = log_loss(y_test, y_prob)
    kappa = cohen_kappa_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    brier_scores = [brier_score_loss(y_test == i, y_prob[:, i]) for i in range(num_classes)]
    brier_score_avg = sum(brier_scores) / num_classes

    mcc = matthews_corrcoef(y_test, y_pred)
    fmi = fowlkes_mallows_score(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    top_k_acc = top_k_accuracy_score(y_test, y_prob, k=3)

    print(f"Metrics for {model_name}:")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-Score: {f1:.3f}")
    print(f"AUC-ROC: {roc_auc:.3f}")
    print(f"AUC-PR: {pr_auc:.3f}")
    print(f"Log Loss: {log_loss_value:.3f}")
    print(f"Cohen's Kappa: {kappa:.3f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Average Brier Score: {brier_score_avg:.3f}")
    print(f"Matthews Correlation Coefficient: {mcc:.3f}")
    print(f"Fowlkes-Mallows Index: {fmi:.3f}")
    print(f"Coefficient of Determination (R^2): {r2:.3f}")
    print(f"Top-3 Accuracy: {top_k_acc:.3f}")
    print("\n")

# Print metrics for both models
print_metrics(y_test, y_pred_lr, y_prob_lr, "Logistic Regression")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Metrics for Logistic Regression:
Accuracy: 0.725
Precision: 0.592
Recall: 0.507
F1-Score: 0.526
AUC-ROC: 0.936
AUC-PR: 0.607
Log Loss: 0.632
Cohen's Kappa: 0.548
Confusion Matrix:
[[29803 11847     8     0     0    18   881]
 [10253 45244   630     2    45   298    28]
 [    0   738  5707   126     4   546     0]
 [    0     2   256   212     0    56     0]
 [   12  1892    67     0    14    10     0]
 [    0   818  1738    27     2   904     0]
 [ 1665    39     0     0     0     0  2311]]
Average Brier Score: 0.055
Matthews Correlation Coefficient: 0.549
Fowlkes-Mallows Index: 0.611
Coefficient of Determination (R^2): 0.227
Top-3 Accuracy: 0.994


