## **Logistic Regression from scratch**

In [1]:
import os
import sys
import argparse
import time
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix

src_path = os.path.abspath('..')
if src_path not in sys.path:
    sys.path.append(src_path)

from src.model.logistic_regression import LogisticRegression
from src.utils.data_processing import load_data, scale_numerical_features
from src.utils.model_utils import create_output_dir, load_config, save_checkpoint
from src.utils.visualization import plot_confusion_matrix

In [2]:
X_train, y_train = load_data("../data/train.csv")
X_val, y_val = load_data("../data/val.csv")
X_test, y_test = load_data("../data/test.csv")

X_train_scaled, X_val_scaled, scaler = scale_numerical_features(X_train=X_train, X_val=X_val, features_to_scale=['AGE'])
X_test_scaled = X_test.copy()
X_test_scaled['AGE'] = scaler.transform(X_test[['AGE']])

In [3]:
def train_and_evaluate(lr, penalty, C):
    model = LogisticRegression(
        learning_rate=lr,
        penalty=penalty,
        C=C
    )
    start_time = time.time()
    model.fit(X_train, y_train, X_val=X_val, y_val=y_val, early_stopping=True, patience=10)
    training_time = time.time() - start_time

    val_pred = model.predict(X_val)
    metrics = model.compute_metrics(y_val, val_pred)
    return metrics['accuracy'], training_time, model

In [None]:
learning_rates = [0.1, 0.05, 0.01, 0.001, 0.0001]
penalties = ['l1', 'l2', None]
C_values = [0.01, 0.1, 1]

best_accuracy = 0
best_params = None

for penalty in penalties:
    if penalty == None:
        C_current = [None]
    else:
        C_current = C_values

    for C in C_current:
        for lr in learning_rates:
            accuracy, training_time, model = train_and_evaluate(lr, penalty, C)
            print(f"Training with penalty={penalty}, C={C}, lr={lr} => Accuracy: {accuracy:.4f}, Training Time: {training_time:.4f} seconds")
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = (lr, penalty, C)

print("==============================================================")
print("Best params:", best_params)
print("Best accuracy:", best_accuracy)


Training with penalty=l1, C=0.01, lr=0.1 => Accuracy: 0.3340, Training Time: 0.8308 seconds
Training with penalty=l1, C=0.01, lr=0.05 => Accuracy: 0.3564, Training Time: 1.2142 seconds
Training with penalty=l1, C=0.01, lr=0.01 => Accuracy: 0.6382, Training Time: 9.1982 seconds
Training with penalty=l1, C=0.01, lr=0.001 => Accuracy: 0.7248, Training Time: 457.5534 seconds
Training with penalty=l1, C=0.01, lr=0.0001 => Accuracy: 0.6664, Training Time: 560.9543 seconds
Training with penalty=l1, C=0.1, lr=0.1 => Accuracy: 0.6666, Training Time: 0.7457 seconds
Training with penalty=l1, C=0.1, lr=0.05 => Accuracy: 0.6666, Training Time: 0.5687 seconds
Training with penalty=l1, C=0.1, lr=0.01 => Accuracy: 0.6530, Training Time: 0.7489 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training with penalty=l1, C=0.1, lr=0.001 => Accuracy: 0.6668, Training Time: 497.2857 seconds


In [None]:
# Validate the best model on the test set
best_lr, best_penalty, best_C = best_params
best_model = LogisticRegression(
    learning_rate=best_lr,
    penalty=best_penalty,
    C=best_C
)

best_model.fit(X_train, y_train, X_val=X_val, y_val=y_val, early_stopping=True, patience=10)
y_test_pred = best_model.predict(X_test)
metrics = best_model.compute_metrics(y_test, y_test_pred)

print("Accuracy:", metrics['accuracy'])
print("Precision:", metrics['precision'])
print("Recall:", metrics['recall'])
print("F1 Score:", metrics['f1_score'])
print("===================================")
print("Classification Report:", metrics['classification_report'])

In [None]:
# Plot confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix - Best Model (lr={best_lr}, penalty={best_penalty}, C={best_C})')
plot_confusion_matrix(cm, classes=['No', 'Yes'])
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

## **Logistic Regression Sklearn**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
param_grid = {'penalty': ['l1', 'l2', None],
              'C': [0.01, 0.1, 1],
              'solver': ['liblinear', 'saga']}

grid = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)

Best params: {'C': 0.01, 'penalty': 'l1'}


In [None]:
# Validate the best model on the test set
best_sklearn_model = grid.best_estimator_
y_test_pred = best_sklearn_model.predict(X_test)
print("Validate the best model on the test set")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_test_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_test_pred, average='weighted'))
print("===================================")
print("Classification Report:\n", classification_report(y_test, y_test_pred))

Validation Accuracy (Grid Search): 0.7636
Validation Precision (Grid Search): 0.6742
Validation Recall (Grid Search): 0.5620
Validation F1 Score (Grid Search): 0.6130
Classification Report (Grid Search):
              precision    recall  f1-score   support

           0       0.80      0.86      0.83     29487
           1       0.67      0.56      0.61     14736

    accuracy                           0.76     44223
   macro avg       0.74      0.71      0.72     44223
weighted avg       0.76      0.76      0.76     44223

Validation Confusion Matrix (Grid Search):
[[25485  4002]
 [ 6454  8282]]


In [None]:
# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix - {best_sklearn_model.__class__.__name__}')
plt.show()