In [1]:
## TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import model_to_dot, plot_model
from keras_tuner import HyperModel, RandomSearch, Objective

## "Normal" libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (auc, precision_recall_curve, confusion_matrix,
                             roc_auc_score, roc_curve, classification_report,
                             precision_score, recall_score, f1_score, accuracy_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from IPython.display import SVG
from sklearn.linear_model import LogisticRegression

In [2]:
def unique_value_counts(array):
    array = np.array(array)  # Convert to numpy array if not already
    unique, counts = np.unique(array, return_counts=True)
    return dict(zip(unique, counts))

In [3]:
def calculate_metrics(y_true, y_pred, y_prob=None, metrics=['precision', 'recall', 'f1', 'accuracy', 'auc']):
    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    TP = cm[1, 1]
    FP = cm[0, 1]
    TN = cm[0, 0]
    FN = cm[1, 0]

    results = {}

    # Calculate precision
    if 'precision' in metrics:
        results['precision'] = round( TP / (TP + FP), 4) if (TP + FP) > 0 else 0

    # Calculate recall
    if 'recall' in metrics:
        results['recall'] = round( TP / (TP + FN), 4) if (TP + FN) > 0 else 0

    # Calculate F1 score
    if 'f1' in metrics:
        results['f1'] = round( 2 * (TP / (2 * TP + FP + FN)), 4) if (2 * TP + FP + FN) > 0 else 0

    # Calculate accuracy
    if 'accuracy' in metrics:
        results['accuracy'] = round( (TP + TN) / (TP + TN + FP + FN), 4)

    # Calculate AUC
    if 'auc' in metrics and y_prob is not None:
        # Note: AUC requires probabilities, not class labels.
        results['auc'] = round( roc_auc_score(y_true, y_prob), 4)

    return results


# Data set-up

In [4]:
# Load the dataset
data = pd.read_excel('cleaned data.xlsx')

# Save as dataframe
data = pd.DataFrame(data)

In [5]:
# create X (features) and y (response)
X = data.drop(['Pno', 'Take-up ind'], axis = 1)
y = data['Take-up ind']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the data
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# all my variables as seen as floats rather than strings or integers
X = X.astype(np.float32).values
y = y.astype(np.float32).values

In [6]:
# Get the test set (10%)
x_train_val, x_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=7)

# Now training (70%) and validation sets (20%)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.2222, stratify=y_train_val, random_state=7)

# Apply SMOTE to the training data
smote = SMOTE(random_state=7)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Version 1 - NO SMOTE

In [51]:
logreg = LogisticRegression(max_iter=2000, random_state=7, solver='lbfgs', C=1.0, penalty='l2') # start with many iterations and no penalty

# Fit and save the logistic regression model using the training datamodel = logreg.fit(x_train, y_train)

In [52]:
# Predict
y_probs = model.predict(x_val)
y_val_pred = (y_probs > 0.5).astype(int) # Apply threshold
print(unique_value_counts(y_val_pred))

# Metrics
metrics = calculate_metrics(y_true=y_val, y_pred=y_val_pred, y_prob=y_probs,
                            metrics=['precision', 'recall', 'f1', 'accuracy', 'auc'])
print(metrics) 


{0: 214, 1: 5}
{'precision': 0.8, 'recall': 0.1905, 'f1': 0.3077, 'accuracy': 0.9178, 'auc': 0.5927}


# Version 2 - SMOTE

In [21]:
logreg = LogisticRegression(max_iter=2000, random_state=7, solver='lbfgs', C=1.0, penalty='l2')

# Fit and save the logistic regression model using the training data
model = logreg.fit(x_train_smote, y_train_smote)

In [22]:
# Predict
y_probs = model.predict(x_val)
y_val_pred = (y_probs > 0.5).astype(int) # Apply threshold
print(unique_value_counts(y_val_pred))

# Metrics
metrics = calculate_metrics(y_true=y_val, y_pred=y_val_pred, y_prob=y_probs,
                            metrics=['precision', 'recall', 'f1', 'accuracy', 'auc'])
print(metrics)

{0: 176, 1: 43}
{'precision': 0.2093, 'recall': 0.4286, 'f1': 0.2812, 'accuracy': 0.79, 'auc': 0.6284}


# Version 3 - SMOTE, NOW L1

In [24]:
logreg = LogisticRegression(max_iter=2000, random_state=7, solver='liblinear', C=1.0, penalty='l1')

# Fit and save the logistic regression model using the training data
model = logreg.fit(x_train_smote, y_train_smote)

In [25]:
# Predict
y_probs = model.predict(x_val)
y_val_pred = (y_probs > 0.5).astype(int) # Apply threshold
print(unique_value_counts(y_val_pred))

# Metrics
metrics = calculate_metrics(y_true=y_val, y_pred=y_val_pred, y_prob=y_probs,
                            metrics=['precision', 'recall', 'f1', 'accuracy', 'auc'])
print(metrics)

{0: 177, 1: 42}
{'precision': 0.2381, 'recall': 0.4762, 'f1': 0.3175, 'accuracy': 0.8037, 'auc': 0.6573}


# HYPERPARAMETER TUNING

In [77]:
# Determine a hyperparameter grid
parameter_grid = {
    'lambda_val': [0.001, 0.01, 0.1, 1, 2, 5, 10],
    'penalty': ['l1', 'l2'], # both regularizations
    'solver': ['liblinear'],  # check guides online!
    'max_iter': [1, 2, 5, 10, 50, 100, 150, 200, 300]
}

In [78]:
# Initialize
best_auc = -np.inf # start at the lowest possible
best_parameters = None
best_model = None

In [79]:
# Iterate the model over the parameters

for lambda_val in parameter_grid['lambda_val']:

    for penalty in parameter_grid['penalty']:

        for solver in parameter_grid['solver']:

            for max_iter in parameter_grid['max_iter']:

                # run model like before
                model = LogisticRegression(C=lambda_val, penalty=penalty, solver=solver, max_iter=max_iter)
                model.fit(x_train, y_train)
                # predict like before

                # Predict
                y_probs = model.predict(x_val)
                y_val_pred = (y_probs > 0.5).astype(int) # Apply threshold
                
                # Metrics
                metrics = calculate_metrics(y_true=y_val, y_pred=y_val_pred, y_prob=y_probs,
                            metrics=['precision', 'recall', 'f1', 'accuracy', 'auc'])

                #print(metrics) # see as iterations continue

                # AUC specifically
                new_auc = roc_auc_score(y_val, y_probs)
                
                if new_auc > best_auc:
                    best_auc = new_auc
                    best_parameters = {'lambda_val': lambda_val, 'penalty': penalty, 'solver': solver, 'max_iter': max_iter}
                    best_model = model




In [80]:
print(f"Best parameters: {best_parameters}")
print(f"Best AUC on thr validation set: {round(best_auc, 4)}")

Best parameters: {'lambda_val': 2, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 1}
Best AUC on thr validation set: 0.6403


# Rebuild the best chosen model

In [83]:
best_model

In [None]:
# Predict
y_probs = model.predict(x_test)
y_test_pred = (y_probs > 0.5).astype(int) # Apply threshold
print(unique_value_counts(y_test_pred))

# Metrics
metrics = calculate_metrics(y_true=y_test, y_pred=y_test_pred, y_prob=y_probs,
                            metrics=['precision', 'recall', 'f1', 'accuracy', 'auc'])
print(metrics)

In [None]:
cm_df = pd.DataFrame(confusion_matrix(y_test_pred, y_test), 
                     index=['Actual 0', 'Actual 1'], 
                     columns=['Predicted 0', 'Predicted 1'])
cm_df