In [1]:
import matplotlib.pyplot as plt

# LOAD DATASET
from util.data_loader import load_dataset
x_train, y_train, x_test, feature_names = load_dataset(
    path_x_train="./../course/projects/project1/data/dataset/x_train.csv",
    path_y_train="./../course/projects/project1/data/dataset/y_train.csv",
    path_x_test="./../course/projects/project1/data/dataset/x_test.csv"
)

In [2]:
# REMOVE USELESS FEATURES
from util.features_info import FEATURES_DICT
from util.features_util import keep_features
feature_indexes = dict(zip(feature_names, range(len(feature_names))))
x_clean, clean_features, clean_feature_index = keep_features(x_train, FEATURES_DICT.keys(), feature_names, feature_indexes)

x_test_clean, clean_features, clean_feature_index = keep_features(x_test, FEATURES_DICT.keys(), feature_names, feature_indexes)

Kept 76 features: dict_keys(['GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', '_HCVU651', '_CHOLCHK', '_RFCHOL', 'CVDSTRK3', '_ASTHMS1', 'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', '_DRDXAR1', 'ADDEPEV2', 'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', '_AGE80', 'MARITAL', '_CHLDCNT', '_EDUCAG', '_INCOMG', 'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'HTM4', 'WTKG3', '_BMI5', '_SMOKER3', 'USENOW3', 'DRNKANY5', 'DROCDY3_', '_RFBING5', '_DRNKWEK', '_RFDRHV5', 'FTJUDA1_', 'FRUTDA1_', 'BEANDAY_', 'GRENDAY_', 'ORNGDAY_', 'VEGEDA1_', '_FRUTSUM', '_VEGESUM', '_FRTLT1', '_VEGLT1', '_TOTINDA', 'METVL11_', 'METVL21_', 'MAXVO2_', 'ACTIN11_', 'ACTIN21_', 'PADUR1_', 'PADUR2_', 'PAFREQ1_', 'PAFREQ2_', '_MINAC11', '_MINAC21', 'STRFREQ_', 'PA1MIN_', 'PAVIG11_', 'PAVIG21_', 'PA1VIGM_', '_PACAT1', '_PAINDX1', '_PA150R2', '_PA300R2', '_PA30021', '_PASTRNG'])
Kept 76 features: dict_keys(['GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PERSDOC2', 'ME

In [3]:
# PREPROCESS KEPT FEATURES
import numpy as np
from util.features_info import REPLACEMENT_LIST
from util.preprocessing import preprocessing_pipeline
x_preprocessed = preprocessing_pipeline(x_clean, 
                                        where=clean_features, 
                                        feature_index=clean_feature_index,
                                        nan_replacement=REPLACEMENT_LIST)

# x_test_preprocessed = preprocessing_pipeline(x_test_clean, 
#                                              where=clean_features, 
#                                              feature_index=clean_feature_index, 
#                                              nan_replacement=REPLACEMENT_LIST)

  outputs = ufunc(*inputs)


In [4]:
# Split train-validation
from util.data_loader import split_train_validation
(x_tr, x_va, y_tr, y_va) = split_train_validation(x_preprocessed, y_train, valid_proportion=0.2)

In [17]:
# oversample training set
from util.data_loader import custom_random_oversampling
x_tr_oversampled, y_tr_oversampled = custom_random_oversampling(x_tr, y_tr[:,1])

In [7]:
import numpy as np

def min_max_normalize(arr: np.ndarray):
    arr_normalized = np.empty_like(arr)
    for column in range(arr.shape[1]):
        arr_normalized[:, column] = (arr[:, column] - arr[:, column].min()) / (arr[:, column].max() - arr[:, column].min())
    return arr_normalized

def standardize(arr: np.ndarray):
    mean = np.mean(arr, axis=0)
    std = np.std(arr, axis=0)
    arr_std = (arr - mean) / std
    return arr_std

In [16]:
x_tr.shape, y_tr.shape, x_tr_oversampled.shape, y_tr_oversampled.shape

((262508, 76), (262508, 2), (471794, 76), (471794,))

In [18]:
# Normalize and add bias column for TRAINING
x_tr_normalized = min_max_normalize(x_tr_oversampled)
x_tr_normalized = np.append(x_tr_normalized, np.ones(shape=(x_tr_normalized.shape[0], 1)), axis=1)

# Normalize and add bias column for VALIDATION
x_va_normalized = min_max_normalize(x_va)
x_va_normalized = np.append(x_va_normalized, np.ones(shape=(x_va_normalized.shape[0], 1)), axis=1)

# # Normalize and add bias column for TEST
# x_test_normalized = min_max_normalize(x_test)
# x_test_normalized = np.append(x_test_normalized, np.ones(shape=(x_test_normalized.shape[0], 1)), axis=1)

In [None]:
from util.train import reg_logistic_regression, reg_logistic_regression_hyperparameters

lambdas = np.logspace(-10,0,10)
gammas = np.logspace(-10,0,10)
w_initials = [np.random.random(size=x_tr_normalized.shape[1]) for _ in range(5)]

best_w, train_losses, valid_losses, best_lambda, best_gamma, best_initial_w = reg_logistic_regression_hyperparameters(
    x_tr_normalized, 
    y_tr_oversampled, 
    x_va_normalized, 
    y_va[:,1], 
    lambdas, 
    gammas, 
    w_initials,
    max_iter=2000
)
print('min val loss: {l} for lambda={la}, gamma={g}, w_initial={w}'.format(l=np.min(valid_losses), la=best_lambda, w=best_initial_w, g=best_gamma))

In [None]:
np.unique(np.where(sigmoid(x_tr_normalized @ best_w) >= 0.5, 1, -1),return_counts=True)

In [None]:
from implementations import sigmoid
from util.evaluation import evaluation_summary
x_tr_predictions = np.where(sigmoid(x_tr_normalized @ w) >= 0.5, 1, -1)

evaluation_summary(y_tr_oversampled, x_tr_predictions)

In [257]:
x_valid_pred = np.where(sigmoid(x_va_normalized @ best_w) >= 0.5, 1, -1)

evaluation_summary(y_va[:,1], x_valid_pred)

Accuracy: 63.64%
Precision: 13.15%
Recall: 56.56%
F1-score: 21.33%


In [None]:
# to plot we run the reg logistic regression with all_lossess returned
best_w_, train_losses_, valid_losses_ = reg_logistic_regression_here(
    x_tr_normalized, 
    y_tr_oversampled, 
    x_va_normalized, y_va[:,1], 
    lambda_= best_lambda,
    gamma = best_gamma, 
    w=best_initial_w,
    max_iter=5000, # before it was 2000 to save some time. But they should be same in the final implementation
    batch_size=100,
    all_losses=True
)

In [None]:
import matplotlib.pyplot as plt
from util.plotting import loss_visualization
plt.figure(figsize=(10, 10))
loss_visualization(train_losses_, valid_losses_)
plt.yscale("log")

In [264]:
## to be implemented correctly
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def confusion_matrix(true_labels: np.ndarray, predicted_labels: np.ndarray, class_names=None):
    # Calculate the confusion matrix
    classes = [1, -1]
    cm = np.zeros((len(classes), len(classes)), dtype=int)

    for i in range(len(classes)):
        for j in range(len(classes)):
            cm[i, j] = np.sum((true_labels == i) & (predicted_labels == j))

    if class_names is None:
        class_names = [str(i) for i in range(len(classes))]

    # Create the confusion matrix heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# # Example usage:
# true_labels = np.array([0, 1, 1, 0, 2, 2])
# predicted_labels = np.array([0, 1, 2, 0, 2, 2])
# confusion_matrix(true_labels, predicted_labels, class_names=['Class 0', 'Class 1', 'Class 2'])