In [None]:
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from util.data_loader import (load_dataset, split_train_validation, 
                              custom_random_oversampling, create_csv_submission,
                              change_negative_class)
from util.features_util import keep_features
from util.features_info import FEATURES_DICT, REPLACEMENT_LIST
from util.preprocessing import preprocessing_pipeline
from util.train import reg_logistic_regression
from util.plotting import loss_visualization
from util.evaluation import evaluation_summary
from util.predict import predict, predict_no_labels


# Data directory: the "data" folder that sits next to the current working directory
BASE_PATH = f"{os.path.dirname(os.getcwd())}/data"

# Data loading

### ⚠️ Note: `x_sub` refers to the data for which predictions will be made, but for which we don't have the labels! For local testing we use `x_va` and `y_va` (validation) instead.

In [None]:
# Read the complete data set (sub-sampling disabled)
x_train, y_train, x_sub, feature_names = load_dataset(
    path_x_train=BASE_PATH + "/x_train.csv",
    path_y_train=BASE_PATH + "/y_train.csv",
    path_x_test=BASE_PATH + "/x_test.csv",
    sub_sample=False,
)
# Lookup table: feature name -> its position in `feature_names`
feature_indexes = {name: position for position, name in enumerate(feature_names)}

# Pre-processing

In [None]:
# Restrict the training data to the features listed in FEATURES_DICT
x_train_clean, clean_features, clean_feature_index = keep_features(
    x_train,
    FEATURES_DICT.keys(),
    feature_names,
    feature_indexes,
    verbose=False,
)

In [None]:
# Run the preprocessing pipeline on the selected training features
# NOTE: the preprocessing automatically performs also normalization
# NOTE(review): this runs before the train/validation split — confirm that any
# statistics the pipeline computes are allowed to see the validation rows.
x_train_preprocessed = preprocessing_pipeline(
    x_train_clean,
    where=clean_features,
    feature_index=clean_feature_index,
    nan_replacement=REPLACEMENT_LIST,
)

## Split local data into train and validation

#### From now on we'll use the local train and validation data: `x_tr` and `y_tr` for training and `x_va` and `y_va` for validation

In [None]:
# Translate labels from -1/1 to 0/1.
# The labels are taken from column 1 of `y_train`. The result is stored under a
# new name so that `y_train` is not overwritten: reassigning it here would make
# `y_train[:, 1]` fail when this cell is run a second time.
y_train_binary = change_negative_class(y_train[:, 1], current=-1, new=0)

# Split local data into train and validation (20% held out for validation)
(x_tr, x_va, y_tr, y_va) = split_train_validation(x_train_preprocessed,
                                                  y_train_binary,
                                                  valid_proportion=0.2)

## Data preparation for training

In [None]:
# Oversample the training split only (the data is unbalanced).
# NOTE: to properly validate the model the validation set must NOT be oversampled,
# so `x_va` / `y_va` are used as they are.
x_tr_oversampled, y_tr_oversampled = custom_random_oversampling(x_tr, y_tr)

# Append a constant column of ones (bias term) to the training and validation data
x_tr_oversampled_bias = np.concatenate(
    (x_tr_oversampled, np.ones(shape=(x_tr_oversampled.shape[0], 1))),
    axis=1,
)
x_va_bias = np.concatenate(
    (x_va, np.ones(shape=(x_va.shape[0], 1))),
    axis=1,
)


# Training

In [None]:
from util.train import reg_logistic_regression_hyperparameters
import pickle

# Candidate values explored by the hyperparameter search
lambdas_ = [0.0005, 0.001, 0.01, 0.1]
gammas = [0.001, 0.01, 0.05, 0.1]
batch_sizes = [10, 500, 10000]

# Search grid handed to the hyperparameter search: one list of candidates per setting
hyperparameters = dict(
    lambda_=lambdas_,
    gamma=gammas,
    batch_size=batch_sizes,
    optimizer=["sgd"],
)

In [None]:
# Run the hyperparameter search: fit on the oversampled training data and
# pass the (non-oversampled) validation data along with the search grid.
res = reg_logistic_regression_hyperparameters(
    x_tr_oversampled_bias, y_tr_oversampled,
    x_va_bias, y_va,
    hyperparameters,
)

print(res)

In [None]:
# Persist the search results so they can be reloaded without re-running the search
with open('objs.pkl', mode='wb') as results_file:
    pickle.dump(res, results_file)
    

## Load results

In [None]:
# Restore the saved search results.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open('objs.pkl', mode='rb') as results_file:
    res = pickle.load(results_file)

# Show which fields each result entry provides
res[0].keys()

In [None]:
# Plot one validation-loss heatmap (lambda on the rows, gamma on the columns)
# for each batch size.

# The axis values do not depend on the batch size, so compute them once instead
# of calling np.unique again for every cell of every heatmap.
lambda_values = np.unique([d['lambda_'] for d in res])
gamma_values = np.unique([d['gamma'] for d in res])
lambda_position = {value: idx for idx, value in enumerate(lambda_values)}
gamma_position = {value: idx for idx, value in enumerate(gamma_values)}

# savefig does not create missing directories, so make sure the target exists
plots_dir = os.path.dirname(os.getcwd()) + '/plots/'
os.makedirs(plots_dir, exist_ok=True)

for batch_size in batch_sizes:
    # Filter data for the current batch size
    filtered_res = [d for d in res if d['batch_size'] == batch_size]

    # Create a pivot table for the heatmap. NaN marks (lambda, gamma) pairs that
    # have no result for this batch size, so they are not shown as a loss of 0.
    pivot_res = np.full((len(lambda_values), len(gamma_values)), np.nan)
    for item in filtered_res:
        lambda_idx = lambda_position[item['lambda_']]
        gamma_idx = gamma_position[item['gamma']]
        pivot_res[lambda_idx, gamma_idx] = item['valid_loss']

    # Create a heatmap
    plt.figure(figsize=(5, 4))
    sns.heatmap(pivot_res, annot=True, xticklabels=gamma_values,
                yticklabels=lambda_values)
    plt.xlabel('Gamma')
    plt.ylabel('Lambda')
    plt.title(f'Batch Size {batch_size} Heatmap')
    plt.savefig(plots_dir + f'heatmap_batch_size_{batch_size}.png')




## Evaluation on training set

In [None]:
# Order the runs in place by ascending validation loss, then take the weights
# of the first (lowest-loss) run
res.sort(key=lambda run: run["valid_loss"])
w_opt = res[0]["best_weights"]

In [None]:
# Predictions on the oversampled training data with the selected weights
predicted_y_tr = predict(
    x_tr_oversampled_bias,
    w_opt,
    threshold=0.5,
    negative_label=0,
)

print("EVALUATION SUMMARY ON TRAINING SET")
evaluation_summary(y_tr_oversampled, predicted_y_tr)

In [None]:
# Histogram of the model outputs before any label is assigned (training data)
train_scores = predict_no_labels(x_tr_oversampled_bias, w_opt)
sns.histplot(train_scores, bins=30, kde=True)
plt.title("Distribution of unlabeled predictions on training set")
plt.show()

## Evaluation on validation set

In [None]:
# Decision threshold applied to the validation predictions.
# NOTE(review): presumably chosen from the decision-threshold analysis — confirm.
validation_threshold = 0.24215

predicted_y_va = predict(
    x_va_bias,
    w_opt,
    threshold=validation_threshold,
    negative_label=0,
)
print("EVALUATION SUMMARY ON VALIDATION SET")
evaluation_summary(y_va, predicted_y_va)

In [None]:
# Histogram of the model outputs before any label is assigned (validation data)
validation_scores = predict_no_labels(x_va_bias, w_opt)
sns.histplot(validation_scores, bins=30, kde=True)
plt.title("Distribution of unlabeled predictions on validation set")
plt.show()

In [None]:
from util.plotting import decision_threshold

# Decision-threshold analysis on the validation data with the selected weights
decision_threshold(
    x_va_bias,
    w_opt,
    y_va,
)



---

# COMPUTE RESULT FOR SUBMISSION

In [None]:
# Keep only selected features in submission
x_sub_clean, clean_features, clean_feature_index = keep_features(x_sub, FEATURES_DICT.keys(), feature_names, feature_indexes)

x_sub_preprocessed = preprocessing_pipeline(x_sub_clean,
                                            where=clean_features,
                                            feature_index=clean_feature_index,
                                            nan_replacement=REPLACEMENT_LIST)

# Add bias column to the submission data
x_sub_bias = np.append(x_sub_preprocessed, np.ones(shape=(x_sub_preprocessed.shape[0], 1)), axis=1)

In [None]:
# Predict the submission labels with the selected weights `w_opt`.
# (The name `w` used here before is never defined in this notebook, so the cell
# raised a NameError on a fresh kernel.)
# Labels are written as -1/1 here, unlike the 0/1 labels used for local evaluation.
# NOTE(review): the validation cell uses threshold=0.24215 while this one uses 0.5 —
# confirm which threshold is intended for the submission.
predicted_y_sub = predict(x_sub_bias,
                          w_opt,
                          threshold=0.5,
                          negative_label=-1)

# Histogram of the model outputs before any label is assigned (submission data)
sns.histplot(predict_no_labels(x_sub_bias, w_opt), bins=30, kde=True)
plt.title("Distribution of unlabeled predictions on submission set")
plt.show()

In [None]:
### Note: the submission data has no true labels, so accuracy and other metrics cannot be computed for it

In [None]:
# Write the submission predictions to a csv file

# FIXME: this is not working, we have to keep indexes!
# The ids passed below are just the positions 0..n-1 of the predictions.
create_csv_submission(
    ids=range(len(predicted_y_sub)),
    y_pred=predicted_y_sub,
    path=BASE_PATH + "/submission.csv",
)