In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from util.data_loader import (load_dataset, split_train_validation, 
                              custom_random_oversampling, create_csv_submission,
                              change_negative_class)
from util.features_util import keep_features
from util.features_info import FEATURES_DICT, REPLACEMENT_LIST
from util.preprocessing import preprocessing_pipeline
from util.train import reg_logistic_regression
from util.plotting import loss_visualization
from util.plotting import decision_threshold
from util.evaluation import evaluation_summary
from util.evaluation import accuracy, f1_score
from util.predict import predict, predict_no_labels
from util.util import init_random_seeds


# Data directory: the "data" folder located in the parent of the current
# working directory.
BASE_PATH = f"{os.path.dirname(os.getcwd())}/data"

# Initialize the random seeds through the project helper
# (presumably for reproducible runs -- see util.util.init_random_seeds).
init_random_seeds()

# Data loading

### ⚠️ Note: `x_sub` refers to the data for which predictions will be submitted, and for which we have no labels. For local testing we use the validation data `x_va` and `y_va`.

In [None]:
# Read the complete training data and the unlabeled submission data
# (sub_sample=False: no subsampling is requested from the loader).
x_train, y_train, x_sub, feature_names = load_dataset(
    path_x_train=f"{BASE_PATH}/x_train.csv",
    path_y_train=f"{BASE_PATH}/y_train.csv",
    path_x_test=f"{BASE_PATH}/x_test.csv",
    sub_sample=False,
)

# Map every feature name to its position in `feature_names`
feature_indexes = {name: position for position, name in enumerate(feature_names)}

# Pre-processing

In [None]:
# Keep only the features listed in FEATURES_DICT in the training data.
# Besides the reduced matrix, keep_features returns two companions
# (`clean_features`, `clean_feature_index`) that are handed to the
# preprocessing pipeline below.
x_train_clean, clean_features, clean_feature_index = keep_features(
    x_train,
    FEATURES_DICT.keys(),
    feature_names,
    feature_indexes,
)

In [None]:
# Run the preprocessing pipeline on the selected training features.
# NOTE: the preprocessing also performs normalization automatically.
x_train_preprocessed = preprocessing_pipeline(
    x_train_clean,
    where=clean_features,
    feature_index=clean_feature_index,
    nan_replacement=REPLACEMENT_LIST,
)

## Split local data into train and validation

#### From now on we'll use the local train and validation data: `x_tr` and `y_tr` for training and `x_va` and `y_va` for validation

In [None]:
# Translate labels from -1/1 to 0/1.
# The labels are read from column 1 of the raw `y_train` array. The result is
# stored under a new name instead of overwriting `y_train`, so that this cell
# can be re-run: `y_train[:, 1]` requires the original 2-D array and would
# raise an IndexError once `y_train` had been replaced by a 1-D array.
y_train_binary = change_negative_class(y_train[:, 1], current=-1, new=0)

# Split the local data into train (`x_tr`, `y_tr`) and validation
# (`x_va`, `y_va`) parts, with valid_proportion=0.2.
(x_tr, x_va, y_tr, y_va) = split_train_validation(x_train_preprocessed,
                                                  y_train_binary,
                                                  valid_proportion=0.2)

## Data preparation for training

In [None]:
# Oversample the training split only (the data is unbalanced).
# NOTE: the validation split is deliberately left untouched -- to properly
# validate the model it must not be oversampled.
x_tr_oversampled, y_tr_oversampled = custom_random_oversampling(x_tr, y_tr)

# Append a constant column of ones (bias term) as the last column of the
# training and validation matrices.
x_tr_oversampled_bias = np.concatenate(
    (x_tr_oversampled, np.ones(shape=(x_tr_oversampled.shape[0], 1))),
    axis=1,
)
x_va_bias = np.concatenate(
    (x_va, np.ones(shape=(x_va.shape[0], 1))),
    axis=1,
)


# Training

In [None]:
# Fit the regularized logistic regression on the oversampled training data,
# passing the validation data along. The initial weights (one per column,
# bias included) are drawn with np.random.random, i.e. uniformly from [0, 1).
# The returned losses are plotted in the next cell.
w, train_loss, valid_loss = reg_logistic_regression(
    x_tr_oversampled_bias,
    y_tr_oversampled,
    x_va_bias,
    y_va,
    lambda_=0.001,
    max_iter=5000,
    gamma=0.003,
    batch_size=100,
    w=np.random.random(size=x_tr_oversampled_bias.shape[1]),
    optimizer="sgd",
    all_losses=True,
)


In [None]:
# Plot the training and validation losses, then switch the y-axis of the
# current axes to a logarithmic scale.
loss_visualization(train_loss, valid_loss)
plt.gca().set_yscale("log")

# Evaluation on training set

In [None]:
# Predict labels for the (oversampled) training data with a 0.5 decision
# threshold; negatives are encoded as 0 to match `y_tr_oversampled`.
predicted_y_tr = predict(x_tr_oversampled_bias, w, threshold=0.5, negative_label=0)

print("EVALUATION SUMMARY ON TRAINING SET")
evaluation_summary(y_tr_oversampled, predicted_y_tr)

In [None]:
# Histogram (30 bins, with KDE overlay) of the raw, un-thresholded model
# outputs on the training data. histplot returns the axes it drew on, so the
# title is set directly on that object.
sns.histplot(
    predict_no_labels(x_tr_oversampled_bias, w),
    bins=30,
    kde=True,
).set_title("Distribution of unlabeled predictions on training set")
plt.show()

# Evaluation on validation set

In [None]:
# Predict labels for the validation data; negatives are encoded as 0 to match
# `y_va`.
# NOTE(review): the 0.47 decision threshold is hard-coded here -- confirm it is
# still the intended value whenever the model is retrained.
predicted_y_va = predict(x_va_bias, w, threshold=0.47, negative_label=0)

print("EVALUATION SUMMARY ON VALIDATION SET")
evaluation_summary(y_va, predicted_y_va)

In [None]:
# Histogram (30 bins, with KDE overlay) of the raw, un-thresholded model
# outputs on the validation data. histplot returns the axes it drew on, so the
# title is set directly on that object.
sns.histplot(
    predict_no_labels(x_va_bias, w),
    bins=30,
    kde=True,
).set_title("Distribution of unlabeled predictions on validation set")
plt.show()

In [None]:
# Visualize the effect of the decision threshold on the validation data.
# (`decision_threshold` is imported in the notebook's top import cell.)
decision_threshold(x_va_bias, w, y_va)



---

# COMPUTE RESULT FOR SUBMISSION

In [None]:
# Sweep 200 candidate decision thresholds in [0, 1] on the validation data and
# report the one with the highest F1 score.
# (`accuracy` and `f1_score` are imported in the notebook's top import cell.)
thresholds = np.linspace(0, 1, 200)
accuracies = []
f1_scores = []

for threshold in thresholds:
    # `y_va` is encoded as 0/1, so the predictions must use 0 as the negative
    # label as well (as in the validation evaluation cell); relying on the
    # default of `predict` risks comparing mismatched label encodings.
    y_prediction = predict(x_va_bias, w, threshold=threshold, negative_label=0)
    accuracies.append(accuracy(y_va, y_prediction))
    f1_scores.append(f1_score(y_va, y_prediction))

# Thresholds for which the F1 score is undefined (NaN) are scored as 0 so that
# they can never be selected by argmax.
f1_scores = np.array(f1_scores)
f1_scores = np.where(np.isnan(f1_scores), 0, f1_scores)

# Keep the winning threshold in a variable so it can be inspected or reused,
# not only printed.
best_threshold = thresholds[np.argmax(f1_scores)]
print(best_threshold)

In [None]:
# Show the decision-threshold plot for the validation data again, next to the
# threshold search of this section.
# (`decision_threshold` is imported in the notebook's top import cell.)
decision_threshold(x_va_bias, w, y_va)


## Pre-processing and data preparation

In [None]:
# The first column of the raw submission matrix is kept aside: it is used as
# the ids when writing the submission CSV.
ids = x_sub[:, 0]

# Keep only the selected features in the submission data (same selection as
# for the training data).
x_sub_clean, clean_features, clean_feature_index = keep_features(
    x_sub,
    FEATURES_DICT.keys(),
    feature_names,
    feature_indexes,
    verbose=False,
)

# Apply the same preprocessing pipeline call that was used for training.
# NOTE(review): the pipeline is invoked on the submission data alone -- confirm
# that any statistics it uses (e.g. for normalization) are the intended ones.
x_sub_preprocessed = preprocessing_pipeline(
    x_sub_clean,
    where=clean_features,
    feature_index=clean_feature_index,
    nan_replacement=REPLACEMENT_LIST,
)

# Append the bias column of ones as the last column of the submission data
x_sub_bias = np.concatenate(
    (x_sub_preprocessed, np.ones(shape=(x_sub_preprocessed.shape[0], 1))),
    axis=1,
)

In [None]:
# Predict the submission labels; negatives are encoded as -1 here (unlike the
# 0/1 encoding used for local training and validation).
# NOTE(review): the 0.6 decision threshold is hard-coded and differs both from
# the 0.47 used for the validation summary and from the F1-optimal threshold
# printed by the sweep in this section -- confirm which value is intended.
predicted_y_sub = predict(x_sub_bias,
                          w,
                          threshold=0.6,
                          negative_label=-1)

# Histogram of the raw, un-thresholded model outputs on the submission data,
# titled and shown like the training/validation histograms.
sns.histplot(predict_no_labels(x_sub_bias, w), bins=30, kde=True)
plt.title("Distribution of unlabeled predictions on submission set")
plt.show()

In [None]:
# Side-by-side histograms: validation labels (left) versus the labels
# predicted for the submission data (right). histplot returns the axes it was
# given, so each title is set on the returned object.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(y_va, ax=axs[0]).set_title("Labels distribution in validation set")
sns.histplot(predicted_y_sub, ax=axs[1]).set_title("Predictions distribution")

In [None]:
# Write the ids and the predicted submission labels to submission.csv inside
# the data directory.
create_csv_submission(
    ids=ids,
    y_pred=predicted_y_sub,
    path=f"{BASE_PATH}/submission.csv",
)