In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from util.data_loader import (load_dataset, split_train_validation, 
                              custom_random_oversampling, create_csv_submission,
                              change_negative_class)
from util.features_util import keep_features
from util.features_info import FEATURES_DICT, REPLACEMENT_LIST, FEATURES_BY_CATEGORY
from util.preprocessing import preprocessing_pipeline
from util.custom_pca import CustomPCA
from util.train import reg_logistic_regression
from util.plotting import loss_visualization, decision_threshold
from util.evaluation import evaluation_summary
from util.predict import predict, predict_no_labels


# Data directory: "<parent of the current working directory>/data".
# NOTE(review): this only resolves correctly when the kernel's working directory
# is a direct sub-folder of the project root — confirm before running elsewhere.
BASE_PATH = os.path.dirname(os.getcwd()) + "/data"

# Data loading

### ⚠️ Note that `x_sub` refers to the submission data: the samples we have to predict, for which no labels are available. For local testing we use `x_va` and `y_va` (validation) instead.

In [None]:
# Read the full CSV files (sub-sampling disabled).
# x_sub is the data to predict for the submission.
x_train, y_train, x_sub, feature_names = load_dataset(
    path_x_train=BASE_PATH + "/x_train.csv",
    path_y_train=BASE_PATH + "/y_train.csv",
    path_x_test=BASE_PATH + "/x_test.csv",
    sub_sample=False,
)

# Lookup table: feature name -> position of that name in feature_names
feature_indexes = {name: position for position, name in enumerate(feature_names)}

# Pre-processing

In [None]:
# Restrict the training matrix to the features listed in FEATURES_DICT.
# Besides the filtered data, the call returns the retained feature names and
# the lookup used later to find each retained feature's column.
x_train_clean, clean_features, clean_feature_index = keep_features(
    x_train,
    FEATURES_DICT.keys(),
    feature_names,
    feature_indexes,
)

In [None]:
# Run the preprocessing pipeline on the training data.
# NOTE: the preprocessing automatically performs normalization as well.
x_train_preprocessed = preprocessing_pipeline(
    x_train_clean,
    where=clean_features,
    feature_index=clean_feature_index,
    nan_replacement=REPLACEMENT_LIST,
)

# Recode the labels (second column of y_train) from -1/1 to 0/1
reduced_y_train = change_negative_class(y_train[:, 1], current=-1, new=0)

### PCA exploration

##### For each group of features we first plot the correlation matrix, then perform PCA on that group, keeping the components that explain 90% of the variance.

In [None]:
# Save one correlation heatmap per feature category as a PNG file.
# Categories with a single feature are skipped: there is nothing to correlate.

# Output folder: "<parent of the current working directory>/plots".
# It is created up front because savefig fails when the folder does not exist.
plots_dir = os.path.dirname(os.getcwd()) + "/plots"
os.makedirs(plots_dir, exist_ok=True)

for category in FEATURES_BY_CATEGORY.keys():
    # Names of the features belonging to the current category
    tmp_fs_to_keep = [str(f) for f in FEATURES_BY_CATEGORY[category]]

    if len(tmp_fs_to_keep) <= 1:
        continue

    # Extract only the columns of the current category
    tmp_x, tmp_features, tmp_feature_index = keep_features(x_train_preprocessed,
                                                           features_to_keep=tmp_fs_to_keep,
                                                           features=clean_features,
                                                           feature_index=clean_feature_index,
                                                           verbose=False)

    fig, ax = plt.subplots(figsize=(6, 6))

    # rowvar=False: each column is a variable, each row an observation
    correlation_matrix = np.corrcoef(tmp_x, rowvar=False)
    sns.heatmap(correlation_matrix, annot=False, fmt='.2f', cmap='coolwarm', square=True,
                xticklabels=tmp_features, yticklabels=tmp_features, ax=ax, vmin=-1, vmax=1)

    ax.set_title(category)
    fig.tight_layout()
    fig.savefig(plots_dir + "/corr_matrix_{}.png".format(category))
    # Close the figure so that figures do not accumulate in memory across iterations
    plt.close(fig)
        

In [None]:
# Perform PCA on each group of features, keeping only the smallest
# number of components that explain 90% of the variance.
# Categories with a single feature are passed through unchanged.

# Fitted PCA object per category, reused later to transform the submission data
PCA_per_category = {}

# Column blocks are collected in a list and concatenated once after the loop,
# instead of re-copying the growing matrix at every iteration.
# The empty (n_samples, 0) block keeps the result defined even with no categories.
reduced_train_blocks = [np.empty((x_train_preprocessed.shape[0], 0))]

for category in FEATURES_BY_CATEGORY.keys():
    fs_in_category = [str(f) for f in FEATURES_BY_CATEGORY[category]]

    if len(fs_in_category) > 1:
        # Column indices of the category, in the order given by clean_features
        wanted_features = set(fs_in_category)
        fs_in_category_indices = [clean_feature_index[feature]
                                  for feature in clean_features
                                  if feature in wanted_features]
        category_data = x_train_preprocessed[:, fs_in_category_indices]

        # Create the PCA object and fit it on the training data of this category
        tmp_pca = CustomPCA()
        tmp_pca.fit(category_data)
        PCA_per_category[category] = tmp_pca

        reduced_train_blocks.append(tmp_pca.transform(category_data, threshold=0.9))
    else:
        # Single feature: keep the column as is, reshaped to (n_samples, 1)
        single_column = x_train_preprocessed[:, clean_feature_index[fs_in_category[0]]]
        reduced_train_blocks.append(single_column.reshape(-1, 1))

reduced_x_train = np.concatenate(reduced_train_blocks, axis=1)


In [None]:
# Report how many columns remain after the per-category PCA
n_reduced_features = reduced_x_train.shape[1]
print("Number of features reduced to {} thanks to PCA".format(n_reduced_features))

## Split local data into train and validation

#### From now on we'll use the local train and validation data: `x_tr` and `y_tr` for training and `x_va` and `y_va` for validation

In [None]:


# Split the local data into a training part and a validation part
# (validation proportion set to 0.2).
# NOTE(review): no seed is passed here; if the split is random, the partition
# changes on every run — confirm against util.data_loader.
x_tr, x_va, y_tr, y_va = split_train_validation(
    reduced_x_train,
    reduced_y_train,
    valid_proportion=0.2,
)

## Data preparation for training

In [None]:
# Oversample the training split only (the data is unbalanced).
# NOTE: the validation set is deliberately NOT oversampled, so that the model
# is validated properly.
x_tr_oversampled, y_tr_oversampled = custom_random_oversampling(x_tr, y_tr)

# Append a constant column of ones (bias term) as the last column of both
# the training and the validation data
train_bias_column = np.ones((x_tr_oversampled.shape[0], 1))
valid_bias_column = np.ones((x_va.shape[0], 1))
x_tr_oversampled_bias = np.concatenate((x_tr_oversampled, train_bias_column), axis=1)
x_va_bias = np.concatenate((x_va, valid_bias_column), axis=1)


# Training

In [None]:
# Train the regularized logistic regression on the oversampled training data,
# tracking the loss on both the training and the validation data.

# Fix NumPy's global seed so the random initial weights are identical on every run.
# NOTE(review): any sampling done inside reg_logistic_regression becomes
# reproducible too only if it draws from NumPy's global RNG — confirm.
np.random.seed(42)

# One initial weight per column (features + bias), drawn uniformly from [0, 1)
initial_w = np.random.random(size=x_tr_oversampled_bias.shape[1])

w, train_loss, valid_loss = reg_logistic_regression(x_tr_oversampled_bias,
                                                    y_tr_oversampled,
                                                    x_va_bias,
                                                    y_va,
                                                    lambda_=0.001,
                                                    max_iter=5000,
                                                    gamma=0.01,
                                                    batch_size=100,
                                                    w=initial_w,
                                                    all_losses=True)


In [None]:
# Plot the training and validation losses, then switch the y axis of the
# current axes to a logarithmic scale
loss_visualization(train_loss, valid_loss)
plt.gca().set_yscale("log")

# Evaluation on training set

In [None]:
# Predict 0/1 labels on the (oversampled) training data with a 0.5 threshold
predicted_y_tr = predict(
    x_tr_oversampled_bias,
    w,
    threshold=0.5,
    negative_label=0,
)

print("EVALUATION SUMMARY ON TRAINING SET")
evaluation_summary(y_tr_oversampled, predicted_y_tr)

In [None]:
# Histogram (30 bins, with KDE curve) of the predict_no_labels output
# on the oversampled training data
train_scores = predict_no_labels(x_tr_oversampled_bias, w)
train_hist_ax = sns.histplot(train_scores, bins=30, kde=True)
train_hist_ax.set_title("Distribution of unlabeled predictions on training set")
plt.show()

# Evaluation on validation set

In [None]:
# Predict 0/1 labels on the validation data.
# NOTE(review): the 0.65 threshold differs from the 0.5 used on the training
# set; presumably it was picked from the decision-threshold analysis — confirm.
predicted_y_va = predict(
    x_va_bias,
    w,
    threshold=0.65,
    negative_label=0,
)

print("EVALUATION SUMMARY ON VALIDATION SET")
evaluation_summary(y_va, predicted_y_va)

In [None]:
# Histogram (30 bins, with KDE curve) of the predict_no_labels output
# on the validation data
valid_scores = predict_no_labels(x_va_bias, w)
valid_hist_ax = sns.histplot(valid_scores, bins=30, kde=True)
valid_hist_ax.set_title("Distribution of unlabeled predictions on validation set")
plt.show()

In [None]:
# NOTE(review): presumably visualises how the validation results vary with the
# decision threshold, to motivate the cut-off used for the predictions —
# confirm against util.plotting.decision_threshold.
decision_threshold(x_va_bias, w, y_va)


---

# COMPUTE RESULT FOR SUBMISSION

## Pre-processing and data preparation

In [None]:
# Keep only selected features in submission data
ids = x_sub[:, 0]

x_sub_clean, clean_features, clean_feature_index = keep_features(x_sub, FEATURES_DICT.keys(), feature_names, feature_indexes)

x_sub_preprocessed = preprocessing_pipeline(x_sub_clean,
                                            where=clean_features,
                                            feature_index=clean_feature_index,
                                            nan_replacement=REPLACEMENT_LIST)

In [None]:
# Apply to the submission data the PCA objects already fitted on the training
# data, one per feature category, with the same 90% variance threshold.
# Categories with a single feature are passed through unchanged.

# Column blocks are collected in a list and concatenated once after the loop,
# instead of re-copying the growing matrix at every iteration.
# The empty (n_samples, 0) block keeps the result defined even with no categories.
reduced_sub_blocks = [np.empty((x_sub_preprocessed.shape[0], 0))]

for category in FEATURES_BY_CATEGORY.keys():
    fs_in_category = [str(f) for f in FEATURES_BY_CATEGORY[category]]

    if len(fs_in_category) > 1:
        # Column indices of the category, in the order given by clean_features
        wanted_features = set(fs_in_category)
        fs_in_category_indices = [clean_feature_index[feature]
                                  for feature in clean_features
                                  if feature in wanted_features]
        category_data_sub = x_sub_preprocessed[:, fs_in_category_indices]

        # Reuse the PCA object fitted on the training data for this category
        tmp_pca = PCA_per_category[category]
        reduced_sub_blocks.append(tmp_pca.transform(category_data_sub, threshold=0.9))
    else:
        # Single feature: keep the column as is, reshaped to (n_samples, 1)
        single_column = x_sub_preprocessed[:, clean_feature_index[fs_in_category[0]]]
        reduced_sub_blocks.append(single_column.reshape(-1, 1))

reduced_x_sub = np.concatenate(reduced_sub_blocks, axis=1)


In [None]:
# Append a constant column of ones (bias term) as the last column of the
# submission data, mirroring what was done for the training data
sub_bias_column = np.ones((reduced_x_sub.shape[0], 1))
x_sub_bias = np.concatenate((reduced_x_sub, sub_bias_column), axis=1)

In [None]:
# Predict the submission labels with the same 0.65 threshold used on the
# validation set; negatives are encoded as -1 here (instead of 0).
predicted_y_sub = predict(x_sub_bias,
                          w,
                          threshold=0.65,
                          negative_label=-1)

# Histogram (30 bins, with KDE curve) of the predict_no_labels output on the
# submission data, titled and shown like the training/validation histograms
sns.histplot(predict_no_labels(x_sub_bias, w), bins=30, kde=True)
plt.title("Distribution of unlabeled predictions on submission set")
plt.show()

In [None]:
# Save the submission predictions, together with their sample ids, to a CSV
# file in the data directory
create_csv_submission(ids=ids, y_pred=predicted_y_sub, path=BASE_PATH + "/submission.csv")
