In [None]:
import sys
# Add the path to the scripts folder
sys.path.append('util/')

import numpy as np
from util.preprocessing import *
from util.features_util import *
from util.features_info import *

In [None]:
def load_train_data(sub_sample=False, data_dir="data", step=50):
    """Load the training features, training targets and feature names from CSV.

    Reads ``x_train.csv`` and ``y_train.csv`` from ``data_dir``. The first row
    of ``x_train.csv`` is returned as the feature names; the first row of both
    files is skipped when parsing the numeric data.

    Args:
        sub_sample: when true, keep only every ``step``-th row of both arrays
            (useful for quick experiments on a fraction of the data).
        data_dir: directory containing the two CSV files. Defaults to
            ``"data"``, the location that used to be hard-coded.
        step: stride applied when ``sub_sample`` is true. Defaults to 50, the
            value that used to be hard-coded. Ignored when ``sub_sample`` is
            false.

    Returns:
        A tuple ``(x_train, y_train, features_names)`` where ``x_train`` holds
        every column of ``x_train.csv`` parsed as floats, ``y_train`` holds the
        first column of ``y_train.csv``, and ``features_names`` holds the
        header row of ``x_train.csv`` as strings.

    Raises:
        ValueError: if ``sub_sample`` is true and ``step`` is smaller than 1.
    """
    # Local import so the function does not depend on a notebook-level import.
    import os

    if sub_sample and step < 1:
        raise ValueError("step must be a positive integer, got {}".format(step))

    path_x_train = os.path.join(data_dir, "x_train.csv")
    path_y_train = os.path.join(data_dir, "y_train.csv")

    # Header row only: one string per column of x_train.csv.
    features_names = np.genfromtxt(
        path_x_train,
        delimiter=",",
        dtype=str,
        max_rows=1,
    )

    # Numeric payload; unparsable or empty cells become NaN.
    x_train = np.genfromtxt(
        path_x_train,
        delimiter=",",
        skip_header=1,
    )

    # NOTE(review): usecols=0 reads the FIRST column of y_train.csv. If that
    # file carries an id column before the target, this loads the ids instead
    # of the labels -- confirm against the actual file layout.
    y_train = np.genfromtxt(
        path_y_train,
        delimiter=",",
        skip_header=1,
        usecols=0,
    )

    # Optional down-sampling: the same stride is applied to both arrays so the
    # rows stay aligned.
    if sub_sample:
        x_train = x_train[::step]
        y_train = y_train[::step]

    return x_train, y_train, features_names

In [None]:
# Load the full training set together with the header row of x_train.csv.
x, y, features = load_train_data()

# Lookup table: feature name -> position of that name in the header row.
feature_indexes = {name: position for position, name in enumerate(features)}

In [None]:
# Display the feature names read from the header row of x_train.csv.
features

In [None]:
# Columns selected for this analysis. The list is bound to two names on
# purpose: `fs_to_keep` is what this cell passes on, and the (misspelled)
# `excercise_features` is kept as-is because other cells may reference it.
excercise_features = [
    "_TOTINDA",
    "METVL11_",
    "METVL21_",
    "MAXVO2_",
    "ACTIN11_",
    "ACTIN21_",
    "PADUR1_",
    "PADUR2_",
    "PAFREQ1_",
    "PAFREQ2_",
    "_MINAC11",
    "_MINAC21",
    "STRFREQ_",
    "PA1MIN_",
    "PAVIG11_",
    "PAVIG21_",
    "PA1VIGM_",
    "_PACAT1",
    "_PAINDX1",
    "_PA150R2",
    "_PA300R2",
    "_PA30021",
    "_PASTRNG",
]
fs_to_keep = excercise_features

# Restrict the data, the name list and the name -> index mapping to that subset.
x_clean, features_clean, feature_indexes_clean = keep_features(
    x, fs_to_keep, features, feature_indexes
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Wrap the selected columns in a DataFrame so every column carries its
# feature name on the heatmap axes.
df = pd.DataFrame(x_clean, columns=features_clean)

# Pairwise correlation between all selected features.
correlation_matrix = df.corr()

# Draw on an explicit Axes so the figure can be titled and reused.
fig, ax = plt.subplots(figsize=(10, 8))

# Pin the colour scale to the full correlation range [-1, 1]. Without this the
# diverging "coolwarm" palette is stretched over the data's own min/max, so its
# neutral midpoint would not correspond to zero correlation.
sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    square=True,
    ax=ax,
)
ax.set_title("Correlation matrix of the selected features")

# Show the heatmap
plt.show()

In [None]:
def stats(data: np.ndarray, feature_index: Dict[str, int]):
    """Print value counts, mean and median for every listed column of ``data``.

    For each ``(name, column position)`` pair in ``feature_index`` this prints
    a header with the name, the distinct values of the column with their
    occurrence counts, and the column's mean and median. Mean and median are
    computed with ``np.nanmean`` / ``np.nanmedian``, so NaN entries are ignored
    for those two figures (they still appear in the value counts).

    Args:
        data: array indexed as ``data[:, column]``.
        feature_index: mapping from feature name to the position of its column
            in ``data``.

    Returns:
        None. The function only prints.
    """
    # NOTE(review): `Dict` is not imported in the visible cells; presumably it
    # arrives through one of the star imports -- confirm.
    for name, column_idx in feature_index.items():
        # Slice the column once and reuse it for all three statistics.
        column = data[:, column_idx]
        values, counts = np.unique(column, return_counts=True)

        # .tolist() yields plain Python numbers, so the pairs print as
        # (1.0, 3) under every NumPy version instead of
        # (np.float64(1.0), np.int64(3)) with NumPy >= 2.
        value_counts = list(zip(values.tolist(), counts.tolist()))

        print("----- {} -----".format(name))
        print("\t(value, counts): {}".format(value_counts))
        print("\tmean: {:.2f}".format(np.nanmean(column)))
        print("\tmedian: {:.2f}".format(np.nanmedian(column)))

In [None]:
# Demonstration of the cleaning pipeline on two features only:
# "_TOTINDA" and "METVL11_".
fs_test = ["_TOTINDA", "METVL11_"]
x_test, fs_test, feature_indexes_test = keep_features(x, fs_test, features, feature_indexes)

# Baseline statistics before any transformation.
print("-> Original data")
stats(x_test, feature_indexes_test)

# Each stage takes the current array plus the feature list and index mapping,
# returns the transformed array, and is followed by a statistics dump so the
# effect of the stage can be inspected. Order matters: NaN aliases are
# resolved first, then values are mapped, then NaNs are handled.
for stage_label, stage_fn in (
    ("-> NaN aliases resolved", align_nans),
    ("-> Mapped values", map_values),
    ("-> Removed NaNs", remove_nans),
):
    x_test = stage_fn(x_test, fs_test, feature_indexes_test)
    print(stage_label)
    stats(x_test, feature_indexes_test)
