# Preprocessing the Dataset

In [None]:
%load_ext autoreload
%autoreload 2

## Imports

Importing all required packages

In [None]:
import pandas as pd
import sys
import pickle

sys.path.append('./')
import utils

from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import NearMiss

In [None]:
# Random seed passed to train_test_split below so that the train/test split is reproducible
seed = 42

## Loading Rogue Wave Data

Note:

- The data are stored in a csv file of the size 1048574 x 29.
- This is a modified version of ina's input dataset: the encoded values as well as the original wave dir and obj weather data are now in the same file.

In [None]:
# Read the wave matrix from CSV; the first column of the file is used as the DataFrame index
data_rogue_waves = pd.read_csv("ML_Matrix_full_encoded_st.csv", index_col=0)

In [None]:
# Report the dataset size: number of rows (waves) and number of columns minus one
# (presumably the target column is not counted as a variable -- TODO confirm)
print(f"The dataset has {data_rogue_waves.shape[1] - 1} variables that describe {data_rogue_waves.shape[0]} waves.")

In [None]:
# Preview the first rows of the loaded dataset
data_rogue_waves.head()

### Preprocessing

Some of the target values are close to 3, which is not realistic, hence, they are excluded from the dataset.

In [None]:
# Upper bound on the target (AI_10min); rows at or above it are discarded
thr_target = 2.7

# Keep only the rows whose target lies below the bound. Rows with a missing
# AI_10min are dropped as well, because a comparison with NaN evaluates to False.
is_below_thr_target = data_rogue_waves['AI_10min'] < thr_target
data_rogue_waves = data_rogue_waves[is_below_thr_target]

Select key features

In [None]:
# Columns kept for the analysis; the last entry, AI_10min, is the continuous target
selected_features = [
    'H_s', 'lambda_40', 'lambda_30', 'L_deep', 's', 'mu', 'kh', 'T_p', 'eps',
    'nu', 'Q_p', 'swell', 'v_wind', 'T_air', 'Delta_T', 'p', 'Delta_p_1h',
    'AI_10min',
]

# Restrict the frame to these columns, in the order listed above
data_rogue_waves = data_rogue_waves[selected_features]

In [None]:
# Preview the first rows after filtering the target and selecting the key features
data_rogue_waves.head()

In [None]:
# Plot the distributions of the remaining columns with the project helper
# (presumably one panel per column, arranged in 5 columns -- TODO confirm against utils.plot_distributions)
utils.plot_distributions(dataset=data_rogue_waves, ncols=5)

For the classification approach, we need to turn the target values into classes.  
We will have a look at three different target transformations:

- case 1: class 0: target < 2.0 and class 1: target ≥ 2.0
- case 2: class 0: target < 1.5 and class 1: target ≥ 2.0 (waves with 1.5 ≤ target < 2.0 are excluded)
- case 3: class 0: target < 1.5, class 1: 1.5 ≤ target < 2.0 and class 2: target ≥ 2.0

**Case 1:**

In [None]:
# Threshold on AI_10min: class 0 lies below it, class 1 at or above it
threshold = 2.0

# Binarize the target on a copy so the frame with the continuous target stays untouched
data_rogue_waves_case1 = data_rogue_waves.copy()

# Vectorized comparison instead of a row-wise Python lambda (much faster on a
# dataset of this size). Missing values cannot occur here: the earlier
# `AI_10min < thr_target` filter already removed rows with a NaN target.
data_rogue_waves_case1['target'] = (data_rogue_waves_case1['AI_10min'] >= threshold).astype(int)

print('Dataset target distribution:')
print(Counter(data_rogue_waves_case1['target']))

**Case 2:**

In [None]:
# Thresholds on AI_10min for the binary case with a gap:
# class 0 lies below threshold_1, class 1 at or above threshold_2,
# and the waves in between are excluded from this case.
threshold_1 = 1.5
threshold_2 = 2.0

# Drop the intermediate band first and take an explicit copy, so that adding the
# target column below does not write into a slice of another frame (which would
# raise pandas' SettingWithCopyWarning).
ai_case2 = data_rogue_waves['AI_10min']
data_rogue_waves_case2 = data_rogue_waves.loc[(ai_case2 < threshold_1) | (ai_case2 >= threshold_2)].copy()

# Binarize the target with a vectorized comparison: every remaining row is either
# below threshold_1 (class 0) or at/above threshold_2 (class 1).
data_rogue_waves_case2['target'] = (data_rogue_waves_case2['AI_10min'] >= threshold_2).astype(int)

print('Dataset target distribution:')
print(Counter(data_rogue_waves_case2['target']))

**Case 3:**

In [None]:
# Thresholds on AI_10min for three classes:
# class 0 below threshold_1, class 1 from threshold_1 up to (not including)
# threshold_2, class 2 at or above threshold_2.
threshold_1 = 1.5
threshold_2 = 2.0

# Discretize the target on a copy so the frame with the continuous target stays untouched
data_rogue_waves_case3 = data_rogue_waves.copy()
ai_case3 = data_rogue_waves_case3['AI_10min']

# Vectorized instead of a row-wise Python lambda: each threshold that is reached
# raises the class by one, giving 0, 1 or 2 (requires threshold_1 <= threshold_2).
data_rogue_waves_case3['target'] = (ai_case3 >= threshold_1).astype(int) + (ai_case3 >= threshold_2).astype(int)

print('Dataset target distribution:')
print(Counter(data_rogue_waves_case3['target']))

In [None]:
# Name of the class-label column created for each of the three cases
col_target = "target"

# Every other column is used as a feature column. Note that this list still
# contains the continuous 'AI_10min' column the class labels were derived from.
col_features = data_rogue_waves_case1.columns.drop([col_target]).tolist()

### Setup train and test data

Split the data into train and test datasets with a stratified split to ensure that both sets contain enough instances of the minority class. Then save the datasets to pickle files.

To tackle the high class imbalance, we undersample the larger class using the NearMiss undersampler. NearMiss is an undersampling algorithm that balances an imbalanced dataset by reducing the larger class to the size of the smallest class. It does not eliminate samples at random: the samples of the larger class are selected based on their distance to the samples of the smaller class. With version 1 (used here), the larger-class samples with the smallest average distance to their `n_neighbors` closest smaller-class samples are kept, and the remaining larger-class samples are eliminated.

For an explanation of the `version` and `n_neighbors` arguments, see https://hersanyagci.medium.com/under-sampling-methods-for-imbalanced-data-clustercentroids-randomundersampler-nearmiss-eae0eadcc145

In [None]:
# The three labelled datasets, processed in order as case 1, 2 and 3
case_datasets = [data_rogue_waves_case1, data_rogue_waves_case2, data_rogue_waves_case3]

for case_id, case_data in enumerate(case_datasets, start=1):

    # Label the output so the three iterations can be told apart
    print(f'Case {case_id}:')

    # We will first separate the target and feature columns.
    X = case_data[col_features]
    y = case_data[col_target]

    # Stratified 80/20 split into train and test data, reproducible through `seed`
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.80, random_state=seed)

    # Renumber the rows from 0; reassigning instead of using inplace=True avoids
    # modifying the objects returned by the split in place
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    print('Training dataset target distribution:')
    print(Counter(y_train))

    print('Test dataset target distribution:')
    print(Counter(y_test))

    # Save the full (not undersampled) split
    # NOTE(review): X_train / X_test still contain the continuous 'AI_10min' column
    # at this point, i.e. the variable the class labels were derived from. Consumers
    # of this file must drop it before training a classifier -- confirm this is intended.
    split_full = [X_train, X_test, y_train, y_test]

    with open(f'../data/data_case{case_id}.pickle', 'wb') as handle:
        pickle.dump(split_full, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Undersample the larger class(es) of the training data only; the test data keep
    # their original class distribution.
    # NOTE(review): 'AI_10min' is still among the columns here, so it takes part in
    # the NearMiss distance computation -- confirm this is intended.
    nm = NearMiss(version=1, sampling_strategy='auto', n_neighbors=5)
    X_train, y_train = nm.fit_resample(X_train, y_train)

    print('Resampled dataset shape:')
    print(Counter(y_train))

    # After undersampling, drop the continuous target variable from the dataset as we only use the binarized version for classification.
    X_train = X_train.drop(columns=['AI_10min'])
    X_test = X_test.drop(columns=['AI_10min'])

    split_undersampled = [X_train, X_test, y_train, y_test]

    # Save the undersampled split
    with open(f'../data/data_case{case_id}_undersampled.pickle', 'wb') as handle:
        pickle.dump(split_undersampled, handle, protocol=pickle.HIGHEST_PROTOCOL)