In [None]:
# Add the parent directory to sys.path so modules located there can be imported from this notebook
import os
import sys
# Make modules that live one directory above the working directory importable.
# The membership check keeps repeated runs of this cell from stacking duplicates.
parent_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)
print(f"Adding parent directory to sys.path: {parent_dir}")
already_on_path = parent_dir in sys.path
if not already_on_path:
    sys.path.append(parent_dir)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Location of the raw data files (relative to the working directory)
data_path = '../data/'

# Seed NumPy's global random generator so repeated runs give the same results
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to imported .py files take effect without a kernel restart
%load_ext autoreload
%autoreload 2

Adding parent directory to sys.path: c:\Users\Łukasz\OneDrive - Politechnika Warszawska\MINI_1D\Semestr 8\Advanced ML\Project_2\AML_Feature_Selection


In [2]:
# Read the raw inputs: space-delimited text files without a header row.
# squeeze() collapses the single-column label frame into a Series.
read_options = {'delimiter': ' ', 'header': None}
X_train = pd.read_csv(data_path + 'x_train.txt', **read_options)
y_train = pd.read_csv(data_path + 'y_train.txt', **read_options).squeeze()
X_test = pd.read_csv(data_path + 'x_test.txt', **read_options)

# Give both feature matrices positional column names: feature_0, feature_1, ...
for feature_frame in (X_train, X_test):
    feature_frame.columns = [f'feature_{col}' for col in range(feature_frame.shape[1])]

# Report dataset sizes and the class balance (in percent)
print("Training data shape:", X_train.shape)
print("Target variable shape:", y_train.shape)
print("Test data shape:", X_test.shape)
print("\nClass distribution in training data:")
print(y_train.value_counts(normalize=True) * 100)

# Hold out 20% of the labelled rows for validation; stratifying on the labels
# keeps the class proportions the same in both parts of the split.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_train,
)
X_train_split, X_val, y_train_split, y_val = split_parts

print("\nAfter splitting:")
print("Train set shape:", X_train_split.shape)
print("Validation set shape:", X_val.shape)

# Preview the training part of the split: first feature rows, then its labels
display(X_train_split.head(), y_train_split)

Training data shape: (5000, 500)
Target variable shape: (5000,)
Test data shape: (5000, 500)

Class distribution in training data:
0
0    51.14
1    48.86
Name: proportion, dtype: float64

After splitting:
Train set shape: (4000, 500)
Validation set shape: (1000, 500)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_490,feature_491,feature_492,feature_493,feature_494,feature_495,feature_496,feature_497,feature_498,feature_499
1796,10.994454,7.223997,18.905844,15.302758,11.335159,11.26861,16.742562,11.595944,11.898338,11.381933,...,8.083485,9.417854,18.105234,2.648145,6.880692,4.95024,11.992575,8.532587,14.966706,21.897683
3243,18.329623,13.311553,24.541799,16.356399,20.784595,11.251973,15.867162,17.039225,10.326017,16.232807,...,13.244366,8.005045,4.562646,5.81464,11.137799,19.105339,4.509459,10.775882,3.348956,7.777499
2491,8.856867,8.177936,16.285177,9.685481,11.551787,4.839914,10.932191,7.374404,8.85378,10.378997,...,9.533385,12.313883,3.841512,6.604631,7.06597,7.968559,13.22461,9.949868,2.687109,11.251411
2106,23.597065,22.040934,43.749375,30.13102,30.13119,16.708132,29.810484,25.257863,24.452841,27.266843,...,8.631784,7.659637,12.655417,5.507021,12.309867,2.966007,4.071214,17.345237,6.848255,11.848773
1644,20.971488,18.626463,33.579955,19.998639,26.105883,9.787864,20.467935,18.329333,14.34748,23.615695,...,12.014632,16.713876,6.763858,13.76416,6.576413,21.474574,6.643161,11.400546,11.648409,12.634681


1796    0
3243    0
2491    0
2106    1
1644    1
       ..
4365    0
611     0
1003    0
3743    0
818     1
Name: 0, Length: 4000, dtype: int64

In [18]:
# Standard scaling (mean=0, std=1).
# The scaler is fitted on the training split only and then applied unchanged to
# the validation and test sets, so no statistics from held-out data are used.
scaler = StandardScaler()
X_train_split_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames that reuse the original column
# names and row index, so every row stays aligned with its label.
X_train_split_scaled_df = pd.DataFrame(X_train_split_scaled, columns=X_train_split.columns, index=X_train_split.index)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Create the output directory for the scaled data.
# exist_ok=True makes this a no-op when the directory already exists, so the
# cell can be re-run safely without a separate check-then-create step.
save_path = '../data/scaled/'
os.makedirs(save_path, exist_ok=True)

# Ensure the data is consistent with the original data
# Before
display(X_train_split.head(3))
# After
display(X_train_split_scaled_df.head(3))
# y_train_split
display(y_train_split.head(3))

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_490,feature_491,feature_492,feature_493,feature_494,feature_495,feature_496,feature_497,feature_498,feature_499
1796,10.994454,7.223997,18.905844,15.302758,11.335159,11.26861,16.742562,11.595944,11.898338,11.381933,...,8.083485,9.417854,18.105234,2.648145,6.880692,4.95024,11.992575,8.532587,14.966706,21.897683
3243,18.329623,13.311553,24.541799,16.356399,20.784595,11.251973,15.867162,17.039225,10.326017,16.232807,...,13.244366,8.005045,4.562646,5.81464,11.137799,19.105339,4.509459,10.775882,3.348956,7.777499
2491,8.856867,8.177936,16.285177,9.685481,11.551787,4.839914,10.932191,7.374404,8.85378,10.378997,...,9.533385,12.313883,3.841512,6.604631,7.06597,7.968559,13.22461,9.949868,2.687109,11.251411


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_490,feature_491,feature_492,feature_493,feature_494,feature_495,feature_496,feature_497,feature_498,feature_499
1796,-1.06176,-1.430582,-1.269855,-0.705374,-1.475228,-0.150262,-0.492853,-0.898017,-0.509204,-0.96176,...,-0.428594,-0.129728,1.770227,-1.641438,-0.694539,-1.101744,0.420421,-0.32732,1.109481,2.629266
3243,0.635321,0.173405,-0.459863,-0.49253,0.326602,-0.154819,-0.6573,0.327381,-0.862406,0.047546,...,0.720985,-0.447933,-1.201419,-0.931852,0.249667,2.052077,-1.233392,0.176505,-1.495024,-0.502881
2491,-1.556317,-1.179233,-1.646494,-1.840114,-1.433921,-1.911075,-1.584349,-1.848375,-1.193125,-1.170438,...,-0.105631,0.522542,-1.359657,-0.754821,-0.653445,-0.429249,0.692709,-0.009011,-1.6434,0.267704


1796    0
3243    0
2491    0
Name: 0, dtype: int64

In [20]:
# Write the scaled feature matrices: space-separated, with row index and header row
scaled_feature_files = (
    ('x_train_scaled.txt', X_train_split_scaled_df),
    ('x_val_scaled.txt', X_val_scaled_df),
    ('x_test_scaled.txt', X_test_scaled_df),
)
for file_name, scaled_frame in scaled_feature_files:
    scaled_frame.to_csv(save_path + file_name, index=True, header=True, sep=' ')

# Write the matching labels: row index kept, but no header row
label_files = (
    ('y_train.txt', y_train_split),
    ('y_val.txt', y_val),
)
for file_name, labels in label_files:
    labels.to_csv(save_path + file_name, index=True, header=False, sep=' ')