Data processing

In [5]:
#Import libraries
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

from src.utils.split_data import train_val_split, split_features

In [6]:
#Load the data
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so this file must come
# from a trusted source.
# The path is relative and therefore resolved against the notebook's working directory.
DATASET_PATH = "../../data/raw/nn_challenge_train.pkl"
df = pd.read_pickle(DATASET_PATH)

In [7]:
# Partition the rows by label: target == 0 on one side, every other target value on the other.
is_target_zero = df["target"] == 0
df_t_0 = df[is_target_zero]
df_t_1 = df[~is_target_zero]

In [8]:
# Print the number of rows in each class (target == 0 first, then the rest).
for class_frame in (df_t_0, df_t_1):
    print(len(class_frame))

66466
405956


In [9]:
# Undersample the target != 0 rows down to the size of the target == 0 group.
# sample() draws without replacement by default, so it raises if df_t_1 has fewer rows than df_t_0
# (the counts printed in the previous cell are 66466 vs 405956). random_state=42 makes the draw repeatable.
df_t_1_sub = df_t_1.sample(n=len(df_t_0), random_state=42)

In [10]:
# Stack both classes into one frame. Rows keep their original index labels and stay grouped by
# class (all target == 0 rows first); nothing is shuffled at this point.
balanced_df = pd.concat([df_t_0, df_t_1_sub])

In [11]:
# Display the class-balanced frame (the notebook shows a truncated view of it).
balanced_df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_210,feature_211,feature_212,feature_213,feature_214,feature_215,feature_216,feature_217,feature_218,target
22,0.661364,1.0,0.0,0.000120,0.310606,0.009740,0.180457,0.752510,0.001050,0.500000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0,0
23,0.672847,0.5,0.0,0.000131,0.327720,0.035065,0.685714,0.777475,0.003627,0.000000,...,0.054111,0.097624,0.005062,0.86365,0.033398,0.156841,0.025219,0.002243,0.0,0
36,0.728351,0.5,0.0,0.000068,0.245778,0.001299,0.571429,0.712884,0.000604,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0,0
67,0.575738,0.5,0.0,0.000000,0.259794,0.011688,0.228571,0.624854,0.000986,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0
72,0.505637,0.0,0.0,,0.332217,0.001948,0.380914,0.847067,0.001845,0.666667,...,0.055426,0.058180,0.001687,0.75000,0.037872,0.134145,0.020597,0.003380,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397076,0.532182,1.0,0.0,0.000031,0.203233,0.001299,0.000000,0.757268,0.000000,,...,0.196904,0.181991,0.000562,0.50000,0.087432,0.261677,0.181991,0.009904,1.0,1
234658,0.607332,1.0,0.0,0.000106,0.404031,0.023377,0.285714,0.873888,0.002322,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0,1
431068,0.677294,0.0,0.0,0.000000,0.200944,0.001948,0.190514,0.740053,0.001559,0.500000,...,0.035081,0.075729,0.001687,0.83335,0.031148,0.103993,0.073463,0.001517,0.0,1
103629,0.607917,1.0,0.0,0.001009,0.278486,0.012987,0.380914,0.754901,0.001145,1.000000,...,0.217268,0.035608,0.003937,0.77275,0.057811,0.319861,0.020597,0.004438,1.0,1


In [12]:
# Count the rows per target value in the balanced frame.
balanced_df["target"].value_counts()

target
0    66466
1    66466
Name: count, dtype: int64

In [13]:
# Remove four feature columns from the balanced frame.
# NOTE(review): the reason for excluding these columns is not recorded in this notebook — confirm and document it.
# Because the result is assigned back to the same name, running this cell a second time raises
# KeyError (the columns are already gone).
columns_to_drop = ["feature_15", "feature_19", "feature_39", "feature_148"]
balanced_df = balanced_df.drop(columns=columns_to_drop)

In [14]:
# List the dtype of every remaining column.
balanced_df.dtypes

feature_1      float64
feature_2      float32
feature_3      float32
feature_4      float32
feature_5      float64
                ...   
feature_215    float32
feature_216    float32
feature_217    float32
feature_218    float32
target           int32
Length: 215, dtype: object

In [15]:
# Separate the label column (copied) from the feature columns.
y = balanced_df["target"].copy()
X = balanced_df.drop(columns="target")

In [16]:
# Wrap the label Series in a single-column DataFrame that keeps the same row index.
# It is concatenated column-wise with the processed feature frames further down.
y = pd.DataFrame(y, columns=["target"])

In [17]:
# Features that are imputed and encoded as categorical; every other feature is treated as numeric.
category_columns = [
    "feature_17",
    "feature_87",
    "feature_118",
    "feature_119",
    "feature_139",
    "feature_144",
    "feature_147",
    "feature_158",
    "feature_159",
    "feature_195",
]

X_numeric = X.drop(columns=category_columns)
X_category = X[category_columns].copy()

In [18]:
# List the dtypes of the numeric feature subset.
X_numeric.dtypes

feature_1      float64
feature_2      float32
feature_3      float32
feature_4      float32
feature_5      float64
                ...   
feature_214    float32
feature_215    float32
feature_216    float32
feature_217    float32
feature_218    float32
Length: 204, dtype: object

In [19]:
# List the dtypes of the categorical feature subset.
X_category.dtypes

feature_17     category
feature_87     category
feature_118    category
feature_119    category
feature_139    category
feature_144    category
feature_147    category
feature_158    category
feature_159    category
feature_195    category
dtype: object

In [20]:
# Fill missing numeric values with the median of each column.
# NOTE(review): the imputer is fitted on the whole balanced frame, i.e. before the train/validation
# split made further down, so the held-out rows also contribute to the medians.
# The fitted imputer is reused later for the external validation sample.
numeric_imputer = SimpleImputer(strategy='median')
X_numeric_padded = numeric_imputer.fit_transform(X_numeric)

In [21]:
# Fill missing categorical values with the most frequent value of each column.
# NOTE(review): like the numeric imputer, this is fitted on the whole balanced frame, before the
# train/validation split made further down.
# The fitted imputer is reused later for the external validation sample.
category_imputer = SimpleImputer(strategy='most_frequent')
X_category_padded = category_imputer.fit_transform(X_category)

In [22]:
# Wrap the imputed values back into DataFrames, restoring the column names and row index of the
# frames they were computed from (the same row index that y carries).
X_numeric_df = pd.DataFrame(
    X_numeric_padded,
    index=X_numeric.index,
    columns=X_numeric.columns,
)
X_category_df = pd.DataFrame(
    X_category_padded,
    index=X_category.index,
    columns=X_category.columns,
)

In [23]:
# Integer-encode every categorical column with its own LabelEncoder.
# The fitted encoders are kept in `encoders` (keyed by column name) so the exact same mapping can be
# applied to the external validation sample later on.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; OrdinalEncoder is the
# feature-oriented counterpart. Also, transform() raises ValueError on values not seen during fit.
encoders = {column: LabelEncoder() for column in X_category_df.columns}

X_category_encoded = pd.DataFrame(
    {
        column: column_encoder.fit_transform(X_category_df[column])
        for column, column_encoder in encoders.items()
    },
    index=y.index,
)

In [24]:
# Count the missing values per encoded categorical column.
X_category_encoded.isna().sum()

feature_17     0
feature_87     0
feature_118    0
feature_119    0
feature_139    0
feature_144    0
feature_147    0
feature_158    0
feature_159    0
feature_195    0
dtype: int64

In [25]:
# Reassemble the feature matrix column-wise: imputed numeric columns first, then the
# integer-encoded categorical columns. Both frames share the same row index.
X_df_processed = pd.concat([X_numeric_df, X_category_encoded], axis=1)

In [26]:
# Count the missing values per column of the processed feature matrix.
X_df_processed.isna().sum()

feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
              ..
feature_144    0
feature_147    0
feature_158    0
feature_159    0
feature_195    0
Length: 214, dtype: int64

In [27]:
# Display the processed (imputed and encoded, not yet scaled) feature matrix.
X_df_processed

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_17,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195
22,0.661364,1.0,0.0,0.000120,0.310606,0.009740,0.180457,0.752510,0.001050,0.500000,...,7,2,1,0,0,3,3,5,11,4
23,0.672847,0.5,0.0,0.000131,0.327720,0.035065,0.685714,0.777475,0.003627,0.000000,...,4,0,0,0,0,1,1,6,6,2
36,0.728351,0.5,0.0,0.000068,0.245778,0.001299,0.571429,0.712884,0.000604,0.000000,...,6,1,1,1,4,3,4,6,11,4
67,0.575738,0.5,0.0,0.000000,0.259794,0.011688,0.228571,0.624854,0.000986,0.000000,...,3,1,1,1,1,2,5,0,5,5
72,0.505637,0.0,0.0,0.000049,0.332217,0.001948,0.380914,0.847067,0.001845,0.666667,...,3,1,1,5,7,3,2,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397076,0.532182,1.0,0.0,0.000031,0.203233,0.001299,0.000000,0.757268,0.000000,0.500000,...,3,0,0,2,1,1,5,3,6,2
234658,0.607332,1.0,0.0,0.000106,0.404031,0.023377,0.285714,0.873888,0.002322,0.000000,...,0,2,0,5,0,1,1,5,6,4
431068,0.677294,0.0,0.0,0.000000,0.200944,0.001948,0.190514,0.740053,0.001559,0.500000,...,7,0,0,3,7,3,4,0,13,2
103629,0.607917,1.0,0.0,0.001009,0.278486,0.012987,0.380914,0.754901,0.001145,1.000000,...,3,0,2,4,7,3,0,1,7,1


In [28]:
# Re-attach the label column to the processed features; this is the unscaled dataset.
balanced_processed_df = pd.concat([X_df_processed, y], axis=1)

In [29]:
# Display the unscaled processed dataset (features plus target).
balanced_processed_df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195,target
22,0.661364,1.0,0.0,0.000120,0.310606,0.009740,0.180457,0.752510,0.001050,0.500000,...,2,1,0,0,3,3,5,11,4,0
23,0.672847,0.5,0.0,0.000131,0.327720,0.035065,0.685714,0.777475,0.003627,0.000000,...,0,0,0,0,1,1,6,6,2,0
36,0.728351,0.5,0.0,0.000068,0.245778,0.001299,0.571429,0.712884,0.000604,0.000000,...,1,1,1,4,3,4,6,11,4,0
67,0.575738,0.5,0.0,0.000000,0.259794,0.011688,0.228571,0.624854,0.000986,0.000000,...,1,1,1,1,2,5,0,5,5,0
72,0.505637,0.0,0.0,0.000049,0.332217,0.001948,0.380914,0.847067,0.001845,0.666667,...,1,1,5,7,3,2,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397076,0.532182,1.0,0.0,0.000031,0.203233,0.001299,0.000000,0.757268,0.000000,0.500000,...,0,0,2,1,1,5,3,6,2,1
234658,0.607332,1.0,0.0,0.000106,0.404031,0.023377,0.285714,0.873888,0.002322,0.000000,...,2,0,5,0,1,1,5,6,4,1
431068,0.677294,0.0,0.0,0.000000,0.200944,0.001948,0.190514,0.740053,0.001559,0.500000,...,0,0,3,7,3,4,0,13,2,1
103629,0.607917,1.0,0.0,0.001009,0.278486,0.012987,0.380914,0.754901,0.001145,1.000000,...,0,2,4,7,3,0,1,7,1,1


In [30]:
# Count the rows per target value after processing.
balanced_processed_df['target'].value_counts()

target
0    66466
1    66466
Name: count, dtype: int64

In [31]:
# List the distinct label values present in the processed dataset.
balanced_processed_df['target'].unique()

array([0, 1], dtype=int32)

In [32]:
# Per column: does it still contain at least one missing value?
is_null = balanced_processed_df.isna().any()
# Keep only the flagged columns; an empty Series means no column has missing values left.
is_null[is_null]

Series([], dtype: bool)

In [33]:
# Feature matrix of the unscaled dataset (everything except the label column).
X_processed = balanced_processed_df.drop(columns="target")

In [34]:
# Scale every feature column (the integer-encoded categorical ones included) with RobustScaler.
# NOTE(review): the scaler is fitted on the whole balanced frame, i.e. before the train/validation
# split made further down. The fitted scaler is reused later for the external validation sample.
scaler = RobustScaler()
# With the scaler's default copy=True the input frame is not modified, so no explicit copy is needed.
X_balanced_processed = scaler.fit_transform(X_processed)

# Restore the column names and row index of the frame that was scaled.
X_balanced_processed_scaled_df = pd.DataFrame(
    X_balanced_processed,
    index=X_processed.index,
    columns=X_processed.columns,
)

In [35]:
# Re-attach the label column to the scaled features; this is the scaled dataset.
balanced_processed_scaled_df = pd.concat([X_balanced_processed_scaled_df, y], axis=1)

In [36]:
# Count the missing values per column of the scaled dataset.
balanced_processed_scaled_df.isna().sum()

feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
              ..
feature_147    0
feature_158    0
feature_159    0
feature_195    0
target         0
Length: 215, dtype: int64

In [37]:
# Split the scaled dataset into a training set and a validation set.
# NOTE(review): train_val_split is a project helper (src.utils.split_data); its split ratio,
# shuffling and seeding are not visible in this notebook — confirm against its implementation.
train_set, val_set = train_val_split(balanced_processed_scaled_df)

In [59]:
# Split the unscaled dataset into a training set and a validation set.
# NOTE(review): this is a second, independent call to train_val_split; whether it assigns the same
# rows to each subset as the scaled split depends on the helper being deterministic — confirm.
train_set_original, val_set_original = train_val_split(balanced_processed_df)

In [60]:
# Print the row count of each split: scaled train/validation first, then unscaled train/validation.
for subset in (train_set, val_set, train_set_original, val_set_original):
    print(len(subset))

112992
19940
112992
19940


In [39]:
# Split the scaled subsets into inputs and labels.
# NOTE(review): split_features is a project helper (src.utils.split_data); presumably it returns
# (features, labels) using the named column as the label — confirm against its implementation.
X_train, y_train = split_features(train_set, "target")
X_val, y_val = split_features(val_set, "target")

In [61]:
# Split the unscaled subsets into inputs and labels.
# NOTE(review): split_features is a project helper; presumably it returns (features, labels) using
# the named column as the label — confirm against its implementation.
X_train_original, y_train_original = split_features(train_set_original, "target")
X_val_original, y_val_original = split_features(val_set_original, "target")

In [64]:
# Display the scaled training inputs.
X_train

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_17,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195
22026,-0.115445,0.0,0.0,-0.373402,-0.243591,0.916667,1.500000,-0.132275,3.540540,0.30,...,0.0,0.0,0.0,0.0,-0.166667,0.0,0.0,0.75,1.0,0.0
187587,0.002386,-1.0,0.0,61.453959,-0.227537,0.500000,-0.119423,0.174540,-0.945946,0.00,...,-3.0,0.0,-1.0,0.0,0.000000,0.0,0.0,0.50,-0.2,3.0
386192,0.071340,0.0,0.0,-0.373402,-0.232410,-0.041667,0.145669,-0.345741,-0.162162,-1.50,...,4.0,0.0,-1.0,1.0,0.333333,-1.0,-1.0,-0.25,-0.4,-2.0
353343,-0.448292,0.0,0.0,-0.373402,-0.487079,0.750000,-1.312336,-0.758409,1.108108,-0.50,...,0.0,1.0,-1.0,1.0,0.333333,0.0,0.5,0.75,-1.2,0.0
186313,-1.052741,0.0,0.0,0.000000,-0.032318,6.208333,-0.843832,0.161558,-0.432432,0.00,...,0.0,1.0,0.0,0.0,-0.833333,0.0,0.0,-0.50,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153537,-1.348937,1.0,0.0,-0.373402,1.253944,1.583333,-1.312336,0.929271,1.324324,0.00,...,0.0,1.0,-1.0,1.0,0.333333,-1.0,-1.0,0.50,-0.4,0.0
270676,-0.642716,0.0,0.0,-0.019182,-0.371776,1.625000,0.202100,0.680236,0.702703,-0.75,...,0.0,1.0,1.0,1.0,-0.666667,0.0,-1.0,-0.75,0.8,-1.0
100803,0.091785,0.0,0.0,0.014066,-0.308113,-0.083333,-1.312336,-1.188681,-0.243243,-0.50,...,3.0,1.0,-1.0,0.0,0.333333,-1.0,0.5,0.00,0.0,-2.0
63977,-0.984365,-1.0,0.0,2.109974,-0.819176,4.000000,0.000000,0.181636,2.081081,-1.50,...,-3.0,1.0,0.0,0.0,0.166667,0.0,0.5,-0.25,1.2,-1.0


In [40]:
# Export the unscaled and the scaled datasets as CSV files.
# SAVE_PATH is relative (resolved against the notebook's working directory) and is reused by the
# npz export cells further down.
SAVE_PATH = "../../data/processed/"
# to_csv fails when the target directory does not exist, so create it on first use;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(SAVE_PATH, exist_ok=True)
# index=False: the original row labels are not written to the files.
balanced_processed_df.to_csv(SAVE_PATH+"processed_data.csv", index=False)
balanced_processed_scaled_df.to_csv(SAVE_PATH+"processed_scaled_data.csv", index=False)

In [62]:
# Save every train/validation split in npz format: inputs under the key "inputs", labels under
# "labels". np.savez appends the .npz extension to the file names itself.
# The scaled splits come first, the unscaled ("_original") splits second.
npz_exports = {
    'train_data': (X_train, y_train),
    'validation_data': (X_val, y_val),
    'train_data_original': (X_train_original, y_train_original),
    'validation_data_original': (X_val_original, y_val_original),
}

for file_stem, (split_inputs, split_labels) in npz_exports.items():
    np.savez(SAVE_PATH + file_stem, inputs=split_inputs, labels=split_labels)

Validation data

In [42]:
# Load the validation sample (its preview in the next cell shows feature columns only, no target).
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so this file must come
# from a trusted source.
val_df = pd.read_pickle("../../data/raw/validation_sample_1.pkl")

In [43]:
# Display the raw validation sample.
val_df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_209,feature_210,feature_211,feature_212,feature_213,feature_214,feature_215,feature_216,feature_217,feature_218
0,0.616277,0.5,0.0,0.000000,0.099574,0.009777,0.250000,0.777850,0.004666,0.666667,...,0.55000,0.084877,0.084301,0.009744,0.85000,0.053579,0.205928,0.084301,0.014128,0.0
1,0.239266,0.0,0.0,0.000189,0.391774,0.006983,0.208375,0.561785,0.001886,0.500000,...,0.50000,0.027562,0.034084,0.003654,1.00000,0.035562,0.118498,0.028083,0.004438,1.0
2,0.486676,0.5,0.0,0.000000,0.427581,0.000000,0.277750,0.582175,0.006553,0.500000,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0
3,0.455066,0.5,0.0,,0.363796,0.001397,0.096125,0.657557,0.002383,0.666667,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.490666,0.0,0.0,0.000000,0.253376,0.009777,0.500000,0.632926,0.000000,,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.602765,0.5,0.0,0.003109,0.268040,0.008380,0.367625,0.822913,0.004269,1.000000,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
496,0.213441,0.5,0.0,0.000946,0.213526,0.009777,0.113625,0.692089,0.004170,0.500000,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
497,0.409559,0.5,0.0,0.000327,0.275425,0.000000,0.000000,0.835031,0.004865,0.000000,...,0.50000,0.036565,0.222277,0.006090,1.00000,0.039430,0.128271,0.191911,0.004304,0.0
498,0.462198,1.0,0.0,,0.261926,0.000000,0.500000,0.708778,0.002482,0.000000,...,0.66665,0.199577,0.240511,0.002436,0.58335,0.111624,0.381345,0.049855,0.024964,1.0


In [44]:
# Drop the same four feature columns that were removed from the training data.
# Because the result is assigned back to the same name, running this cell a second time raises
# KeyError (the columns are already gone).
val_columns_to_drop = ["feature_15", "feature_19", "feature_39", "feature_148"]
val_df = val_df.drop(columns=val_columns_to_drop)

In [45]:
# Same categorical column list as used for the training data, redefined here for the validation sample.
category_columns = [
    "feature_17",
    "feature_87",
    "feature_118",
    "feature_119",
    "feature_139",
    "feature_144",
    "feature_147",
    "feature_158",
    "feature_159",
    "feature_195",
]

X_numeric_val = val_df.drop(columns=category_columns)
X_category_val = val_df[category_columns].copy()

In [46]:
# Fill missing numeric values with the medians learned from the training data
# (transform only; the imputer is not refitted on the validation sample).
X_numeric_padded_val = numeric_imputer.transform(X_numeric_val)

In [47]:
# Fill missing categorical values with the most frequent values learned from the training data
# (transform only; the imputer is not refitted on the validation sample).
X_category_padded_val = category_imputer.transform(X_category_val)

In [48]:
# Wrap the imputed values back into DataFrames with the original column names.
# No index is passed, so both frames get a default 0..n-1 RangeIndex instead of val_df's index;
# the frames built from them below rely on that same default index when concatenated.
X_numeric_df_val = pd.DataFrame(X_numeric_padded_val, columns=X_numeric_val.columns)
X_category_df_val = pd.DataFrame(X_category_padded_val, columns=X_category_val.columns)

In [49]:
# Integer-encode the categorical validation columns with the LabelEncoders that were fitted on the
# training data (looked up by column name in `encoders`), so both datasets share one mapping.
# transform() raises ValueError if a column holds a value the encoder did not see during fit.
# The resulting frame gets a default 0..n-1 RangeIndex.
X_val_category_encoded = pd.DataFrame(
    {
        column: encoders[column].transform(X_category_df_val[column])
        for column in X_category_df_val.columns
    }
)


In [50]:
# Display the integer-encoded categorical validation columns.
X_val_category_encoded

Unnamed: 0,feature_17,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195
0,3,0,1,1,7,2,4,5,13,2
1,6,0,1,5,7,3,5,3,7,1
2,3,0,2,3,4,3,4,5,6,4
3,0,1,0,3,2,2,1,6,14,5
4,0,2,0,1,7,1,1,6,6,5
...,...,...,...,...,...,...,...,...,...,...
495,0,2,0,2,0,1,2,5,11,5
496,0,1,2,1,7,1,3,2,7,5
497,7,2,0,2,1,1,4,5,6,2
498,3,2,0,1,7,1,3,6,6,2


In [51]:
# Count the missing values per encoded categorical validation column.
X_val_category_encoded.isna().sum()

feature_17     0
feature_87     0
feature_118    0
feature_119    0
feature_139    0
feature_144    0
feature_147    0
feature_158    0
feature_159    0
feature_195    0
dtype: int64

In [52]:
# Reassemble the validation feature matrix column-wise: imputed numeric columns first, then the
# integer-encoded categorical columns (the same ordering used for the training features).
X_df_processed_val = pd.concat([X_numeric_df_val, X_val_category_encoded], axis=1)

In [53]:
# Display the processed (imputed and encoded, not yet scaled) validation features.
X_df_processed_val

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_17,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195
0,0.616277,0.5,0.0,0.000000,0.099574,0.009777,0.250000,0.777850,0.004666,0.666667,...,3,0,1,1,7,2,4,5,13,2
1,0.239266,0.0,0.0,0.000189,0.391774,0.006983,0.208375,0.561785,0.001886,0.500000,...,6,0,1,5,7,3,5,3,7,1
2,0.486676,0.5,0.0,0.000000,0.427581,0.000000,0.277750,0.582175,0.006553,0.500000,...,3,0,2,3,4,3,4,5,6,4
3,0.455066,0.5,0.0,0.000049,0.363796,0.001397,0.096125,0.657557,0.002383,0.666667,...,0,1,0,3,2,2,1,6,14,5
4,0.490666,0.0,0.0,0.000000,0.253376,0.009777,0.500000,0.632926,0.000000,0.500000,...,0,2,0,1,7,1,1,6,6,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.602765,0.5,0.0,0.003109,0.268040,0.008380,0.367625,0.822913,0.004269,1.000000,...,0,2,0,2,0,1,2,5,11,5
496,0.213441,0.5,0.0,0.000946,0.213526,0.009777,0.113625,0.692089,0.004170,0.500000,...,0,1,2,1,7,1,3,2,7,5
497,0.409559,0.5,0.0,0.000327,0.275425,0.000000,0.000000,0.835031,0.004865,0.000000,...,7,2,0,2,1,1,4,5,6,2
498,0.462198,1.0,0.0,0.000049,0.261926,0.000000,0.500000,0.708778,0.002482,0.000000,...,3,2,0,1,7,1,3,6,6,2


In [54]:
# Count the missing values per column of the processed validation features.
X_df_processed_val.isna().sum()

feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
              ..
feature_144    0
feature_147    0
feature_158    0
feature_159    0
feature_195    0
Length: 214, dtype: int64

In [55]:
# Scale the validation features with the scaler fitted on the training data (transform only).
# With the scaler's default copy=True the input frame is not modified, so no explicit copy is needed.
X_balanced_processed_val = scaler.transform(X_df_processed_val)

# Restore the column names; the frame gets a default 0..n-1 RangeIndex.
X_balanced_processed_scaled_df_val = pd.DataFrame(
    X_balanced_processed_val,
    columns=X_df_processed_val.columns,
)

In [56]:
# Count the missing values per column of the scaled validation features.
X_balanced_processed_scaled_df_val.isna().sum()

feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
              ..
feature_144    0
feature_147    0
feature_158    0
feature_159    0
feature_195    0
Length: 214, dtype: int64

In [57]:
# Display the scaled validation features.
X_balanced_processed_scaled_df_val

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_17,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195
0,0.663893,0.0,0.0,-0.373402,-1.275314,0.210661,0.123031,0.204859,3.018225,0.5,...,0.0,-1.0,0.0,0.00,0.333333,-0.5,0.5,0.50,1.0,0.0
1,-1.915673,-1.0,0.0,1.060806,0.652474,0.031425,-0.115957,-1.245085,0.656591,0.0,...,3.0,-1.0,0.0,1.00,0.333333,0.0,1.0,0.00,-0.2,-1.0
2,-0.222856,0.0,0.0,-0.373402,0.888714,-0.416667,0.282357,-1.108257,4.620763,0.0,...,0.0,-1.0,1.0,0.50,-0.166667,0.0,0.5,0.50,-0.4,2.0
3,-0.439138,0.0,0.0,0.000000,0.467888,-0.327048,-0.760437,-0.602389,1.078312,0.5,...,-3.0,0.0,-1.0,0.50,-0.500000,-0.5,-1.0,0.75,1.2,3.0
4,-0.195553,-1.0,0.0,-0.373402,-0.260605,0.210661,1.558399,-0.767678,-0.945946,0.0,...,-3.0,1.0,-1.0,0.00,0.333333,-1.0,-1.0,0.75,-0.4,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.571445,0.0,0.0,23.211343,-0.163859,0.121043,0.798372,0.507263,2.680849,1.5,...,-3.0,1.0,-1.0,0.25,-0.833333,-1.0,-0.5,0.50,0.6,3.0
496,-2.092372,0.0,0.0,6.804879,-0.523517,0.210661,-0.659962,-0.370658,2.596505,0.0,...,-3.0,0.0,1.0,0.00,0.333333,-1.0,0.0,-0.25,-0.2,3.0
497,-0.750503,0.0,0.0,2.103866,-0.115135,-0.416667,-1.312336,0.588580,3.186914,-1.5,...,4.0,1.0,-1.0,0.25,-0.666667,-1.0,0.5,0.50,-0.4,0.0
498,-0.390338,1.0,0.0,0.000000,-0.204196,-0.416667,1.558399,-0.258662,1.162656,-1.5,...,0.0,1.0,-1.0,0.00,0.333333,-1.0,0.0,0.75,-0.4,0.0


In [58]:
# Save the processed, scaled validation inputs in npz format under the key "inputs"
# (np.savez appends the .npz extension). SAVE_PATH is defined in the CSV export cell of the
# training section, so that cell must have been run first.
np.savez(SAVE_PATH + 'validation_inputs_data', inputs=X_balanced_processed_scaled_df_val)