In [74]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import mean_squared_error
import joblib

In [123]:
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
X_test = pd.read_parquet('../data/X_test.parquet')

In [106]:
ids = X_train[['id']]
X_train = X_train.drop(columns=['id', 'date'])
y_train = y_train.drop(columns=['id', 'date'])

In [107]:
estimator = LinearRegression()
rfe = RFE(estimator, n_features_to_select=50, step=0.05)

In [108]:
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=50, step=0.05)

In [109]:
selected_features = X_train.columns[rfe.support_]

In [110]:
selected_features

Index(['0', '24', '26', '29', '51', '63', '72', '77', '94', '96', '99', '104',
       '125', '127', '147', '155', '159', '164', '168', '170', '171', '188',
       '192', '207', '217', '232', '240', '256', '259', '267', '283', '289',
       '305', '315', '319', '340', '343', '344', '354', '378', '386', '387',
       '399', '400', '406', '407', '420', '437', '444', '455'],
      dtype='object')

# Transformation

In [111]:
X_train = X_train[selected_features]

In [112]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [113]:
#Get validation dataset
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, shuffle=True, test_size=0.1)

In [114]:
#Get train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, shuffle=True, test_size=0.3)

# Neural Network

In [115]:
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import mean_squared_error
import joblib

In [116]:
def get_spearman_rankcor(y_true, y_pred):
    return ( tf.py_function(stats.spearmanr, [tf.cast(y_pred, tf.float32),
                                        tf.cast(y_true, tf.float32)], Tout = tf.float32) )

In [117]:
mc = ModelCheckpoint(f'../resources/adia_neural_network.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True)

model = keras.Sequential([
    #keras.layers.InputLayer(78),
    keras.layers.Dense(50, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(50, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(25, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    #keras.layers.Dense(10, activation='selu', kernel_initializer='lecun_normal'),
    #keras.layers.BatchNormalization(),
    keras.layers.Dense(3, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1, activation='relu', kernel_initializer='lecun_normal')
])

In [118]:
optimizer = keras.optimizers.Adam(learning_rate=0.0001)

model.compile(optimizer=optimizer,
              loss='mean_squared_error',
              metrics=[get_spearman_rankcor])

In [119]:
history = model.fit(
    X_train,
    y_train,
    batch_size=5000,
    epochs=5000,
    validation_data=[X_test, y_test],
    callbacks=[mc, early_stopping],
    shuffle=True,
    use_multiprocessing=True
)

Epoch 1/5000
Epoch 1: val_loss improved from inf to 0.42501, saving model to ../resources\adia_neural_network.h5
Epoch 2/5000
Epoch 2: val_loss improved from 0.42501 to 0.37242, saving model to ../resources\adia_neural_network.h5
Epoch 3/5000
Epoch 3: val_loss improved from 0.37242 to 0.35394, saving model to ../resources\adia_neural_network.h5
Epoch 4/5000
Epoch 4: val_loss improved from 0.35394 to 0.34552, saving model to ../resources\adia_neural_network.h5
Epoch 5/5000
Epoch 5: val_loss improved from 0.34552 to 0.34123, saving model to ../resources\adia_neural_network.h5
Epoch 6/5000
Epoch 6: val_loss improved from 0.34123 to 0.33872, saving model to ../resources\adia_neural_network.h5
Epoch 7/5000
Epoch 7: val_loss improved from 0.33872 to 0.33714, saving model to ../resources\adia_neural_network.h5
Epoch 8/5000
Epoch 8: val_loss improved from 0.33714 to 0.33609, saving model to ../resources\adia_neural_network.h5
Epoch 9/5000
Epoch 9: val_loss improved from 0.33609 to 0.33533, sav

In [120]:
predictions = model.predict(X_val)
get_spearman_rankcor(y_val, predictions)



<tf.Tensor: shape=(), dtype=float32, numpy=0.011423049>

In [94]:
from pathlib import Path
model_directory_path = '../resources'
joblib.dump(scaler, Path(model_directory_path) / 'scaler.h5')

['..\\resources\\scaler.h5']

NameError: name 'y' is not defined

In [124]:
y_train

Unnamed: 0,date,id,y
0,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,0.192308
1,0,2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0...,-0.476959
2,0,b8d41ef950b69f94c380410f59f47e15666c57b74573b6...,0.080645
3,0,cdce060d04ce28a551eaab653cc4b01f5ad878aeb932ec...,0.953125
4,0,86f6e6d9407ad3abfab91a3bbfb7ad71553e3f968765b8...,-0.979263
...,...,...,...
742665,268,5a18ddc0f252fa17cbd2a5bfe2f3786c0afb5052dd92be...,-0.712248
742666,268,73c197cf1cb75641710562fe26d4f562c8228847a67949...,0.443895
742667,268,bad7ff9ebc5579589e5ef36cb58f962c90c864fd3dfb22...,0.302521
742668,268,5b968ca44ac0550be6f31470a96e572cd1c58d36cc26c7...,-0.726644
