In [34]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from scipy.stats import spearmanr
import joblib

In [177]:
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
X_test = pd.read_parquet('../data/X_test.parquet')

In [178]:
X_orig = X_train[X_train['date'] <= 3]
X_train = X_train[X_train['date'] <= 3]
y_train = y_train[y_train['date'] <= 3]

In [28]:
#We'll see how the model does with all features first, and then go back and refine

# selected_features = ['0', '24', '26', '29', '51', '63', '72', '77', '94', '96', '99', '104',
#                      '125', '127', '147', '155', '159', '164', '168', '170', '171', '188',
#                      '192', '207', '217', '232', '240', '256', '259', '267', '283', '289',
#                      '305', '315', '319', '340', '343', '344', '354', '378', '386', '387',
#                      '399', '400', '406', '407', '420', '437', '444', '455']

In [179]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

y_train = np.asarray(list(y_train['y']))

X_train = X_train[:, 2:]
X_test = X_test[:, 2:]

In [180]:
def convert_to_pairwise(X_train, y_train):
    pairs = []
    labels = []
    n_samples = X_train.shape[0]
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            pairs.append([X_train[i], X_train[j]])
            labels.append(1 if y_train[i] > y_train[j] else 0)
    return np.array(pairs), np.array(labels)

In [181]:
#Scaling and PCA
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

n_components = 40  # Adjust the number of components as per your requirements
pca = PCA(n_components=n_components)
X_train = pca.fit_transform(X_train)

In [182]:
X_train, y_train = convert_to_pairwise(X_train, y_train)

(3414775, 2, 40)

# Model

In [162]:
#Get validation dataset
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, shuffle=True, test_size=0.1)

In [163]:
#Get train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, shuffle=True, test_size=0.3)

In [164]:
def get_spearman_rankcor(y_true, y_pred):
    return ( tf.py_function(stats.spearmanr, [tf.cast(y_pred, tf.float32),
                                              tf.cast(y_true, tf.float32)], Tout = tf.float32) )

# Custom loss function based on Spearman correlation
def spearman_loss(y_true, y_pred):
    true_ranks = tf.argsort(tf.argsort(y_true))
    pred_ranks = tf.argsort(tf.argsort(y_pred))
    srcc = tf.py_function(spearmanr, [true_ranks, pred_ranks], tf.float32)
    return -srcc

# Custom metric based on Spearman correlation
def spearman_correlation(y_true, y_pred):
    true_ranks = tf.argsort(tf.argsort(y_true))
    pred_ranks = tf.argsort(tf.argsort(y_pred))
    srcc = tf.py_function(spearmanr, [true_ranks, pred_ranks], tf.float32)
    return srcc

In [165]:
mc = ModelCheckpoint(f'../resources/adia_neural_network.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True)

model = keras.Sequential([
    keras.layers.Dense(800, activation='relu', kernel_initializer='lecun_normal', input_shape=(X_train.shape[1], X_train.shape[2])),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(500, activation='relu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(250, activation='relu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal')
])

optimizer = keras.optimizers.Adam(learning_rate=3e-4)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [166]:
history = model.fit(
    X_train,
    y_train,
    batch_size=5000,
    epochs=5000,
    validation_data=[X_test, y_test],
    callbacks=[mc, early_stopping],
    shuffle=False,
    use_multiprocessing=True
)

Epoch 1/5000

KeyboardInterrupt: 

In [183]:
preds = model.predict(X_train)



In [186]:
spearmanr(y_train, preds)

SpearmanrResult(correlation=0.7233887508091469, pvalue=0.0)

In [185]:
min_rank = np.min(preds)
max_rank = np.max(preds)

# Step 2: Remap the rankings to the interval [-1, 1]
preds_remapped = -1 + 2 * (preds - min_rank) / (max_rank - min_rank)

In [170]:
spearmanr(y_train, preds)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 3414775 and the array at index 1 has size 542028

In [193]:
n_samples = X_train.shape[0]
row_index = 0

for i in range(n_samples):
    num_pairs = n_samples - i - 1
    indices = np.where((X_train[i] == 1).all(axis=1))[0]
    X_orig.iloc[indices, -1] = preds_remapped[row_index : row_index + num_pairs]
    row_index += num_pairs

In [194]:
X_orig

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,452,453,454,455,456,457,458,459,460,predicted_rank
0,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,-0.909515,0.388808,-1.535913,-0.133312,-1.826404,-0.532795,0.351273,0.158866,...,-0.456020,-0.257331,0.396074,0.318007,-0.538754,-0.625193,-0.753419,0.154403,1.069385,-0.611957
1,0,2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0...,-0.107694,-0.097967,-0.539599,-0.331276,-0.942609,-0.054123,-1.212772,1.688034,...,-0.984907,-0.429806,0.199055,0.202587,1.612578,0.302153,-0.165713,0.905807,0.083180,-0.611957
2,0,b8d41ef950b69f94c380410f59f47e15666c57b74573b6...,0.092316,0.052596,-0.652025,1.218241,0.382968,-0.861838,-0.318937,-0.744261,...,-0.046016,1.147463,0.696961,-0.574426,1.255969,0.270394,1.272939,-0.643112,0.433585,-0.611957
3,0,cdce060d04ce28a551eaab653cc4b01f5ad878aeb932ec...,4.119639,1.018918,3.687519,1.597563,0.055918,-1.406041,0.652994,0.251138,...,-1.155922,-1.108540,-2.046100,1.311100,-0.322965,0.999248,-1.238640,0.882844,-1.333590,-0.611957
4,0,86f6e6d9407ad3abfab91a3bbfb7ad71553e3f968765b8...,0.109644,-0.290280,-0.278987,-0.603259,0.136952,-1.725076,-0.062219,-0.183102,...,-0.482311,-0.269142,-0.899796,1.083332,0.674665,-1.095657,-0.402669,0.677189,0.319992,-0.611957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3288,3,a618c77e3fc3bcd66278ea82876546048ca77304ecd3de...,-0.610014,-1.151982,-0.892403,0.020550,-0.333657,0.439891,-0.368870,0.571765,...,-1.047030,-0.172235,1.586273,0.733316,1.223943,-0.396021,-1.005701,0.128281,0.046620,-0.611957
3289,3,162835451eb55e87dff9e06a9d02c58bc4999cbc00a31d...,0.262623,0.295104,-0.481327,-0.074884,0.500263,1.051355,0.738217,0.397935,...,-1.421944,-0.155835,1.846273,0.788146,0.315591,-0.088606,0.318583,0.434937,-0.458458,-0.611957
3290,3,c98afeee57fc25534ebf42c6da388efebec82eab438a3e...,0.586663,-0.849009,0.169657,-0.031505,-0.284460,-1.163243,-0.655527,0.940109,...,-0.116776,-1.297815,0.327170,-1.064325,0.893458,-0.071305,0.479180,0.168155,-0.063388,-0.611957
3291,3,1c5c9e101b98b3d46af8b93007eac21e278b1191df4d66...,-0.690025,-0.665957,-0.744144,0.046095,0.226594,-1.986957,0.718556,-0.167248,...,0.103249,0.040602,-0.820882,0.535778,0.113347,0.367496,0.473489,0.136284,0.370755,-0.611957
