In [105]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from scipy.stats import spearmanr
import joblib

# Diagnostic: print every device TensorFlow can see (CPU and GPU entries appear in the output).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
# NOTE(review): `sys` is not referenced by any cell visible in this notebook.
import sys
# Last expression of the cell, so the installed TensorFlow version is displayed.
tf.__version__

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8362137652244168140
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5718933504
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7569189527252057222
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


'2.10.1'

In [106]:
# Show the GPUs registered with TensorFlow (the list is displayed as the cell output).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [167]:
# Load the training features, training targets and test features (read in that order).
X_train, y_train, X_test = (
    pd.read_parquet('../data/X_train.parquet'),
    pd.read_parquet('../data/y_train.parquet'),
    pd.read_parquet('../data/X_test.parquet'),
)

In [168]:
# Keep only the rows whose 'date' is at most 3. The *_orig frames are filtered
# copies that stay as DataFrames; X_train / y_train are rebound to the same subset.
X_orig = X_train.loc[lambda frame: frame['date'] <= 3]
y_orig = y_train.loc[lambda frame: frame['date'] <= 3]
X_train = X_train.loc[lambda frame: frame['date'] <= 3]
y_train = y_train.loc[lambda frame: frame['date'] <= 3]

# Features and target side by side, joined on the (date, id) key.
orig = pd.merge(X_orig, y_orig, how='outer', on=['date', 'id'])

In [169]:
# We'll see how the model does with all features first, and then go back and refine.
# (The hand-picked feature subset below is left commented out until then.)

# selected_features = ['0', '24', '26', '29', '51', '63', '72', '77', '94', '96', '99', '104',
#                      '125', '127', '147', '155', '159', '164', '168', '170', '171', '188',
#                      '192', '207', '217', '232', '240', '256', '259', '267', '283', '289',
#                      '305', '315', '319', '340', '343', '344', '354', '378', '386', '387',
#                      '399', '400', '406', '407', '420', '437', '444', '455']

In [170]:
# Switch from pandas objects to NumPy arrays for the array-based steps that follow.
X_train, X_test = np.asarray(X_train), np.asarray(X_test)

# Target column 'y' as a 1-D array.
y_train = np.asarray(y_train['y'].tolist())

In [171]:
# Peek at every column after the first: the id string followed by the feature values.
X_train[:,1:]

array([['dae29c8061b3176b9208f26afbb96e2ca50886db41902d2831fe878852970133',
        -0.9095147252082825, 0.3888077139854431, ...,
        -0.7534186840057373, 0.1544029414653778, 1.0693851709365845],
       ['2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0a94113c1a6cd2f2cf7',
        -0.10769376903772354, -0.09796717017889023, ...,
        -0.16571277379989624, 0.9058074951171875, 0.0831795483827591],
       ['b8d41ef950b69f94c380410f59f47e15666c57b74573b662f1c0e616a31cac7e',
        0.09231600910425186, 0.05259564518928528, ...,
        1.2729392051696777, -0.6431118249893188, 0.4335854947566986],
       ...,
       ['c98afeee57fc25534ebf42c6da388efebec82eab438a3ee7faa0d7dc6e347233',
        0.5866634249687195, -0.8490085601806641, ...,
        0.47918009757995605, 0.16815486550331116, -0.06338760256767273],
       ['1c5c9e101b98b3d46af8b93007eac21e278b1191df4d668202ff60be74d83ecf',
        -0.6900245547294617, -0.6659571528434753, ...,
        0.4734887480735779, 0.1362840086221695, 0

In [182]:
def convert_to_pairwise(X_train, y_train):
    """Expand a dataset into every unordered pair of rows for pairwise ranking.

    For each index pair ``(i, j)`` with ``i < j`` -- enumerated in row-major
    order, i.e. ``(0, 1), (0, 2), ..., (1, 2), ...`` -- the two feature rows,
    the two identifier rows and a binary label are emitted.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, 2 + n_features)
        Columns 0-1 are treated as identifiers, columns 2 onward as features.
    y_train : array-like of shape (n_samples,)
        Target values used to order the two members of each pair.

    Returns
    -------
    pairs : ndarray of shape (n_pairs, 2, n_features)
        Feature rows of the first and second member of every pair.
    labels : ndarray of shape (n_pairs,)
        1 where ``y_train[i] > y_train[j]``, otherwise 0 (ties give 0).
    ids : ndarray of shape (n_pairs, 2, 2)
        Identifier columns of the first and second member of every pair.

    Notes
    -----
    ``n_pairs = n_samples * (n_samples - 1) / 2``, so memory grows
    quadratically with the number of rows. Pairs are formed across all rows,
    whatever the identifier columns contain. With fewer than two rows the
    returned arrays are empty but keep the shapes documented above.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_samples = X_train.shape[0]

    # Index vectors of all i < j combinations, in the same order a nested
    # `for i ... for j in range(i + 1, n)` loop would visit them.
    first, second = np.triu_indices(n_samples, k=1)

    def _paired(columns):
        # Fill a preallocated (n_pairs, 2, width) array half by half so only
        # one temporary of half the final size exists at any time.
        out = np.empty((first.size, 2) + columns.shape[1:], dtype=columns.dtype)
        out[:, 0] = columns[first]
        out[:, 1] = columns[second]
        return out

    pairs = _paired(X_train[:, 2:])
    ids = _paired(X_train[:, :2])
    labels = (y_train[first] > y_train[second]).astype(int)
    return pairs, labels, ids

In [173]:
# Scaling and PCA
# Standardise the feature columns (index 2 onward) in place; the first two
# columns (date, id) are left untouched.
scaler = StandardScaler()
X_train[:,2:] = scaler.fit_transform(X_train[:,2:])

# Number of principal components kept; tune as needed.
n_components = 40  # Adjust the number of components as per your requirements
pca = PCA(n_components=n_components)
# Split off the identifier columns so they can be re-attached after projection.
pca_ids = X_train[:,:2]
pca_features = pca.fit_transform(X_train[:,2:])
# NOTE(review): the scaler and PCA are fitted on the training rows only; no visible
# cell applies them to the X_test array loaded from parquet.

In [188]:
# Inspect the identifier columns (date, id) that were split off before PCA.
pca_ids

array([[0,
        'dae29c8061b3176b9208f26afbb96e2ca50886db41902d2831fe878852970133'],
       [0,
        '2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0a94113c1a6cd2f2cf7'],
       [0,
        'b8d41ef950b69f94c380410f59f47e15666c57b74573b662f1c0e616a31cac7e'],
       ...,
       [3,
        'c98afeee57fc25534ebf42c6da388efebec82eab438a3ee7faa0d7dc6e347233'],
       [3,
        '1c5c9e101b98b3d46af8b93007eac21e278b1191df4d668202ff60be74d83ecf'],
       [3,
        '55e49d8de87412ecd5abf28a43e48e79ff391405270dfd2d6150cc1a011d504a']],
      dtype=object)

In [179]:
# Re-attach the (date, id) columns in front of the PCA components, column-wise.
X_train = np.hstack((pca_ids, pca_features))

In [183]:
# Expand the rows into all i < j pairs: pair features, 0/1 labels and pair identifiers.
# NOTE(review): X_train / y_train are rebound to the pair-level arrays, so this cell is
# not idempotent -- re-running it would feed the pair tensor back into convert_to_pairwise.
X_train, y_train, X_train_ids = convert_to_pairwise(X_train, y_train)

# Model

In [114]:
# First split: hold out 10% of the pairs as (X_val, y_val).
# NOTE(review): X_val / y_val are not referenced by any later cell visible in this
# notebook, and X_train_ids is not split alongside, so the pair identifiers no longer
# line up with the rows of X_train after this point.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, shuffle=True, test_size=0.1)

In [115]:
# Second split: 30% of the remaining pairs become (X_test, y_test); the model.fit
# cell passes this split as validation_data.
# NOTE(review): this rebinds X_test, discarding the array loaded from X_test.parquet.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, shuffle=True, test_size=0.3)

In [116]:
def get_spearman_rankcor(y_true, y_pred):
    """Return the Spearman rank correlation of predictions vs. targets as a tensor.

    Both inputs are cast to float32 and handed to ``scipy.stats.spearmanr``
    through ``tf.py_function``; the result is requested as a float32 tensor.
    """
    predictions = tf.cast(y_pred, tf.float32)
    targets = tf.cast(y_true, tf.float32)
    return tf.py_function(stats.spearmanr, [predictions, targets], Tout=tf.float32)

# Custom loss function based on Spearman correlation
def spearman_loss(y_true, y_pred):
    """Negative Spearman rank correlation between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : tensors holding one value per sample, e.g. shape
        ``(batch,)`` or ``(batch, 1)``. They are flattened before use.

    Returns
    -------
    Scalar float32 tensor equal to ``-rho``.

    Notes
    -----
    * Inputs are flattened first. Ranking ``(batch, 1)`` tensors with
      ``tf.argsort`` works along the last axis (length 1) and therefore yields
      an all-zero rank vector, i.e. a constant input to SciPy.
    * Ranking is left to ``spearmanr``, which averages tied ranks. A double
      ``argsort`` would break ties by position, which matters for 0/1 labels.
    * WARNING: the correlation is computed by SciPy inside ``tf.py_function``,
      so no gradient flows back to ``y_pred``. This is suitable for
      evaluation, not as the training objective of a gradient-based optimizer.
    """
    flat_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
    flat_pred = tf.reshape(tf.cast(y_pred, tf.float32), [-1])
    srcc = tf.py_function(spearmanr, [flat_true, flat_pred], tf.float32)
    return -srcc

# Custom metric based on Spearman correlation
def spearman_correlation(y_true, y_pred):
    """Spearman rank correlation between targets and predictions (Keras metric).

    Parameters
    ----------
    y_true, y_pred : tensors holding one value per sample, e.g. shape
        ``(batch,)`` or ``(batch, 1)``. They are flattened before use.

    Returns
    -------
    Scalar float32 tensor with the correlation coefficient.

    Notes
    -----
    * Inputs are flattened first. Ranking ``(batch, 1)`` tensors with
      ``tf.argsort`` works along the last axis (length 1) and therefore yields
      an all-zero rank vector, i.e. a constant input to SciPy.
    * Ranking is left to ``spearmanr``, which averages tied ranks, so the
      result does not depend on the order of tied samples (relevant for 0/1
      labels).
    """
    flat_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
    flat_pred = tf.reshape(tf.cast(y_pred, tf.float32), [-1])
    srcc = tf.py_function(spearmanr, [flat_true, flat_pred], tf.float32)
    return srcc

In [117]:
# Save the model to disk each time the validation loss reaches a new minimum.
mc = ModelCheckpoint(
    f'../resources/adia_neural_network.h5',
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_best_only=True,
)

# Stop once validation loss has not improved for `patience` epochs and roll back
# to the best weights seen.
# NOTE(review): patience=5 can only take effect when training runs longer than 5 epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

# Each sample is one pair shaped (X_train.shape[1], X_train.shape[2]). The Dense
# stack (800 -> 500 -> 250 -> 100, each followed by batch normalisation) acts on the
# last axis; Flatten then joins both pair members ahead of the single sigmoid unit.
model = keras.Sequential([
    keras.layers.Dense(
        800,
        activation='relu',
        kernel_initializer='lecun_normal',
        input_shape=(X_train.shape[1], X_train.shape[2]),
    ),
    keras.layers.BatchNormalization(),
    *[
        layer
        for width in (500, 250, 100)
        for layer in (
            keras.layers.Dense(width, activation='relu', kernel_initializer='lecun_normal'),
            keras.layers.BatchNormalization(),
        )
    ],
    keras.layers.Flatten(),
    keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal'),
])

optimizer = keras.optimizers.Adam(learning_rate=3e-4)

# Binary cross-entropy on the 0/1 pair label, with accuracy reported alongside.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [118]:
# Train on the pair tensors; the (X_test, y_test) split drives val_loss, which both
# callbacks monitor.
history = model.fit(
    X_train,
    y_train,
    batch_size=5000,
    epochs=3,
    # Keras documents validation_data as a tuple (x_val, y_val).
    validation_data=(X_test, y_test),
    callbacks=[mc, early_stopping],
    # Batches are taken in stored order; the rows were already shuffled by
    # train_test_split(shuffle=True).
    shuffle=False,
    # use_multiprocessing is omitted: it only applies to generator / Sequence
    # inputs and has no effect on in-memory NumPy arrays.
)

Epoch 1/3
Epoch 1: val_loss improved from inf to 0.14724, saving model to ../resources\adia_neural_network.h5
Epoch 2/3
Epoch 2: val_loss improved from 0.14724 to 0.11466, saving model to ../resources\adia_neural_network.h5
Epoch 3/3
Epoch 3: val_loss improved from 0.11466 to 0.10851, saving model to ../resources\adia_neural_network.h5


In [119]:
# Score the training pairs themselves (not a held-out split), 5000 pairs per batch.
preds = model.predict(X_train, batch_size=5000)



In [120]:
# Spearman correlation between the 0/1 pair labels and the predicted probabilities,
# computed on the training pairs through the TensorFlow wrapper.
get_spearman_rankcor(y_train, preds)

<tf.Tensor: shape=(), dtype=float32, numpy=0.8541922>

In [121]:
# Same statistic computed directly with SciPy, as a cross-check of the wrapper above.
spearmanr(y_train, preds)

SignificanceResult(statistic=0.8541922135410357, pvalue=0.0)

In [54]:
# NOTE(review): duplicate of the previous cell left over from an earlier execution
# (lower execution count); its saved output comes from that earlier run.
spearmanr(y_train, preds)

SignificanceResult(statistic=0.8558184744174377, pvalue=0.0)

In [122]:
# Calculate average ranking
# NOTE(review): preds has shape (n_pairs, 1) (see the preds.shape cell), so mean(axis=1)
# only drops the trailing axis and yields one value per training *pair*, not one value
# per row of `orig`.
avg_ranking = preds.mean(axis=1)

# Sort the dataset based on id and date
orig.sort_values(by=['id', 'date'], inplace=True)

# Assign the calculated ranks to the original dataset
# NOTE(review): this raises ValueError because the pair-level predictions outnumber the
# rows of `orig`. Turning pair probabilities into one score per (date, id) would need the
# pair identifiers (X_train_ids) kept in step with X_train through the splits -- TODO.
orig['rank'] = avg_ranking

ValueError: Length of values (3414775) does not match length of index (3293)

In [123]:
# Check the prediction array's shape: one probability per pair.
preds.shape

(3414775, 1)

In [124]:
# Display the merged features + target frame (now sorted by id and date).
orig

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,452,453,454,455,456,457,458,459,460,y
351,0,00086f670ab6f60f36f4226c88a3474c94a3022ae814ea...,-0.392756,-1.053607,-1.004719,0.897401,-1.435284,0.642041,1.109745,-0.913487,...,0.283107,0.747310,1.022710,0.760914,-1.290996,-1.268141,-0.175948,0.010643,-2.239050,-0.956221
1765,2,00092392146e36b35f4766d0bc031a244f599d98e3e10a...,-1.159224,-0.684363,-0.367090,-0.972604,0.009752,-0.223766,-0.444980,-0.503634,...,1.439849,-0.195490,-0.548266,-0.518988,0.441649,-0.956321,0.353774,-0.476675,-0.861099,-0.255869
1821,2,001c8409b3a896a93dee368ec7d370c4e338810ef54492...,0.995234,0.213245,1.067418,-0.452851,1.351322,-0.150080,-1.356108,0.722105,...,0.058828,-0.209973,-0.582079,-1.164643,0.817361,0.316394,0.134055,0.927457,0.070424,0.302817
1659,2,002125fe502250eddd11684171c1c017bab7b3b24fb5a5...,1.754235,0.869792,1.305935,-0.501795,1.977401,0.387814,-1.333420,0.600955,...,-0.630083,-0.233698,-0.056356,-0.342771,0.949664,1.100303,0.270003,0.534746,-0.059752,-0.538922
410,0,002c98d8b9637a366e25bf628820db372cdc26af221574...,-0.688544,2.333238,-0.685584,1.391184,0.645614,-2.344432,-0.375670,0.974930,...,-1.878088,0.720554,-1.451522,0.884413,-0.149627,0.438543,0.117368,-1.476501,1.588611,-0.244792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2225,2,ffbe5b07ba887eed9eddc1661a5fb35c27b47fb4adac2c...,0.389726,0.789453,0.094558,0.251189,0.166130,0.360018,-1.080861,-0.422209,...,0.331404,-0.154158,0.556353,0.129029,1.363718,-0.224603,0.032384,0.404710,-1.013188,0.826347
1360,1,ffc445d61fdf7c65489c64051a5259dbdab578b4e041c5...,0.354836,-0.016632,1.654979,0.612811,-2.558559,-0.326919,2.902143,0.217578,...,2.927136,0.793230,-1.499924,0.714872,-1.911354,1.046456,-0.694162,-4.021513,3.511131,-0.188482
217,0,ffd5cf0fd66f89d3505fc1f91c7636dadd2aac9112e536...,0.562925,1.196389,-0.635116,-0.056753,0.730782,-0.043954,-0.573020,0.440961,...,-0.765363,-0.194500,-0.668569,-0.113827,-0.119953,0.730667,0.881603,0.191178,0.074462,0.950549
1036,1,ffd70eeff7fd812165040b98a328320672a2d0210bf1c9...,1.642180,-0.658324,1.081356,1.041215,-1.244178,-1.037753,0.648093,1.374484,...,-0.601683,0.308228,-0.374773,0.979396,0.917428,1.139827,0.285111,1.684537,-1.514750,-0.408377
