In [93]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from scipy.stats import spearmanr
import joblib

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
import sys
tf.__version__

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3003821190938328815
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5718933504
locality {
  bus_id: 1
  links {
  }
}
incarnation: 8373500491870864921
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


'2.10.1'

In [2]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [94]:
X_train_original_df = pd.read_parquet('../data/X_train.parquet')
y_train_original_df = pd.read_parquet('../data/y_train.parquet')
X_test_original_df = pd.read_parquet('../data/X_test.parquet')

In [96]:
X_train_original_df['date'].unique().to_list()

AttributeError: 'numpy.ndarray' object has no attribute 'to_list'

In [78]:
X_orig = X_train_original_df[X_train_original_df['date'] <= 2]
y_orig = y_train_original_df[y_train_original_df['date'] <= 2]
X_train = X_train_original_df[X_train_original_df['date'] <= 2]
y_train = y_train_original_df[y_train_original_df['date'] <= 2]

orig = pd.merge(X_orig, y_orig, on=['date', 'id'], how='outer')

In [18]:
#We'll see how the model does with all features first, and then go back and refine

# selected_features = ['0', '24', '26', '29', '51', '63', '72', '77', '94', '96', '99', '104',
#                      '125', '127', '147', '155', '159', '164', '168', '170', '171', '188',
#                      '192', '207', '217', '232', '240', '256', '259', '267', '283', '289',
#                      '305', '315', '319', '340', '343', '344', '354', '378', '386', '387',
#                      '399', '400', '406', '407', '420', '437', '444', '455']

In [76]:
X_train = np.asarray(X_train)

y_train = np.asarray(list(y_train['y']))

In [20]:
def convert_to_pairwise(X_train, y_train):
    pairs = []
    labels = []
    ids = []
    n_samples = X_train.shape[0]
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            pairs.append([X_train[i, 2:], X_train[j, 2:]])
            ids.append([X_train[i, :2], X_train[j, :2]])
            labels.append(1 if y_train[i] > y_train[j] else 0)
    return np.array(pairs).astype('float32'), np.array(labels).astype('float32'), np.array(ids)

In [21]:
#Scaling and PCA
scaler = StandardScaler()
X_train[:,2:] = scaler.fit_transform(X_train[:,2:])

n_components = 40  # Adjust the number of components as per your requirements
pca = PCA(n_components=n_components)
pca_ids = X_train[:,:2]
pca_features = pca.fit_transform(X_train[:,2:])

In [22]:
X_train_concat = np.concatenate((pca_ids, pca_features), axis=1)

In [23]:
X_train_pairs, y_train_labels, X_train_ids = convert_to_pairwise(X_train_concat, y_train)

# Model

In [24]:
#Get train and test datasets
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_train_pairs, y_train_labels, random_state=42, shuffle=True, test_size=0.3)

In [25]:
def get_spearman_rankcor(y_true, y_pred):
    return ( tf.py_function(stats.spearmanr, [tf.cast(y_pred, tf.float32),
                                              tf.cast(y_true, tf.float32)], Tout = tf.float32) )

# Custom loss function based on Spearman correlation
def spearman_loss(y_true, y_pred):
    true_ranks = tf.argsort(tf.argsort(y_true))
    pred_ranks = tf.argsort(tf.argsort(y_pred))
    srcc = tf.py_function(spearmanr, [true_ranks, pred_ranks], tf.float32)
    return -srcc

# Custom metric based on Spearman correlation
def spearman_correlation(y_true, y_pred):
    true_ranks = tf.argsort(tf.argsort(y_true))
    pred_ranks = tf.argsort(tf.argsort(y_pred))
    srcc = tf.py_function(spearmanr, [true_ranks, pred_ranks], tf.float32)
    return srcc

In [26]:
mc = ModelCheckpoint(f'../resources/adia_neural_network.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True)

model = keras.Sequential([
    keras.layers.Dense(800, activation='relu', kernel_initializer='lecun_normal', input_shape=(X_train_nn.shape[1], X_train_nn.shape[2])),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(500, activation='relu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(250, activation='relu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal')
])

optimizer = keras.optimizers.Adam(learning_rate=3e-4)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [27]:
history = model.fit(
    X_train_nn,
    y_train_nn,
    batch_size=5000,
    epochs=3,
    validation_data=[X_test_nn, y_test_nn],
    callbacks=[mc, early_stopping],
    shuffle=False,
    use_multiprocessing=True
)

Epoch 1/3
Epoch 1: val_loss improved from inf to 0.18204, saving model to ../resources\adia_neural_network.h5
Epoch 2/3
Epoch 2: val_loss improved from 0.18204 to 0.14159, saving model to ../resources\adia_neural_network.h5
Epoch 3/3
Epoch 3: val_loss improved from 0.14159 to 0.12626, saving model to ../resources\adia_neural_network.h5


In [28]:
preds = model.predict(X_train_pairs, batch_size=3000)



In [30]:
get_spearman_rankcor(y_train_labels, preds)

<tf.Tensor: shape=(), dtype=float32, numpy=0.84999716>

In [58]:
preds_df_1 = pd.DataFrame({'id': X_train_ids[:,0,1].flatten(), 'date': X_train_ids[:,0,0].flatten(), 'prediction': preds.flatten()})
#preds_df_2 = pd.DataFrame({'id': X_train_ids[:,1,1].flatten(), 'date': X_train_ids[:,1,0].flatten(), 'prediction': preds.flatten()})

# Merge the predictions with the original dataset based on ids and dates
result = pd.merge(orig, preds_df_1, on=['id', 'date'], how='left')

In [59]:
#result = pd.concat([result, preds_df_2], axis=0)
result = result[['date', 'id', 'prediction']].groupby(['date', 'id']).mean().reset_index()
result

Unnamed: 0,date,id,prediction
0,0,00086f670ab6f60f36f4226c88a3474c94a3022ae814ea...,0.026589
1,0,002c98d8b9637a366e25bf628820db372cdc26af221574...,0.369351
2,0,00bce20d560663f9578898d727c6b7594e368c4a0916e7...,0.863204
3,0,00ef81dcf5e0e82f4c6e34cdc67cb5dbed174465db4dd6...,0.622007
4,0,0135bb59a8c5e5a0af25ac1a2ca7787fc67a559e4c29dd...,0.380088
...,...,...,...
2381,2,ff423e0ac8b473b5df86daefe10bf9915f9c5cffe8a060...,0.024422
2382,2,ff4d6c412b95517450955d0fb7880c5738bed79efe740c...,0.354214
2383,2,ff814593d8304285e14d448a1199e64bc157ed2a3d5191...,0.371606
2384,2,ffbe5b07ba887eed9eddc1661a5fb35c27b47fb4adac2c...,0.894865


In [40]:
orig

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,452,453,454,455,456,457,458,459,460,y
0,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,-0.909515,0.388808,-1.535913,-0.133312,-1.826404,-0.532795,0.351273,0.158866,...,-0.456020,-0.257331,0.396074,0.318007,-0.538754,-0.625193,-0.753419,0.154403,1.069385,0.192308
1,0,2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0...,-0.107694,-0.097967,-0.539599,-0.331276,-0.942609,-0.054123,-1.212772,1.688034,...,-0.984907,-0.429806,0.199055,0.202587,1.612578,0.302153,-0.165713,0.905807,0.083180,-0.476959
2,0,b8d41ef950b69f94c380410f59f47e15666c57b74573b6...,0.092316,0.052596,-0.652025,1.218241,0.382968,-0.861838,-0.318937,-0.744261,...,-0.046016,1.147463,0.696961,-0.574426,1.255969,0.270394,1.272939,-0.643112,0.433585,0.080645
3,0,cdce060d04ce28a551eaab653cc4b01f5ad878aeb932ec...,4.119639,1.018918,3.687519,1.597563,0.055918,-1.406041,0.652994,0.251138,...,-1.155922,-1.108540,-2.046100,1.311100,-0.322965,0.999248,-1.238640,0.882844,-1.333590,0.953125
4,0,86f6e6d9407ad3abfab91a3bbfb7ad71553e3f968765b8...,0.109644,-0.290280,-0.278987,-0.603259,0.136952,-1.725076,-0.062219,-0.183102,...,-0.482311,-0.269142,-0.899796,1.083332,0.674665,-1.095657,-0.402669,0.677189,0.319992,-0.979263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2381,2,680d730b3461c2eb4afb2180e74470aff04dde0785f8b3...,-0.934337,-0.751031,1.880259,1.460534,-0.514566,0.140587,-0.070087,-0.413820,...,0.894572,0.577557,-0.947367,-3.114039,-1.490575,2.679698,2.641528,-2.363223,0.957594,0.720657
2382,2,b65df8d975c30b01386cfc48f558a022d0b798a81588a1...,0.203653,-0.166871,-0.747866,-0.260202,0.531357,0.440906,-0.085927,0.535370,...,0.773311,0.364477,0.331518,-1.230837,0.839055,-0.898724,0.608445,0.013442,0.775747,-0.246479
2383,2,831d68a034bdf0b1eced4c8ec39a8462774dd6a9d4bd62...,-1.344671,-1.820279,0.992266,0.110530,0.554084,-1.874186,-0.754604,-0.385738,...,1.165428,0.314596,-0.296145,-0.338750,0.690880,-0.446585,1.206716,-0.088316,-0.030771,-0.058685
2384,2,49d3f2c79cd56ba9be104a7e84bf9cdd3052fa5ed547c2...,0.996614,-0.567071,0.394580,0.396793,0.553907,0.981516,0.090325,-0.356672,...,-1.165739,0.358323,0.398288,-0.554795,-0.234889,2.252474,0.808473,1.421106,0.636106,0.687831


In [33]:
preds_df = pd.DataFrame({'id': X_train_ids[:,0,1].flatten(), 'date': X_train_ids[:,0,0].flatten(), 'prediction': preds.flatten()})

In [70]:
test_df = preds_df.groupby(['id', 'date']).mean().reset_index().sort_values(by='date')

In [60]:
test = pd.merge(orig, result, on=['id', 'date'], how='left')
#test[['prediction']] = MinMaxScaler(feature_range=(-1,1)).fit_transform(test[['prediction']])

In [66]:
test['prediction'] = test['prediction'].fillna(0)
spearmanr(test['y'], test['prediction'])

SignificanceResult(statistic=0.9826015637453094, pvalue=0.0)

In [42]:
from sklearn.preprocessing import MinMaxScaler

In [64]:
lower, upper = -1, 1
test['prediction'] = [lower + (upper - lower) * x for x in test['prediction']]

In [65]:
test

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,453,454,455,456,457,458,459,460,y,prediction
0,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,-0.909515,0.388808,-1.535913,-0.133312,-1.826404,-0.532795,0.351273,0.158866,...,-0.257331,0.396074,0.318007,-0.538754,-0.625193,-0.753419,0.154403,1.069385,0.192308,0.157790
1,0,2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0...,-0.107694,-0.097967,-0.539599,-0.331276,-0.942609,-0.054123,-1.212772,1.688034,...,-0.429806,0.199055,0.202587,1.612578,0.302153,-0.165713,0.905807,0.083180,-0.476959,-0.552857
2,0,b8d41ef950b69f94c380410f59f47e15666c57b74573b6...,0.092316,0.052596,-0.652025,1.218241,0.382968,-0.861838,-0.318937,-0.744261,...,1.147463,0.696961,-0.574426,1.255969,0.270394,1.272939,-0.643112,0.433585,0.080645,0.097009
3,0,cdce060d04ce28a551eaab653cc4b01f5ad878aeb932ec...,4.119639,1.018918,3.687519,1.597563,0.055918,-1.406041,0.652994,0.251138,...,-1.108540,-2.046100,1.311100,-0.322965,0.999248,-1.238640,0.882844,-1.333590,0.953125,0.951378
4,0,86f6e6d9407ad3abfab91a3bbfb7ad71553e3f968765b8...,0.109644,-0.290280,-0.278987,-0.603259,0.136952,-1.725076,-0.062219,-0.183102,...,-0.269142,-0.899796,1.083332,0.674665,-1.095657,-0.402669,0.677189,0.319992,-0.979263,-0.953230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2381,2,680d730b3461c2eb4afb2180e74470aff04dde0785f8b3...,-0.934337,-0.751031,1.880259,1.460534,-0.514566,0.140587,-0.070087,-0.413820,...,0.577557,-0.947367,-3.114039,-1.490575,2.679698,2.641528,-2.363223,0.957594,0.720657,0.524311
2382,2,b65df8d975c30b01386cfc48f558a022d0b798a81588a1...,0.203653,-0.166871,-0.747866,-0.260202,0.531357,0.440906,-0.085927,0.535370,...,0.364477,0.331518,-1.230837,0.839055,-0.898724,0.608445,0.013442,0.775747,-0.246479,-0.248415
2383,2,831d68a034bdf0b1eced4c8ec39a8462774dd6a9d4bd62...,-1.344671,-1.820279,0.992266,0.110530,0.554084,-1.874186,-0.754604,-0.385738,...,0.314596,-0.296145,-0.338750,0.690880,-0.446585,1.206716,-0.088316,-0.030771,-0.058685,0.000269
2384,2,49d3f2c79cd56ba9be104a7e84bf9cdd3052fa5ed547c2...,0.996614,-0.567071,0.394580,0.396793,0.553907,0.981516,0.090325,-0.356672,...,0.358323,0.398288,-0.554795,-0.234889,2.252474,0.808473,1.421106,0.636106,0.687831,1.000000


In [89]:
X_test = X_test_original_df

In [90]:
X_test_orig = X_test
X_test_orig['y'] = 0

X_test = np.asarray(X_test_orig.drop(columns=['y']))

y_test = np.asarray(list(X_test_orig['y']))

In [92]:
X_test

array([[269,
        'c6e83eda40042dab1af117e195d542f00a417627e3173a1fe5c4de88aa25bb00',
        0.39107057452201843, ..., -0.17491409182548523,
        -1.0125234127044678, 0.16360142827033997],
       [269,
        '97ae3194605438cbd2c59a3827f7c615dafa40d6cc3f42cceeb6cdf977fd8fdc',
        0.2599969506263733, ..., -2.126491069793701,
        -0.020631518214941025, 0.9291010499000549],
       [269,
        '310382927ec56f64c6f2f834fd320c9f732e26df639e67169bb3392aaf14cadd',
        -0.3788648843765259, ..., 0.29570913314819336,
        -2.752091884613037, 0.4612125754356384],
       ...,
       [273,
        '002647639e3b83fd884eed0eddf72a702f15c5d70fb75d63676eb7d302166067',
        -1.8477901220321655, ..., -1.1143118143081665,
        -0.37705767154693604, 0.5314030647277832],
       [273,
        '7d840103d2370a80cc9b8376bfaf04b2aa5ff46bcbab0396802998509955b621',
        0.24425959587097168, ..., -0.8167691230773926,
        -0.3566073179244995, -0.5588969588279724],
       [273,
  

In [88]:
X_test = np.asarray(X_test.drop(columns=['y']))

y_test = np.asarray(list(X_test['y']))

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [85]:
y_test = np.asarray(list(X_test['y']))

In [80]:
y_train['y']

0       0.192308
1      -0.476959
2       0.080645
3       0.953125
4      -0.979263
          ...   
2381    0.720657
2382   -0.246479
2383   -0.058685
2384    0.687831
2385   -0.958084
Name: y, Length: 2386, dtype: float64

In [81]:
X_test['y']

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [103]:
result_df = pd.DataFrame(columns=['date', 'id', 'value'])

In [100]:
preds_df = preds_df.rename(columns={'prediction': 'value'})

In [102]:
preds_df = preds_df[['date', 'id', 'value']]

In [104]:
pd.concat([result_df, preds_df], axis=0, ignore_index=True)

Unnamed: 0,date,id,value
0,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,0.991433
1,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,0.784938
2,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,0.000139
3,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,0.999991
4,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,0.980615
...,...,...,...
2845300,2,b65df8d975c30b01386cfc48f558a022d0b798a81588a1...,0.000021
2845301,2,b65df8d975c30b01386cfc48f558a022d0b798a81588a1...,0.999987
2845302,2,831d68a034bdf0b1eced4c8ec39a8462774dd6a9d4bd62...,0.000270
2845303,2,831d68a034bdf0b1eced4c8ec39a8462774dd6a9d4bd62...,0.999999
