In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from keras.models import load_model
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from sklearn.decomposition import PCA
import pickle
import math

In [6]:
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

def convert_to_pairwise(X_train, y_train):
    pairs = []
    labels = []
    ids = []
    n_samples = X_train.shape[0]
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            pairs.append([X_train[i, 2:], X_train[j, 2:]])
            ids.append([X_train[i, :2], X_train[j, :2]])
            labels.append(1 if y_train[i] > y_train[j] else 0)
    return np.array(pairs).astype('float32'), np.array(labels).astype('float32'), np.array(ids)

def train(X_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str = "resources") -> None:
    X_train_orig = X_train.copy()
    y_train_orig = y_train.copy()
    date_list = list(X_train_orig['date'].unique())[-100:]
    dates_array = list(split(date_list, 20))

    for dates_to_include in dates_array:
        X_train = X_train_orig[X_train_orig['date'].isin(dates_to_include)].copy().sort_index()
        y_train = y_train_orig[y_train_orig['date'].isin(dates_to_include)].copy().sort_index()

        X_train = np.asarray(X_train)

        y_train = np.asarray(list(y_train['y']))

        print(f"Loaded Dates for {min(dates_to_include)} to {max(dates_to_include)}")

        #Train or Update Training on Model
        model_pathname = Path(model_directory_path) / "model.keras"

        if model_pathname.is_file():
            print(f"Opened Model for Date {date}")
            #Load Scaler
            with open(Path(model_directory_path) / 'scaler.pkl', 'rb') as file:
                scaler = pickle.load(file)

            #Load PCA
            with open(Path(model_directory_path) / 'pca.pkl', 'rb') as file:
                pca = pickle.load(file)

            #Scaling
            X_train[:,2:] = scaler.transform(X_train[:,2:])

            #PCA
            pca_ids = X_train[:,:2]
            pca_features = pca.transform(X_train[:,2:])
            X_train_concat = np.concatenate((pca_ids, pca_features), axis=1)
            del X_train

            X_train_pairs = np.array()
            y_train_labels = np.array()
            X_train_ids = np.array()

            for date in dates_to_include:
                #Pairwise Transformation
                X_array_from_date = X_train_concat[X_train_concat[,0] == date]
                X_train_pair_array, y_train_labels_array, X_train_ids_array = convert_to_pairwise(X_array_from_date, y_train)
                X_train_pairs = np.concatenate((X_train_pairs, X_train_pair_array), axis=0)
                y_train_labels.append((y_train_labels, y_train_labels_array), axis=0)
                X_train_ids.append((X_train_ids, X_train_ids_array), axis=0)
                del y_train
                del X_array_from_date

            del X_train_concat

            X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_train_pairs, y_train_labels, random_state=42, shuffle=True, test_size=0.3)
            del X_train_pairs
            del y_train_labels

            model = load_model(model_pathname)

            history = model.fit(
                X_train_nn,
                y_train_nn,
                batch_size=5000,
                epochs=10,
                validation_data=[X_test_nn, y_test_nn],
                callbacks=[mc, early_stopping],
                shuffle=False,
                use_multiprocessing=True
            )

        else:

            #Scaling
            scaler = StandardScaler()
            X_train[:,2:] = scaler.fit_transform(X_train[:,2:])

            #Save Scaler
            with open(Path(model_directory_path) / 'scaler.pkl', 'wb') as file:
                pickle.dump(scaler, file)

            #PCA
            n_components = 40
            pca = PCA(n_components=n_components)
            pca_ids = X_train[:,:2]
            pca_features = pca.fit_transform(X_train[:,2:])
            X_train_concat = np.concatenate((pca_ids, pca_features), axis=1)
            del X_train

            #Save PCA
            with open(Path(model_directory_path) / 'pca.pkl', 'wb') as file:
                pickle.dump(pca, file)

            #Pairwise Transformation
            X_train_pairs, y_train_labels, X_train_ids = convert_to_pairwise(X_train_concat, y_train)
            del X_train_concat
            del y_train

            #Get train and test datasets
            X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_train_pairs, y_train_labels, random_state=42, shuffle=True, test_size=0.3)
            del X_train_pairs
            del y_train_labels

            #Neural Network Model
            mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=1,
                verbose=0,
                mode='auto',
                baseline=None,
                restore_best_weights=True)

            model = keras.Sequential([
                keras.layers.Dense(800, activation='relu', kernel_initializer='lecun_normal', input_shape=(X_train_nn.shape[1], X_train_nn.shape[2])),
                keras.layers.BatchNormalization(),
                keras.layers.Dense(500, activation='relu', kernel_initializer='lecun_normal'),
                keras.layers.BatchNormalization(),
                keras.layers.Dense(250, activation='relu', kernel_initializer='lecun_normal'),
                keras.layers.BatchNormalization(),
                keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal'),
                keras.layers.BatchNormalization(),
                keras.layers.Flatten(),
                keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal')
            ])

            optimizer = keras.optimizers.Adam(learning_rate=3e-4)

            model.compile(optimizer=optimizer,
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

            history = model.fit(
                X_train_nn,
                y_train_nn,
                batch_size=10000,
                epochs=10,
                validation_data=[X_test_nn, y_test_nn],
                callbacks=[mc, early_stopping],
                shuffle=False,
                use_multiprocessing=True
            )

            model.save(model_pathname)

        print(f"Finished training for Date {date}")

    # make sure that the train function correctly save the trained model
    # in the model_directory_path
    # print(f"Saving model in {model_pathname}")
    # joblib.dump(model, model_pathname)

In [7]:
def infer(X_test: pd.DataFrame, model_directory_path: str = "resources") -> pd.DataFrame:
    X_test_orig = X_test.copy()
    dates = list(X_test_orig['date'].unique())

    result_df = pd.DataFrame(columns=['date', 'id', 'value'])
    for date in dates:
        X_test = X_test_orig[X_test_orig['date'] == date]

        #Dummy 'y' variable
        X_test_date = X_test.copy()
        X_test_date['y'] = 0

        X_test = np.asarray(X_test_date.drop(columns=['y']))

        y_test = np.asarray(list(X_test_date['y']))

        #Load Scaler
        with open(Path(model_directory_path) / 'scaler.pkl', 'rb') as file:
            scaler = pickle.load(file)

        #Load PCA
        with open(Path(model_directory_path) / 'pca.pkl', 'rb') as file:
            pca = pickle.load(file)

        #Load Model
        model_pathname = Path(model_directory_path) / "model.keras"
        model = load_model(model_pathname)

        #Scaling
        X_test[:,2:] = scaler.transform(X_test[:,2:])

        #PCA
        pca_ids = X_test[:,:2]
        pca_features = pca.transform(X_test[:,2:])
        X_test_concat = np.concatenate((pca_ids, pca_features), axis=1)

        #Pairwise Transformation
        X_test_pairs, y_test_labels, X_test_ids = convert_to_pairwise(X_test_concat, y_test)

        print(f"Predicting for Date {date} in Test")
        preds = model.predict(X_test_pairs, batch_size=3000)

        preds_df_1 = pd.DataFrame({'id': X_test_ids[:,0,1].flatten(), 'date': X_test_ids[:,0,0].flatten(), 'value': preds.flatten()})

        result = preds_df_1.groupby(['date', 'id']).mean().reset_index()

        result = pd.merge(X_test_date, result, on=['id', 'date'], how='left')

        result = result[['date', 'id', 'value']]

        lower, upper = -1, 1
        result['value'] = [lower + (upper - lower) * x for x in result['value']]

        result['value'] = result['value'].fillna(0)

        result_df = pd.concat([result_df, result], ignore_index=False, axis=0)

        print(f"Finished predictions for Date {date} in Test")

    return result_df

In [2]:
X_train = pd.read_parquet('data/X_train.parquet')
y_train = pd.read_parquet('data/y_train.parquet')

In [9]:
train(X_train, y_train)

Loaded Dates for 169 to 173



KeyboardInterrupt



In [23]:
max(date_list)

268

In [90]:

for i in range(dates_length):
    begin_date =
    end_date =
    X_train = X_train_orig[X_train_orig['date'] == date].copy()
    y_train = y_train_orig[y_train_orig['date'] == date].copy()

    X_train = np.asarray(X_train)

    y_train = np.asarray(list(y_train['y']))

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,451,452,453,454,455,456,457,458,459,460
0,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,-0.909515,0.388808,-1.535913,-0.133312,-1.826404,-0.532795,0.351273,0.158866,...,-0.731349,-0.456020,-0.257331,0.396074,0.318007,-0.538754,-0.625193,-0.753419,0.154403,1.069385
1,0,2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0...,-0.107694,-0.097967,-0.539599,-0.331276,-0.942609,-0.054123,-1.212772,1.688034,...,0.610428,-0.984907,-0.429806,0.199055,0.202587,1.612578,0.302153,-0.165713,0.905807,0.083180
2,0,b8d41ef950b69f94c380410f59f47e15666c57b74573b6...,0.092316,0.052596,-0.652025,1.218241,0.382968,-0.861838,-0.318937,-0.744261,...,0.212365,-0.046016,1.147463,0.696961,-0.574426,1.255969,0.270394,1.272939,-0.643112,0.433585
3,0,cdce060d04ce28a551eaab653cc4b01f5ad878aeb932ec...,4.119639,1.018918,3.687519,1.597563,0.055918,-1.406041,0.652994,0.251138,...,1.254787,-1.155922,-1.108540,-2.046100,1.311100,-0.322965,0.999248,-1.238640,0.882844,-1.333590
4,0,86f6e6d9407ad3abfab91a3bbfb7ad71553e3f968765b8...,0.109644,-0.290280,-0.278987,-0.603259,0.136952,-1.725076,-0.062219,-0.183102,...,-2.007721,-0.482311,-0.269142,-0.899796,1.083332,0.674665,-1.095657,-0.402669,0.677189,0.319992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742665,268,5a18ddc0f252fa17cbd2a5bfe2f3786c0afb5052dd92be...,0.790984,1.560877,-0.328996,-0.190068,0.314971,-0.001609,0.313957,-0.315743,...,-1.450422,-1.044100,0.631455,-1.322626,-0.407846,0.578026,0.830650,1.414314,-0.845734,0.399335
742666,268,73c197cf1cb75641710562fe26d4f562c8228847a67949...,-1.129492,0.696247,-1.494771,-0.404022,0.909996,-0.658659,0.688591,1.634416,...,-0.475011,0.319023,-1.038112,0.222924,0.804017,-0.969177,-1.011879,-0.921781,-0.067543,0.491890
742667,268,bad7ff9ebc5579589e5ef36cb58f962c90c864fd3dfb22...,1.656413,-1.267060,0.748902,-0.196263,0.831206,-1.590837,3.079856,0.498583,...,-0.010330,-0.426130,-0.624393,-0.236483,-0.244052,1.280749,-2.001158,-1.036838,-1.959235,-2.534523
742668,268,5b968ca44ac0550be6f31470a96e572cd1c58d36cc26c7...,0.282704,0.156104,-1.165022,0.513334,-1.111948,-1.368465,-1.347184,-0.926533,...,0.411093,0.225324,-0.112838,-0.366831,-0.385833,-0.301606,0.395659,-0.895311,-0.819201,-0.996246


In [105]:
date_thing = math.ceil(dates_length / 25) + 1
for i in range(1, date_thing):
    print(i)

1
2
3
4
5
6
7
8
9
10
11


In [13]:
date_list = list(X_train['date'].unique())[-100:]

In [10]:
X_train_pairs = np.empty()
y_train_labels = np.empty()
X_train_ids = np.empty()

TypeError: empty() missing required argument 'shape' (pos 0)

In [107]:
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

In [115]:
dates_array = list(split(dates_list, 10))

In [116]:
for i in dates_array:
    print(i)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
[27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
[54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80]
[81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107]
[108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134]
[135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161]
[162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188]
[189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215]
[216, 217, 218, 219, 2

In [30]:
date_list = list(X_train['date'].unique())[-100:]
dates_array = list(split(date_list, 20))

In [31]:
dates_array

[[169, 170, 171, 172, 173],
 [174, 175, 176, 177, 178],
 [179, 180, 181, 182, 183],
 [184, 185, 186, 187, 188],
 [189, 190, 191, 192, 193],
 [194, 195, 196, 197, 198],
 [199, 200, 201, 202, 203],
 [204, 205, 206, 207, 208],
 [209, 210, 211, 212, 213],
 [214, 215, 216, 217, 218],
 [219, 220, 221, 222, 223],
 [224, 225, 226, 227, 228],
 [229, 230, 231, 232, 233],
 [234, 235, 236, 237, 238],
 [239, 240, 241, 242, 243],
 [244, 245, 246, 247, 248],
 [249, 250, 251, 252, 253],
 [254, 255, 256, 257, 258],
 [259, 260, 261, 262, 263],
 [264, 265, 266, 267, 268]]

In [4]:
X_train = X_train[X_train['date'] <= 10]
x = np.asarray(X_train)

In [8]:
x[x[:,0] == 1]

array([[1,
        'a9580bb984c328091d2b70b497f97dce963bfd785621c80a4ba47d357863335e',
        -0.08658834546804428, ..., -1.7963533401489258,
        -0.25102466344833374, 1.7424898147583008],
       [1,
        '56bccba09d92107ecf3af54246dcc504e059c6ea7917d61a303ce1976f6343bb',
        1.5079907178878784, ..., 1.211098551750183, 0.6537860035896301,
        -1.0638591051101685],
       [1,
        '0e3de002c861737e58efaed85626573c3d0ce8d6de9537ec252f81adceff7205',
        -1.2383681535720825, ..., -0.5632125735282898,
        1.4093866348266602, -0.1598205417394638],
       ...,
       [1,
        '9e2c934ee7036e4c009f854cbad7c1918c15affa87de087fefd3a2fceec16387',
        0.9924005270004272, ..., 0.15136367082595825, 0.984168291091919,
        0.21882113814353943],
       [1,
        '4b66c74a9c71dff9bc7acf34704df667f6fdc2aa364c30c9dca8e4ab7e01bffb',
        0.32769083976745605, ..., 1.0924345254898071, 1.4107588529586792,
        0.8826435804367065],
       [1,
        '76bf116e4e803