In [25]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from keras.models import load_model
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from sklearn.decomposition import PCA
import pickle

In [26]:
def convert_to_pairwise(X_train, y_train):
    """Expand a group of samples into every pair ``(i, j)`` with ``i < j``.

    The first two columns of ``X_train`` are treated as identifier columns and
    every remaining column as a numeric feature.

    Args:
        X_train: 2-D array of shape ``(n_samples, 2 + n_features)``.
        y_train: 1-D array-like of length ``n_samples`` holding one target per
            row of ``X_train``.

    Returns:
        A tuple ``(pairs, labels, ids)`` where

        * ``pairs`` is a float32 array of shape ``(n_pairs, 2, n_features)``
          holding the feature rows of both members of each pair,
        * ``labels`` is a float32 array of shape ``(n_pairs,)`` that is 1.0
          when ``y_train[i] > y_train[j]`` and 0.0 otherwise,
        * ``ids`` is an array of shape ``(n_pairs, 2, 2)`` holding the two
          identifier columns of both members (dtype follows ``X_train``).

        Pairs are ordered as a nested ``for i / for j > i`` loop would produce
        them. With fewer than two samples the arrays are empty but keep the
        ranks described above.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_samples = X_train.shape[0]

    # Row-major upper-triangle indices (k=1 excludes the diagonal): this is the
    # same (i, j) sequence as the nested loop, without the Python-level loop.
    first, second = np.triu_indices(n_samples, k=1)

    identifiers = X_train[:, :2]
    features = X_train[:, 2:]

    # Stacking on axis 1 keeps a well-formed (0, 2, n) shape when no pair exists.
    pairs = np.stack((features[first], features[second]), axis=1).astype('float32')
    ids = np.stack((identifiers[first], identifiers[second]), axis=1)
    labels = (y_train[first] > y_train[second]).astype('float32')

    return pairs, labels, ids

def _build_pairwise_classifier(pair_shape):
    """Create and compile the dense network that classifies one sample pair."""
    model = keras.Sequential([
        keras.layers.Dense(800, activation='relu', kernel_initializer='lecun_normal', input_shape=pair_shape),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(500, activation='relu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(250, activation='relu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Flatten(),
        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal')
    ])

    optimizer = keras.optimizers.Adam(learning_rate=3e-4)

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


def train(X_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str = "resources") -> None:
    """Fit, or continue fitting, the pairwise classifier one date at a time.

    For every unique value of ``X_train['date']`` the rows of that date are
    standardised, reduced with PCA, expanded into pairs with
    ``convert_to_pairwise`` and split 70/30 into fit/validation sets.

    * If ``model.keras`` does not exist in ``model_directory_path`` yet, a
      ``StandardScaler`` and a 40-component ``PCA`` are fitted on that date and
      pickled as ``scaler.pkl`` / ``pca.pkl``, a new network is built, trained
      and saved.
    * Otherwise the stored scaler, PCA and model are loaded and the model is
      trained further on that date.

    Args:
        X_train: Frame with a ``date`` column. The first two columns are
            passed through as identifiers and all remaining columns are used
            as features.
        y_train: Frame with ``date`` and ``y`` columns. Rows of one date are
            assumed to be in the same order as in ``X_train`` — TODO confirm
            with the caller.
        model_directory_path: Directory holding ``model.keras``,
            ``scaler.pkl`` and ``pca.pkl``. It is created when missing.
    """
    model_dir = Path(model_directory_path)
    model_dir.mkdir(parents=True, exist_ok=True)
    model_pathname = model_dir / "model.keras"
    scaler_pathname = model_dir / 'scaler.pkl'
    pca_pathname = model_dir / 'pca.pkl'

    # The callbacks are created up front because both branches below use them.
    # Previously they only existed after the "new model" branch had run, so a
    # call that started with an existing model.keras raised NameError.
    mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=2,
        verbose=0,
        mode='auto',
        baseline=None,
        restore_best_weights=True)

    # Fitted preprocessors are kept in memory so they are read from disk at
    # most once per call.
    scaler = None
    pca = None

    for date in list(X_train['date'].unique()):
        date_rows = np.asarray(X_train[X_train['date'] == date])
        date_targets = y_train.loc[y_train['date'] == date, 'y'].to_numpy()

        row_ids = date_rows[:, :2]
        raw_features = date_rows[:, 2:]

        #Train or Update Training on Model
        resuming = model_pathname.is_file()

        if resuming:
            print(f"Opened Model for Date {date}")
            if scaler is None or pca is None:
                # pickle.load can execute arbitrary code: only point this at
                # artefacts written by this function.
                #Load Scaler
                with open(scaler_pathname, 'rb') as file:
                    scaler = pickle.load(file)

                #Load PCA
                with open(pca_pathname, 'rb') as file:
                    pca = pickle.load(file)

            #Scaling
            scaled_features = scaler.transform(raw_features)

            #PCA
            pca_features = pca.transform(scaled_features)
        else:
            #Scaling
            scaler = StandardScaler()
            scaled_features = scaler.fit_transform(raw_features)

            #Save Scaler
            with open(scaler_pathname, 'wb') as file:
                pickle.dump(scaler, file)

            #PCA
            n_components = 40
            pca = PCA(n_components=n_components)
            pca_features = pca.fit_transform(scaled_features)

            #Save PCA
            with open(pca_pathname, 'wb') as file:
                pickle.dump(pca, file)

        X_train_concat = np.concatenate((row_ids, pca_features), axis=1)

        #Pairwise Transformation
        X_train_pairs, y_train_labels, _ = convert_to_pairwise(X_train_concat, date_targets)

        #Get train and test datasets
        X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(
            X_train_pairs, y_train_labels, random_state=42, shuffle=True, test_size=0.3)

        #Neural Network Model
        if resuming:
            model = load_model(model_pathname)
        else:
            model = _build_pairwise_classifier(X_train_nn.shape[1:])

        # validation_data is passed as the documented (x, y) tuple.
        # use_multiprocessing was dropped: it has no effect for in-memory NumPy
        # input and Keras 3 no longer accepts it in fit().
        model.fit(
            X_train_nn,
            y_train_nn,
            batch_size=5000,
            epochs=10,
            validation_data=(X_test_nn, y_test_nn),
            callbacks=[mc, early_stopping],
            shuffle=False
        )

        if not resuming:
            model.save(model_pathname)
        # NOTE(review): on update runs the file is rewritten only by
        # ModelCheckpoint (save_best_only=True), i.e. when val_loss beats the
        # best value that callback has seen so far — confirm this is intended.

        print(f"Finished training for Date {date}")

In [85]:
def infer(X_test: pd.DataFrame, model_directory_path: str = "resources") -> pd.DataFrame:
    """Score every row of ``X_test`` with the stored pairwise model.

    For each unique ``date`` the rows are standardised and PCA-reduced with the
    pickled preprocessors, expanded into pairs with ``convert_to_pairwise`` and
    passed to the stored Keras model. A row's raw score is the mean, over every
    pair it takes part in, of the predicted probability that it is the larger
    member. The raw score is then mapped linearly from ``[0, 1]`` to
    ``[-1, 1]``.

    Args:
        X_test: Frame with ``date`` and ``id`` as its first two columns and
            the feature columns after them (this is the layout that the
            positional slicing below relies on).
        model_directory_path: Directory containing ``scaler.pkl``, ``pca.pkl``
            and ``model.keras``.

    Returns:
        Frame with columns ``date``, ``id`` and ``value``, one row per input
        row. Rows that cannot be scored (for example the only row of a date)
        get the neutral value 0. The index restarts at 0 for every date.
    """
    result_columns = ['date', 'id', 'value']

    dates = list(X_test['date'].unique())
    if not dates:
        return pd.DataFrame(columns=result_columns)

    # The artefacts do not depend on the date, so they are loaded once instead
    # of once per loop iteration.
    # pickle.load can execute arbitrary code: only load files produced by the
    # training step of this notebook.
    model_dir = Path(model_directory_path)

    #Load Scaler
    with open(model_dir / 'scaler.pkl', 'rb') as file:
        scaler = pickle.load(file)

    #Load PCA
    with open(model_dir / 'pca.pkl', 'rb') as file:
        pca = pickle.load(file)

    #Load Model
    model_pathname = model_dir / "model.keras"
    model = load_model(model_pathname)

    lower, upper = -1, 1
    per_date_results = []

    for date in dates:
        date_frame = X_test[X_test['date'] == date]
        date_rows = np.asarray(date_frame)
        n_rows = date_rows.shape[0]

        print(f"Predicting for Date {date} in Test")

        if n_rows < 2:
            # No pair can be formed, so there is nothing for the model to
            # compare; fall back to the neutral score.
            result = date_frame[['date', 'id']].reset_index(drop=True)
            result['value'] = 0.0
        else:
            #Scaling
            scaled_features = scaler.transform(date_rows[:, 2:])

            #PCA
            pca_ids = date_rows[:, :2]
            pca_features = pca.transform(scaled_features)
            X_test_concat = np.concatenate((pca_ids, pca_features), axis=1)

            #Pairwise Transformation (targets are unknown here, zeros are placeholders)
            X_test_pairs, _, X_test_ids = convert_to_pairwise(X_test_concat, np.zeros(n_rows))

            preds = model.predict(X_test_pairs, batch_size=3000).flatten()

            # Each prediction p is for "first member > second member", so the
            # first member is credited with p and the second with 1 - p.
            # Crediting only the first member left the last row of every date
            # without any score and averaged the others over unequal subsets.
            pair_scores = pd.DataFrame({
                'id': np.concatenate((X_test_ids[:, 0, 1], X_test_ids[:, 1, 1])),
                'date': np.concatenate((X_test_ids[:, 0, 0], X_test_ids[:, 1, 0])),
                'value': np.concatenate((preds, 1.0 - preds)),
            })

            result = pair_scores.groupby(['date', 'id']).mean().reset_index()

            # Left merge keeps one output row per input row, in input order.
            result = pd.merge(date_frame[['date', 'id']], result, on=['id', 'date'], how='left')

            result = result[result_columns]

            # Map [0, 1] onto [lower, upper]; unmatched rows become neutral 0.
            result['value'] = (lower + (upper - lower) * result['value']).fillna(0)

        per_date_results.append(result[result_columns])

        print(f"Finished predictions for Date {date} in Test")

    # One concat at the end avoids quadratic copying and keeps real dtypes
    # (growing from an empty object-typed frame degraded them).
    return pd.concat(per_date_results, ignore_index=False, axis=0)