In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import load_model
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from sklearn.decomposition import PCA
import pickle
import math
import gc
from collections import defaultdict
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras import backend as K

In [2]:
##### FUNCTIONS #####


def pairwise_combinations_with_ids_and_values(X):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates
    X_pairs = []
    X_pair_ids = []

    for i in range(n_samples):
        date_i = X[i, 0]
        same_date_indices = [j for j in range(i + 1, n_samples) if X[j, 0] == date_i]
        for j in same_date_indices:
            pair_key = tuple(sorted((i, j)))  # Create a unique key for the pair
            if pair_key not in generated_pairs:  # Check if pair is already generated
                X_pair = np.array([X[i, 2:], X[j, 2:]])
                X_pair_id = np.array([X[i, :2], X[j, :2]])
                generated_pairs.add(pair_key)
                X_pairs.append(X_pair)
                X_pair_ids.append(X_pair_id)

    return np.array(X_pairs, dtype='float32'), np.array(X_pair_ids)




def pairwise_generator5(X, y, batch_size):
    n_samples = X.shape[0]

    # Create a dictionary to store indices for each date
    date_indices = defaultdict(list)
    for i in range(n_samples):
        date_indices[X[i, 0]].append(i)

    while True:
        indices = np.arange(n_samples)
        np.random.shuffle(indices)  # Shuffle indices to create random batches
        batch_X = X[indices]
        batch_y = y[indices]

        X_batch = []
        y_batch = []
        generated_pairs = set()  # Reset generated pairs for each batch

        for i in range(n_samples):
            date_i = batch_X[i, 0]
            same_date_indices = date_indices[date_i]
            for j in same_date_indices:
                if i >= j:  # Ignore duplicate pairs and self-pairs
                    continue
                pair_key = tuple(sorted([i, j]))  # Use sorted indices as a unique key
                if pair_key not in generated_pairs:  # Check if pair is already generated
                    X_pair = np.array([batch_X[i, 2:], batch_X[j, 2:]])
                    y_pair = 1 if batch_y[i, 2] > batch_y[j, 2] else 0
                    X_batch.append(X_pair)
                    y_batch.append(y_pair)
                    generated_pairs.add(pair_key)

                if len(X_batch) == batch_size:
                    X_batch_array = np.array(X_batch, dtype='float32')
                    yield [X_batch_array[:, 0], X_batch_array[:, 1]], np.array(y_batch, dtype='float32').reshape(-1,1)
                    X_batch, y_batch = [], []  # Reset batches


def create_siamese_model(input_shape):
    # Define the tensors for the two input images
    left_input = Input(input_shape)
    right_input = Input(input_shape)

    # Neural Network
    model = Sequential()
    model.add(Dense(400, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01)))
    model.add(Dropout(0.5))
    model.add(Dense(200, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01)))
    model.add(Dropout(0.5))
    model.add(Dense(50, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01)))
    model.add(Dropout(0.5))

    # Generate the encodings for the two images
    encoded_l = model(left_input)
    encoded_r = model(right_input)

    # Add a customized layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])

    # Add a dense layer with a sigmoid unit to generate the similarity score
    prediction = Dense(1, activation='sigmoid')(L1_distance)

    # Connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)

    # Return the model
    return siamese_net

In [3]:
##### TRAIN #####
def train(X_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str = "../resources") -> None:
    max_date = X_train['date'].max()
    min_date = 0
    X_train_orig = X_train[(X_train['date'] < max_date) & (X_train['date'] > min_date)]
    X_test = X_train[X_train['date'] == max_date]
    y_train_orig = y_train[(y_train['date'] < max_date) & (y_train['date'] > min_date)]
    y_test = y_train[y_train['date'] == max_date]

    #Scaling
    scaler = StandardScaler()
    X_ids = np.asarray(X_train_orig[['date', 'id']])
    X_scale_pca = X_train_orig.drop(columns=['date', 'id'])
    X_scale_pca = scaler.fit_transform(X_scale_pca)


    X_test_ids = np.asarray(X_test[['date', 'id']])
    X_test_scale_pca = X_test.drop(columns=['date', 'id'])
    X_test_scale_pca = scaler.transform(X_test_scale_pca)

    #PCA
    n_components = 40
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(X_scale_pca)
    X_train_concat = np.concatenate((X_ids, pca_features), axis=1)
    y_train = np.asarray(y_train)

    pca_features_test = pca.transform(X_test_scale_pca)
    X_test_concat = np.concatenate((X_test_ids, pca_features_test), axis=1)
    y_test = np.asarray(y_test)


    #Save out Scaler and PCA
    with open(Path(model_directory_path) / 'scaler.pkl', 'wb') as file:
        pickle.dump(scaler, file)

    with open(Path(model_directory_path) / 'pca.pkl', 'wb') as file:
        pickle.dump(pca, file)

    date_list = list(set(X_train_orig['date']))

    batch_size = 500
    train_generator = pairwise_generator5(X_train_concat, y_train, batch_size)
    test_generator= pairwise_generator5(X_test_concat, y_test, batch_size)

    print(X_train_concat.shape)


    #Model Training
    model_pathname = Path('../resources') / "model.keras"

    if model_pathname.is_file():
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=8,
            verbose=0,
            mode='auto',
            baseline=None)

        history = model.fit(
            train_generator,
            batch_size=batch_size,
            steps_per_epoch = 1000,
            epochs=1000,
            validation_data=test_generator,
            validation_steps = 500,
            callbacks=[mc, early_stopping],
            shuffle=True,
            use_multiprocessing=False,
            verbose=1
        )

    else:
        #Neural Network Model
        mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=8,
            verbose=1,
            mode='auto',
            baseline=None)

        model = create_siamese_model((X_train_concat.shape[1] - 2,))

        optimizer = keras.optimizers.Adam(learning_rate=0.001)

        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['auc'])

        history = model.fit(
            train_generator,
            batch_size=batch_size,
            steps_per_epoch=1000,
            epochs=1000,
            validation_data=test_generator,
            validation_steps=500,
            callbacks=[mc, early_stopping],
            shuffle=True,
            use_multiprocessing=False,
            verbose=1
        )

        model.save(model_pathname)




    gc.collect()

    print("Finished All Training")

In [4]:
def infer(X_test: pd.DataFrame, model_directory_path: str = "../resources") -> pd.DataFrame:
    X_test_orig = X_test.copy()

    # Load Scaler
    with open(Path(model_directory_path) / 'scaler.pkl', 'rb') as file:
        scaler = pickle.load(file)


    # Scaling
    X_ids = np.asarray(X_test_orig[['date', 'id']])
    X_scale_pca = X_test_orig.drop(columns=['date', 'id'])
    X_scale_pca = scaler.transform(X_scale_pca)

    X_test_concat = np.concatenate((X_ids, X_scale_pca), axis=1)


    result_df = pd.DataFrame(columns=['date', 'id', 'value'])

    # Load Model
    model_pathname = Path(model_directory_path) / "model.keras"
    model = load_model(model_pathname)

    # Pairwise Transformation using the pairwise generator
    batch_size = 1000
    X_test, X_test_ids = pairwise_combinations_with_ids_and_values(X_test_concat)

    print("Predicting for Test Data")
    preds = model.predict(X_test)

    preds_df = pd.DataFrame({'id': X_test_ids[:, 0, 1].flatten(), 'date': X_test_ids[:, 0, 0].flatten(), 'value': preds.flatten()})
    preds_df = preds_df.groupby(['date', 'id']).mean().reset_index()

    result_df = pd.merge(X_test_orig, preds_df, on=['id', 'date'], how='left')
    result_df['value'] = result_df['value'].fillna(result_df['value'].mean())

    minmax = MinMaxScaler(feature_range=(-1, 1))
    result_df['value'] = minmax.fit_transform(result_df[['value']])

    print("Finished predicting Test Data")

    return result_df

In [5]:
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
X_test = pd.read_parquet('../data/X_test.parquet')

In [6]:
train(X_train, y_train)

(737864, 463)


UnboundLocalError: local variable 'model' referenced before assignment

In [None]:
results = infer(X_test)

In [10]:
X_train

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,451,452,453,454,455,456,457,458,459,460
0,0,dae29c8061b3176b9208f26afbb96e2ca50886db41902d...,-0.909515,0.388808,-1.535913,-0.133312,-1.826404,-0.532795,0.351273,0.158866,...,-0.731349,-0.456020,-0.257331,0.396074,0.318007,-0.538754,-0.625193,-0.753419,0.154403,1.069385
1,0,2f71f1b5d49fbd131351df95848dc91ab14662af62d4d0...,-0.107694,-0.097967,-0.539599,-0.331276,-0.942609,-0.054123,-1.212772,1.688034,...,0.610428,-0.984907,-0.429806,0.199055,0.202587,1.612578,0.302153,-0.165713,0.905807,0.083180
2,0,b8d41ef950b69f94c380410f59f47e15666c57b74573b6...,0.092316,0.052596,-0.652025,1.218241,0.382968,-0.861838,-0.318937,-0.744261,...,0.212365,-0.046016,1.147463,0.696961,-0.574426,1.255969,0.270394,1.272939,-0.643112,0.433585
3,0,cdce060d04ce28a551eaab653cc4b01f5ad878aeb932ec...,4.119639,1.018918,3.687519,1.597563,0.055918,-1.406041,0.652994,0.251138,...,1.254787,-1.155922,-1.108540,-2.046100,1.311100,-0.322965,0.999248,-1.238640,0.882844,-1.333590
4,0,86f6e6d9407ad3abfab91a3bbfb7ad71553e3f968765b8...,0.109644,-0.290280,-0.278987,-0.603259,0.136952,-1.725076,-0.062219,-0.183102,...,-2.007721,-0.482311,-0.269142,-0.899796,1.083332,0.674665,-1.095657,-0.402669,0.677189,0.319992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742665,268,5a18ddc0f252fa17cbd2a5bfe2f3786c0afb5052dd92be...,0.790984,1.560877,-0.328996,-0.190068,0.314971,-0.001609,0.313957,-0.315743,...,-1.450422,-1.044100,0.631455,-1.322626,-0.407846,0.578026,0.830650,1.414314,-0.845734,0.399335
742666,268,73c197cf1cb75641710562fe26d4f562c8228847a67949...,-1.129492,0.696247,-1.494771,-0.404022,0.909996,-0.658659,0.688591,1.634416,...,-0.475011,0.319023,-1.038112,0.222924,0.804017,-0.969177,-1.011879,-0.921781,-0.067543,0.491890
742667,268,bad7ff9ebc5579589e5ef36cb58f962c90c864fd3dfb22...,1.656413,-1.267060,0.748902,-0.196263,0.831206,-1.590837,3.079856,0.498583,...,-0.010330,-0.426130,-0.624393,-0.236483,-0.244052,1.280749,-2.001158,-1.036838,-1.959235,-2.534523
742668,268,5b968ca44ac0550be6f31470a96e572cd1c58d36cc26c7...,0.282704,0.156104,-1.165022,0.513334,-1.111948,-1.368465,-1.347184,-0.926533,...,0.411093,0.225324,-0.112838,-0.366831,-0.385833,-0.301606,0.395659,-0.895311,-0.819201,-0.996246
