In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import load_model
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from sklearn.decomposition import PCA
import pickle
import math
import gc
from collections import defaultdict
from scipy.stats import zscore

In [26]:
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

def convert_to_pairwise_train(X_train, y_train):
    pairs = []
    labels = []
    ids = []
    n_samples = X_train.shape[0]
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            pairs.append([X_train[i, 2:], X_train[j, 2:]])
            ids.append([X_train[i, :2], X_train[j, :2]])
            labels.append(1 if y_train[i] > y_train[j] else 0)
    return np.array(pairs).astype('float32'), np.array(labels).astype('float32'), np.array(ids)

def convert_to_pairwise_test(X_test):
    pairs = []
    ids = []
    n_samples = X_test.shape[0]
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            pairs.append([X_test[i, 2:], X_test[j, 2:]])
            ids.append([X_test[i, :2], X_test[j, :2]])
    return np.array(pairs).astype('float32'), np.array(ids)


def pairwise_generator(X, y, batch_size):
    n_samples = X.shape[0]

    while True:
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_X = X[start:end]
            batch_y = y[start:end]

            X_batch = []
            y_batch = []
            for i in range(len(batch_X)):
                date_i = batch_X[i, 0]
                for j in range(i+1, len(batch_X)):
                    date_j = batch_X[j, 0]
                    if date_i == date_j:
                        X_pair = [batch_X[i, 2:], batch_X[j, 2:]]
                        X_batch.append(X_pair)
                        y_batch.append(1 if batch_y[i] > batch_y[j] else 0)
                        X_id_pair = [batch_X[i, :2], batch_X[j, :2]]

            yield np.array(X_batch), np.array(y_batch)


def pairwise_generator_ids(X, y, batch_size):
    n_samples = X.shape[0]

    while True:
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_X = X[start:end]
            batch_y = y[start:end]

            X_batch = []
            X_ids_batch = []
            y_batch = []
            for i in range(len(batch_X)):
                date_i = batch_X[i, 0]
                for j in range(i+1, len(batch_X)):
                    date_j = batch_X[j, 0]
                    if date_i == date_j:
                        X_pair = [batch_X[i, 2:], batch_X[j, 2:]]
                        X_batch.append(X_pair)
                        y_batch.append(1 if batch_y[i, 2] > batch_y[j, 2] else 0)
                        X_id_pair = [batch_X[i, :2], batch_X[j, :2]]
                        X_ids_batch.append(X_id_pair)

            yield np.array(X_batch), np.array(X_ids_batch), np.array(y_batch)


def pairwise_generator2(X, y, batch_size):
    n_samples = X.shape[0]

    while True:
        indices = np.arange(n_samples)
        np.random.shuffle(indices)  # Shuffle indices to create random batches
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_indices = indices[start:end]
            batch_X = X[batch_indices]
            batch_y = y[batch_indices]

            X_batch = []
            y_batch = []
            for i in range(len(batch_X)):
                date_i = batch_X[i, 0]
                same_date_indices = [j for j in range(i+1, len(batch_X)) if batch_X[j, 0] == date_i]
                for j in same_date_indices:
                    X_pair = [batch_X[i, 2:], batch_X[j, 2:]]  # Pairs are now complete rows
                    X_batch.append(X_pair)
                    y_batch.append(1 if batch_y[i, 2] > batch_y[j, 2] else 0)

            yield np.array(X_batch, dtype='float32'), np.array(y_batch, dtype='float32')


def pairwise_generator3(X, y, batch_size):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates

    while True:
        indices = np.arange(n_samples)
        np.random.shuffle(indices)  # Shuffle indices to create random batches
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_indices = indices[start:end]
            batch_X = X[batch_indices]
            batch_y = y[batch_indices]

            X_batch = []
            y_batch = []
            for i in range(len(batch_X)):
                date_i = batch_X[i, 0]
                same_date_indices = [j for j in range(i+1, len(batch_X)) if batch_X[j, 0] == date_i]
                for j in same_date_indices:
                    pair_key = tuple(sorted((i, j)))  # Create a unique key for the pair
                    if pair_key not in generated_pairs:  # Check if pair is already generated
                        X_pair = [batch_X[i, 2:], batch_X[j, 2:]]
                        y_pair = 1 if batch_y[i, 2] > batch_y[j, 2] else 0
                        X_batch.append(X_pair)
                        y_batch.append(y_pair)
                        generated_pairs.add(pair_key)

            yield np.array(X_batch, dtype='float32'), np.array(y_batch, dtype='float32')


def pairwise_generator4(X, y, batch_size):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates

    while True:
        indices = np.arange(n_samples)
        np.random.shuffle(indices)  # Shuffle indices to create random batches
        batch_X = X[indices]
        batch_y = y[indices]

        X_batch = []
        y_batch = []
        X_batch_ids = []

        for i in range(n_samples):
            date_i = batch_X[i, 0]
            same_date_indices = [j for j in range(i+1, n_samples) if batch_X[j, 0] == date_i]
            for j in same_date_indices:
                pair_key = tuple(sorted((i, j)))  # Create a unique key for the pair
                if pair_key not in generated_pairs:  # Check if pair is already generated
                    X_pair = [batch_X[i, 2:], batch_X[j, 2:]]
                    X_pair_ids = [batch_X[i, :2], batch_X[j, :2]]
                    y_pair = 1 if batch_y[i, 2] > batch_y[j, 2] else 0
                    X_batch.append(X_pair)
                    X_batch_ids.append(X_batch_ids)
                    y_batch.append(y_pair)
                    generated_pairs.add(pair_key)

                    if len(X_batch) == batch_size:
                        yield np.array(X_batch, dtype='float32'), np.array(y_batch, dtype='float32'), np.array(X_batch_ids)
                        X_batch = []
                        y_batch = []
                        X_batch_ids = []


def pairwise_combinations_with_ids_and_values(X):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates
    X_pairs = []
    X_pair_ids = []

    for i in range(n_samples):
        date_i = X[i, 0]
        same_date_indices = [j for j in range(i + 1, n_samples) if X[j, 0] == date_i]
        for j in same_date_indices:
            pair_key = tuple(sorted((i, j)))  # Create a unique key for the pair
            if pair_key not in generated_pairs:  # Check if pair is already generated
                X_pair = np.array([X[i, 2:], X[j, 2:]])
                X_pair_id = np.array([X[i, :2], X[j, :2]])
                generated_pairs.add(pair_key)
                X_pairs.append(X_pair)
                X_pair_ids.append(X_pair_id)

    return np.array(X_pairs, dtype='float32'), np.array(X_pair_ids)




def pairwise_generator5(X, y, batch_size):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates

    while True:
        indices = np.arange(n_samples)
        np.random.shuffle(indices)  # Shuffle indices to create random batches
        batch_X = X[indices]
        batch_y = y[indices]

        X_batch = []
        y_batch = []

        # Create a dictionary to store indices for each date
        date_indices = defaultdict(list)
        for i in range(n_samples):
            date_indices[batch_X[i, 0]].append(i)

        for i in range(n_samples):
            date_i = batch_X[i, 0]
            same_date_indices = date_indices[date_i]
            for j in same_date_indices:
                if i >= j:
                    continue
                pair_key = (i, j)  # Use indices directly as a unique key
                if pair_key not in generated_pairs:  # Check if pair is already generated
                    X_pair = np.array([batch_X[i, 2:], batch_X[j, 2:]])
                    y_pair = 1 if batch_y[i, 2] > batch_y[j, 2] else 0
                    X_batch.append(X_pair)
                    y_batch.append(y_pair)
                    generated_pairs.add(pair_key)

                    if len(X_batch) == batch_size:
                        yield np.array(X_batch, dtype='float32'), np.array(y_batch, dtype='float32')
                        X_batch = []
                        y_batch = []

In [27]:
### TRAIN METHODOLOGY 2 ###
def train(X_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str = "../resources") -> None:
    max_date = X_train['date'].max()
    min_date = 150
    X_train_orig = X_train[(X_train['date'] < max_date) & (X_train['date'] > min_date)]
    X_test = X_train[X_train['date'] == max_date]
    y_train_orig = y_train[(y_train['date'] < max_date) & (y_train['date'] > min_date)]
    y_test = y_train[y_train['date'] == max_date]

    #Scaling
    scaler = StandardScaler()
    X_ids = np.asarray(X_train_orig[['date', 'id']])
    X_scale_pca = X_train_orig.drop(columns=['date', 'id'])
    X_scale_pca = scaler.fit_transform(X_scale_pca)


    X_test_ids = np.asarray(X_test[['date', 'id']])
    X_test_scale_pca = X_test.drop(columns=['date', 'id'])
    X_test_scale_pca = scaler.transform(X_test_scale_pca)

    #PCA
    n_components = 40
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(X_scale_pca)
    X_train_concat = np.concatenate((X_ids, pca_features), axis=1)
    y_train = np.asarray(y_train)

    pca_features_test = pca.transform(X_test_scale_pca)
    X_test_concat = np.concatenate((X_test_ids, pca_features_test), axis=1)
    y_test = np.asarray(y_test)


    #Save out Scaler and PCA
    with open(Path(model_directory_path) / 'scaler.pkl', 'wb') as file:
        pickle.dump(scaler, file)

    with open(Path(model_directory_path) / 'pca.pkl', 'wb') as file:
        pickle.dump(pca, file)

    date_list = list(set(X_train_orig['date']))

    batch_size = 500
    train_generator = pairwise_generator5(X_train_concat, y_train, batch_size)
    test_generator= pairwise_generator5(X_test_concat, y_test, batch_size)

    print(X_train_concat.shape)


    #Model Training
    model_pathname = Path('../resources') / "model.keras"

    if model_pathname.is_file():
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=20,
            verbose=0,
            mode='auto',
            baseline=None)

        history = model.fit(
            train_generator,
            steps_per_epoch = len(X_train_concat) // batch_size,
            batch_size=batch_size,
            epochs=30,
            validation_data=test_generator,
            validation_steps = len(X_test_concat) // batch_size,
            callbacks=[mc, early_stopping],
            shuffle=True,
            use_multiprocessing=False,
            verbose=1
        )

    else:
        #Neural Network Model
        mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=False)

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=20,
            verbose=1,
            mode='auto',
            baseline=None)

        model = keras.Sequential([
            keras.layers.Dense(200, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01), input_shape=(2, (X_train_concat.shape[1] - 2))),
            #keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.5),  # Adding dropout regularization
            keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01)),
            #keras.layers.BatchNormalization(),
            # keras.layers.Dropout(0.5),  # Adding dropout regularization
            # keras.layers.Dense(250, activation='relu', kernel_initializer='lecun_normal'),
            # keras.layers.BatchNormalization(),
            # keras.layers.Dropout(0.5),  # Adding dropout regularization
            # keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal'),
            # keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.5),  # Adding dropout regularization
            keras.layers.Dense(25, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01)),
            keras.layers.Dropout(0.5),  # Adding dropout regularization
            keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal')
        ])

        optimizer = keras.optimizers.Adam(learning_rate=3e-4)

        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        history = model.fit(
            train_generator,
            batch_size=batch_size,
            steps_per_epoch=len(X_train_concat) // batch_size,
            epochs=30,
            validation_steps=len(X_test_concat) // batch_size,
            validation_data=test_generator,
            callbacks=[mc, early_stopping],
            shuffle=True,
            use_multiprocessing=False,
            verbose=1
        )

        model.save(model_pathname)




    gc.collect()

    print("Finished All Training")

    # make sure that the train function correctly save the trained model
    # in the model_directory_path
    # print(f"Saving model in {model_pathname}")
    # joblib.dump(model, model_pathname)

In [28]:
def infer(X_test: pd.DataFrame, model_directory_path: str = "../resources") -> pd.DataFrame:
    X_test_orig = X_test.copy()

    # Load Scaler
    with open(Path(model_directory_path) / 'scaler.pkl', 'rb') as file:
        scaler = pickle.load(file)

    # Load PCA
    with open(Path(model_directory_path) / 'pca.pkl', 'rb') as file:
        pca = pickle.load(file)

    # Scaling
    X_ids = np.asarray(X_test_orig[['date', 'id']])
    X_scale_pca = X_test_orig.drop(columns=['date', 'id'])
    X_scale_pca = scaler.transform(X_scale_pca)

    # PCA
    pca_features = pca.transform(X_scale_pca)
    X_test_concat = np.concatenate((X_ids, pca_features), axis=1)


    result_df = pd.DataFrame(columns=['date', 'id', 'value'])

    # Load Model
    model_pathname = Path(model_directory_path) / "model.keras"
    model = load_model(model_pathname)

    # Pairwise Transformation using the pairwise generator
    batch_size = 1000
    X_test, X_test_ids = pairwise_combinations_with_ids_and_values(X_test_concat)

    print("Predicting for Test Data")
    preds = model.predict(X_test)

    preds_df = pd.DataFrame({'id': X_test_ids[:, 0, 1].flatten(), 'date': X_test_ids[:, 0, 0].flatten(), 'value': preds.flatten()})
    preds_df = preds_df.groupby(['date', 'id']).mean().reset_index()

    result_df = pd.merge(X_test_orig, preds_df, on=['id', 'date'], how='left')
    result_df['value'] = result_df['value'].fillna(result_df['value'].mean())

    minmax = MinMaxScaler(feature_range=(-1, 1))
    result_df['value'] = minmax.fit_transform(result_df[['value']])

    print("Finished predicting Test Data")

    return result_df

In [16]:
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
X_test = pd.read_parquet('../data/X_test.parquet')

In [17]:
train(X_train, y_train)

(415591, 42)
Epoch 1/30


ValueError: in user code:

    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\losses.py", line 2162, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "C:\Users\mikea\anaconda3\lib\site-packages\keras\backend.py", line 5677, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(

    ValueError: `logits` and `labels` must have the same shape, received ((None, 2, 1) vs (None,)).


In [59]:
X_test = X_train[X_train['date'] == 200]
y_test = y_train[y_train['date'] == 200]

In [60]:
X_test

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,451,452,453,454,455,456,457,458,459,460
489422,200,b2014a4eb6f5be0b43de60668d6cda68f481fd1a3554de...,-0.395584,0.467767,0.334684,-0.003670,-0.084305,-0.610649,-1.911305,1.869958,...,0.072215,0.064613,0.063679,-0.942424,0.437068,0.162571,-0.511908,0.700700,0.531277,0.554467
489423,200,77a12377576612a9a90570b270df5e6fe86f20fd8bacfb...,1.560541,-0.345553,0.395517,0.538143,0.061281,-0.426155,-0.099393,0.442215,...,2.041005,-1.091513,-0.949481,0.495675,0.373371,-0.370041,1.043447,-0.436087,-1.206550,0.870037
489424,200,f0cce9977234a16b9171182664a0ef16f2fa373eb4b8c1...,-0.004308,1.535068,-0.351770,-0.574043,-0.736015,-0.483100,1.136352,0.610554,...,-0.076245,0.613645,-0.076734,0.458068,-1.385604,-1.153192,2.216623,1.369278,-0.214589,1.349027
489425,200,15e3163819739dc4b9318670fbda0c7361b25dc31b3025...,1.032348,-0.108589,0.112654,-0.984792,0.000401,-0.497814,0.447101,-0.370030,...,0.302309,-0.917909,0.796511,-1.143900,-0.840966,-1.124376,-0.075706,0.875465,-0.645114,0.108495
489426,200,b61439cc04e2a6666b7b3834124952b00742885c74b4a3...,0.764759,-0.020000,-0.174101,0.250532,-0.470467,1.320027,0.844867,-0.067772,...,0.526456,-1.779329,-0.287168,1.196917,0.474941,0.009121,-0.285514,-1.211052,3.517855,-1.681823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492839,200,ebcc110654e89b43d198347ffdf95155a29dd820cba947...,0.745428,0.440497,-0.381767,1.300897,0.678993,-1.070824,-0.163306,0.051261,...,-0.479455,-1.193991,0.057748,-0.266845,-1.978869,0.173025,0.636986,-0.145487,-1.612342,0.670242
492840,200,ed93af2fb3db2b9150cd7896b2aed25973867f7b2ed9eb...,-0.961760,-0.365210,-2.539697,2.138631,0.538042,-0.479963,0.536242,-0.829076,...,-0.181034,-0.642921,1.288435,-0.998922,-0.983312,0.108591,-0.109811,0.109938,-1.127231,0.548930
492841,200,92d48ab6987d778ddc8b2eca74ff6a5c52d13afbf0e1ad...,-2.151686,2.589724,0.579317,-0.722497,-0.456387,-0.235593,-1.049381,0.218806,...,-0.037265,1.733668,0.443702,0.438292,0.559468,-1.700752,-2.230316,-0.355390,0.310145,-1.594635
492842,200,2cc2ea6e7ee8eaabe679620c414f07840c5e9226f28d0a...,-1.108856,0.340626,1.462280,0.606334,-0.547911,-0.259995,0.471951,1.366694,...,0.862505,0.735640,-1.822171,-0.510724,-2.559841,-0.556591,1.892561,1.818054,-0.408390,0.190328


In [41]:
test = infer(X_test)

Predicting for Test Data
Finished predicting Test Data


In [24]:
model.summary()

NameError: name 'model' is not defined

# TESTING

In [64]:
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
X_test = pd.read_parquet('../data/X_test.parquet')

In [61]:
max_date = X_train['date'].max()
model_directory_path = '../resources'
min_date = 150
X_train_orig = X_train[(X_train['date'] < max_date) & (X_train['date'] > min_date)]
X_test = X_train[X_train['date'] == max_date]
y_train_orig = y_train[(y_train['date'] < max_date) & (y_train['date'] > min_date)]
y_test = y_train[y_train['date'] == max_date]

#Scaling
scaler = StandardScaler()
X_ids = np.asarray(X_train_orig[['date', 'id']])
X_scale_pca = X_train_orig.drop(columns=['date', 'id'])
X_scale_pca = scaler.fit_transform(X_scale_pca)


X_test_ids = np.asarray(X_test[['date', 'id']])
X_test_scale_pca = X_test.drop(columns=['date', 'id'])
X_test_scale_pca = scaler.transform(X_test_scale_pca)

#PCA
n_components = 40
pca = PCA(n_components=n_components)
pca_features = pca.fit_transform(X_scale_pca)
X_train_concat = np.concatenate((X_ids, pca_features), axis=1)
y_train = np.asarray(y_train)

pca_features_test = pca.transform(X_test_scale_pca)
X_test_concat = np.concatenate((X_test_ids, pca_features_test), axis=1)
y_test = np.asarray(y_test)


#Save out Scaler and PCA
with open(Path(model_directory_path) / 'scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

with open(Path(model_directory_path) / 'pca.pkl', 'wb') as file:
    pickle.dump(pca, file)

date_list = list(set(X_train_orig['date']))

batch_size = 1500
train_generator = pairwise_generator5(X_train_concat, y_train, batch_size)
test_generator= pairwise_generator5(X_test_concat, y_test, batch_size)

print(X_train_concat.shape)


#Model Training
model_pathname = Path('../resources') / "model.keras"

if model_pathname.is_file():
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=20,
        verbose=0,
        mode='auto',
        baseline=None)

    history = model.fit(
        train_generator,
        steps_per_epoch = len(X_train_concat) // batch_size,
        batch_size=batch_size,
        epochs=30,
        validation_data=test_generator,
        validation_steps = len(X_test_concat) // batch_size,
        callbacks=[mc, early_stopping],
        shuffle=True,
        use_multiprocessing=False,
        verbose=1
    )

else:
    #Neural Network Model
    mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=False)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=20,
        verbose=1,
        mode='auto',
        baseline=None)

    model = keras.Sequential([
        keras.layers.Dense(200, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01), input_shape=(2, (X_train_concat.shape[1] - 2))),
        #keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.5),  # Adding dropout regularization
        keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01)),
        #keras.layers.BatchNormalization(),
        # keras.layers.Dropout(0.5),  # Adding dropout regularization
        # keras.layers.Dense(250, activation='relu', kernel_initializer='lecun_normal'),
        # keras.layers.BatchNormalization(),
        # keras.layers.Dropout(0.5),  # Adding dropout regularization
        keras.layers.Dense(100, activation='relu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.5),  # Adding dropout regularization
        keras.layers.Dense(25, activation='relu', kernel_initializer='lecun_normal', kernel_regularizer=keras.regularizers.l2(0.01)),
        keras.layers.Dropout(0.5),  # Adding dropout regularization,
        keras.layers.Flatten(),
        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='lecun_normal')
    ])
    model.summary()
    optimizer = keras.optimizers.Adam(learning_rate=3e-4)

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(
        train_generator,
        batch_size=batch_size,
        steps_per_epoch=len(X_train_concat) // batch_size,
        epochs=30,
        validation_steps=len(X_test_concat) // batch_size,
        validation_data=test_generator,
        callbacks=[mc, early_stopping],
        shuffle=True,
        use_multiprocessing=False,
        verbose=1
    )

(415591, 42)
Epoch 1/30
Epoch 1: saving model to ..\resources\model.keras
Epoch 2/30
Epoch 2: saving model to ..\resources\model.keras
Epoch 3/30
Epoch 3: saving model to ..\resources\model.keras
Epoch 4/30
Epoch 4: saving model to ..\resources\model.keras
Epoch 5/30
Epoch 5: saving model to ..\resources\model.keras
Epoch 6/30
Epoch 6: saving model to ..\resources\model.keras
Epoch 7/30
Epoch 7: saving model to ..\resources\model.keras
Epoch 8/30
Epoch 8: saving model to ..\resources\model.keras
Epoch 9/30
Epoch 9: saving model to ..\resources\model.keras
Epoch 10/30
 52/277 [====>.........................] - ETA: 1s - loss: 0.6913 - accuracy: 0.5712

KeyboardInterrupt: 

In [20]:
X_pairs, y_pairs = next(train_generator)
X_pairs.shape

(100, 2, 40)

In [23]:
y_pairs[0]

0.0

In [62]:
def pairwise_generator4(X, y, batch_size):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates

    while True:
        indices = np.arange(n_samples)
        np.random.shuffle(indices)  # Shuffle indices to create random batches
        batch_X = X[indices]
        batch_y = y[indices]

        X_batch = []
        y_batch = []

        for i in range(n_samples):
            date_i = batch_X[i, 0]
            same_date_indices = [j for j in range(i+1, n_samples) if batch_X[j, 0] == date_i]
            for j in same_date_indices:
                pair_key = tuple(sorted((i, j)))  # Create a unique key for the pair
                if pair_key not in generated_pairs:  # Check if pair is already generated
                    X_pair = [batch_X[i, 2:], batch_X[j, 2:]]
                    y_pair = 1 if batch_y[i, 2] > batch_y[j, 2] else 0
                    X_batch.append(X_pair)
                    y_batch.append(y_pair)
                    generated_pairs.add(pair_key)

                    if len(X_batch) == batch_size:
                        yield np.array(X_batch, dtype='float32'), np.array(y_batch, dtype='float32')
                        X_batch = []
                        y_batch = []

In [58]:
test_x, test_y = next(train_generator)

In [65]:
X_test = X_train[X_train['date'] == 200]
y_test = y_train[y_train['date'] == 200]

In [66]:
X_test

Unnamed: 0,date,id,0,1,2,3,4,5,6,7,...,451,452,453,454,455,456,457,458,459,460
489422,200,b2014a4eb6f5be0b43de60668d6cda68f481fd1a3554de...,-0.395584,0.467767,0.334684,-0.003670,-0.084305,-0.610649,-1.911305,1.869958,...,0.072215,0.064613,0.063679,-0.942424,0.437068,0.162571,-0.511908,0.700700,0.531277,0.554467
489423,200,77a12377576612a9a90570b270df5e6fe86f20fd8bacfb...,1.560541,-0.345553,0.395517,0.538143,0.061281,-0.426155,-0.099393,0.442215,...,2.041005,-1.091513,-0.949481,0.495675,0.373371,-0.370041,1.043447,-0.436087,-1.206550,0.870037
489424,200,f0cce9977234a16b9171182664a0ef16f2fa373eb4b8c1...,-0.004308,1.535068,-0.351770,-0.574043,-0.736015,-0.483100,1.136352,0.610554,...,-0.076245,0.613645,-0.076734,0.458068,-1.385604,-1.153192,2.216623,1.369278,-0.214589,1.349027
489425,200,15e3163819739dc4b9318670fbda0c7361b25dc31b3025...,1.032348,-0.108589,0.112654,-0.984792,0.000401,-0.497814,0.447101,-0.370030,...,0.302309,-0.917909,0.796511,-1.143900,-0.840966,-1.124376,-0.075706,0.875465,-0.645114,0.108495
489426,200,b61439cc04e2a6666b7b3834124952b00742885c74b4a3...,0.764759,-0.020000,-0.174101,0.250532,-0.470467,1.320027,0.844867,-0.067772,...,0.526456,-1.779329,-0.287168,1.196917,0.474941,0.009121,-0.285514,-1.211052,3.517855,-1.681823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492839,200,ebcc110654e89b43d198347ffdf95155a29dd820cba947...,0.745428,0.440497,-0.381767,1.300897,0.678993,-1.070824,-0.163306,0.051261,...,-0.479455,-1.193991,0.057748,-0.266845,-1.978869,0.173025,0.636986,-0.145487,-1.612342,0.670242
492840,200,ed93af2fb3db2b9150cd7896b2aed25973867f7b2ed9eb...,-0.961760,-0.365210,-2.539697,2.138631,0.538042,-0.479963,0.536242,-0.829076,...,-0.181034,-0.642921,1.288435,-0.998922,-0.983312,0.108591,-0.109811,0.109938,-1.127231,0.548930
492841,200,92d48ab6987d778ddc8b2eca74ff6a5c52d13afbf0e1ad...,-2.151686,2.589724,0.579317,-0.722497,-0.456387,-0.235593,-1.049381,0.218806,...,-0.037265,1.733668,0.443702,0.438292,0.559468,-1.700752,-2.230316,-0.355390,0.310145,-1.594635
492842,200,2cc2ea6e7ee8eaabe679620c414f07840c5e9226f28d0a...,-1.108856,0.340626,1.462280,0.606334,-0.547911,-0.259995,0.471951,1.366694,...,0.862505,0.735640,-1.822171,-0.510724,-2.559841,-0.556591,1.892561,1.818054,-0.408390,0.190328


In [67]:
X_test_orig = X_test.copy()
model_directory_path = '../resources'

# Load Scaler
with open(Path(model_directory_path) / 'scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

# Load PCA
with open(Path(model_directory_path) / 'pca.pkl', 'rb') as file:
    pca = pickle.load(file)

# Scaling
X_ids = np.asarray(X_test_orig[['date', 'id']])
X_scale_pca = X_test_orig.drop(columns=['date', 'id'])
X_scale_pca = scaler.transform(X_scale_pca)

# PCA
pca_features = pca.transform(X_scale_pca)
X_test_concat = np.concatenate((X_ids, pca_features), axis=1)


result_df = pd.DataFrame(columns=['date', 'id', 'value'])

# Load Model
model_pathname = Path(model_directory_path) / "model.keras"
model = load_model(model_pathname)

# Pairwise Transformation using the pairwise generator
batch_size = 1000
print(X_test_concat.shape)
X_test, X_test_ids = pairwise_combinations_with_ids_and_values(X_test_concat)

print(X_test.shape)
print(X_test_ids.shape)

print("Predicting for Test Data")
preds = model.predict(X_test)

(3422, 42)
(5853331, 2, 40)
(5853331, 2, 2)
Predicting for Test Data


In [68]:
preds_df = pd.DataFrame({'id': X_test_ids[:, 0, 1].flatten(), 'date': X_test_ids[:, 0, 0].flatten(), 'value': preds.flatten()})

In [69]:
preds_df.groupby(['date', 'id']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
date,id,Unnamed: 2_level_1
200,0015fa8023c5a2dc26aa87db2f08ec22f81cf6c61f2bcd924c4c9977592fdeab,0.493242
200,001928d043ab0d61e39b427167ba901e23d285dedf7b0bb311baeddfd34b3bbd,0.493242
200,001f30cfd62dbb5e7b1276d8a276020b02dce7a92f87fbb74ce4a043358411b7,0.493242
200,00265585a914d27805c5a8eb6553c8329b746e461b8ea37ea371aaf9adc7a436,0.493242
200,0026c90075906a177273f76e8ab9fee6b68673d375039c8bb489a70820bb305a,0.493242
200,...,...
200,ffc43b32123546b772e465d21512cd89ea97fc7063537f60690993b58d92fb34,0.493242
200,ffd0a94ebb8f96e980096862bbc838900ddf7859b342096beb7ffb6fab9e934a,0.493242
200,ffe02c37d359764065d96d007255b4b27e34fe8f81cb4766a6dcf6f2ba40d799,0.493242
200,ffe43c8362b114c35887ce0a2dda3a6ffef413856e854fae8a2151fac2321c58,0.493242
