In [3]:
import pandas as pd
import joblib
import scipy.stats as stats
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model
from pathlib import Path
import pickle

In [8]:
def train(X_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str = "resources") -> None:
    #Recursive Feature Elimination
    estimator = LinearRegression()
    rfe = RFE(estimator, n_features_to_select=50, step=0.05)
    X_train = X_train.iloc[:, 2:]
    y_train = y_train.iloc[:,2:]
    rfe.fit(X_train, y_train)
    selected_features = X_train.columns[rfe.support_]
    X_train = X_train[selected_features]

    with open(Path(model_directory_path) / 'selected_features.pkl', 'wb') as file:
        pickle.dump(selected_features, file)

    #Scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    joblib.dump(scaler, Path(model_directory_path) / 'scaler.h5')

    #Get train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, shuffle=True, test_size=0.3)

    #Spearman Rank Metric
    def get_spearman_rankcor(y_true, y_pred):
        return ( tf.py_function(stats.spearmanr, [tf.cast(y_pred, tf.float32),
                                                  tf.cast(y_true, tf.float32)], Tout = tf.float32) )

    model_pathname = Path(model_directory_path) / "model.h5"
    #Neural Network Model
    mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=0,
        mode='auto',
        baseline=None,
        restore_best_weights=True)

    model = keras.Sequential([
        keras.layers.Dense(50, activation='selu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(50, activation='selu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(25, activation='selu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(3, activation='selu', kernel_initializer='lecun_normal'),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(1, activation='relu', kernel_initializer='lecun_normal')
    ])

    optimizer = keras.optimizers.Adam(learning_rate=0.0001)

    model.compile(optimizer=optimizer,
                  loss='mean_squared_error',
                  metrics=[get_spearman_rankcor])

    history = model.fit(
        X_train,
        y_train,
        batch_size=5000,
        epochs=5000,
        validation_data=[X_test, y_test],
        callbacks=[mc, early_stopping],
        shuffle=True,
        use_multiprocessing=True
    )

    # make sure that the train function correctly save the trained model
    # in the model_directory_path
    # print(f"Saving model in {model_pathname}")
    # joblib.dump(model, model_pathname)

In [9]:
def infer(X_test: pd.DataFrame, model_directory_path: str = "resources") -> pd.DataFrame:
    scaler = joblib.load(Path(model_directory_path) / 'scaler.h5')
    model = load_model(Path(model_directory_path) / "model.h5")

    with open(Path(model_directory_path) / 'selected_features.pkl', 'rb') as file:
        selected_features = pickle.load(file)

    y_test_predicted = X_test[["date", "id"]].copy()

    X_test = X_test.iloc[:, 2:]
    X_test = X_test[selected_features]
    X_test = scaler.transform(X_test)
    # creating the predicted label dataframe with correct dates and ids
    y_test_predicted["value"] = model.predict(X_test)

    return y_test_predicted

In [10]:
X_test = pd.read_parquet('data/X_test.parquet')
X_train = pd.read_parquet('data/X_train.parquet')
y_train = pd.read_parquet('data/X_train.parquet')

In [11]:
train(X_train, y_train)

Epoch 1/5000
Epoch 1: val_loss improved from inf to 1.14290, saving model to resources\model.h5
Epoch 2/5000
Epoch 2: val_loss improved from 1.14290 to 1.06608, saving model to resources\model.h5
Epoch 3/5000
Epoch 3: val_loss improved from 1.06608 to 1.03818, saving model to resources\model.h5
Epoch 4/5000

KeyboardInterrupt: 