In [42]:
import os
from tqdm import tqdm
import numpy as np
from xgboost import XGBRegressor
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [43]:
X_train = pd.read_pickle('./397/X_train_base.pkl').astype(float)
y_train = pd.read_pickle('./397/y_train_base.pkl').astype(float)
X_val = pd.read_pickle('./397/X_val_base.pkl').astype(float)
y_val = pd.read_pickle('./397/y_val_base.pkl').astype(float)
X_test = pd.read_pickle('./397/X_test_base.pkl').astype(float)
y_test = pd.read_pickle('./397/y_test_base.pkl').astype(float)

# XGBoost 

In [44]:
n_folds = 3
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=0)
train_fold_predict = np.zeros((X_train.shape[0], 1))
test_predict = np.zeros((X_test.shape[0], n_folds))

In [45]:
xgboost_reg = XGBRegressor(n_estimators = 400,
                            learning_rate = 0.04,
                            colsample_bytree = 0.8,
                            subsample = 0.7 )

In [46]:
y_train

116913    0.006292
115458    0.005039
375236    0.011085
400420    0.004263
158032    0.002898
            ...   
213316    0.002781
220941    0.001824
378784    0.002106
402197    0.001141
247224    0.007090
Name: target, Length: 299050, dtype: float64

In [47]:
for cv_num, (train_index, val_index) in tqdm(enumerate(kfold.split(X_train))):
    X_train_ = X_train.iloc[train_index,:]
    y_train_ = y_train.iloc[train_index]
    X_val_ = X_train.iloc[val_index,:]

    xgboost_reg.fit(X_train_,y_train_)

    train_fold_predict[val_index,:] = xgboost_reg.predict(X_val_).reshape(-1,1)
    test_predict[:,cv_num] = xgboost_reg.predict(X_test)

xgb_test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
xgb_train_predict = train_fold_predict

3it [19:05, 381.83s/it]


In [48]:
xgb_test_predict_mean.shape

(85444, 1)

In [49]:
xgb_train_predict.shape

(299050, 1)

# CNN

In [50]:
X_train = pd.read_pickle('./397/X_train_base.pkl').astype(float)
y_train = pd.read_pickle('./397/y_train_base.pkl').astype(float)
X_val = pd.read_pickle('./397/X_val_base.pkl').astype(float)
y_val = pd.read_pickle('./397/y_val_base.pkl').astype(float)
X_test = pd.read_pickle('./397/X_test_base.pkl').astype(float)
y_test = pd.read_pickle('./397/y_test_base.pkl').astype(float)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)
X_test = scaler.fit_transform(X_test)

In [51]:
def np_rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true-y_pred)/y_true)))

In [52]:
def rmspe(y_true, y_pred):
    return K.sqrt(K.mean(K.square((y_true-y_pred)/y_true)))

In [53]:
def CNN(X_train, y_train, X_val, y_val, num_columns, num_labels, learning_rate, epochs):
    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dense(256, activation='LeakyReLU')(x)
    x = tf.keras.layers.Reshape((16,16))(x)
    x = tf.keras.layers.Conv1D(filters=12, kernel_size=2, strides=1, activation='LeakyReLU')(x)
    x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
    x = tf.keras.layers.Flatten()(x)

    for i in range(3):
        x = tf.keras.layers.Dense(64//(2**i), activation='LeakyReLU')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.GaussianNoise(0.01)(x)
        x = tf.keras.layers.Dropout(0.20)(x)

    x = tf.keras.layers.Dense(num_labels)(x)

    model = tf.keras.models.Model(inputs=inp, outputs=x)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss=rmspe)

    rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_delta=1e-5, verbose=2, eps=1e-5)
    es = EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=31, restore_best_weights=True, verbose=2)   
    
    history = model.fit(X_train, y_train, epochs=epochs,validation_data=(X_val, y_val), validation_batch_size=len(y_val),batch_size=batch_size,verbose=1, callbacks=[rlr,es])

    return model, history

In [54]:
y_train

116913    0.006292
115458    0.005039
375236    0.011085
400420    0.004263
158032    0.002898
            ...   
213316    0.002781
220941    0.001824
378784    0.002106
402197    0.001141
247224    0.007090
Name: target, Length: 299050, dtype: float64

In [55]:
tf.random.set_seed(777)
num_columns = X_train.shape[1]
num_labels = 1
learning_rate = 6e-3

tf.random.set_seed(777)
batch_size = 1024
dropout_rates = 0
epochs = 1000

n_folds = 3
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=0)
train_fold_predict = np.zeros((X_train.shape[0], 1))
test_predict = np.zeros((X_test.shape[0], n_folds))

for cv_num, (train_index, val_index) in tqdm(enumerate(kfold.split(X_train))):
    X_train_ = X_train[train_index,:]
    y_train_ = y_train.iloc[train_index]
    X_val_ = X_train[val_index,:]

    model = CNN(X_train_, y_train_, X_val, y_val, num_columns, num_labels, learning_rate, epochs)

    train_fold_predict[val_index,:] = model[0].predict(X_val_).reshape(-1,1)
    test_predict[:,cv_num] = model[0].predict(np.array(X_test)).reshape(-1)

cnn_test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
cnn_train_predict = train_fold_predict

cnn_test_predict_mean.shape, cnn_train_predict.shape

0it [00:00, ?it/s]

Epoch 1/1000


2022-10-24 22:50:06.192798: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-10-24 22:50:10.923195: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.003000000026077032.
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000

Epoch 00034: ReduceLROnPlateau reducing learning rate to 0.001500000013038516.
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000

Epoch 00042: ReduceLROnPlateau reducing learning rate to 0.000750000006519258.
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000

Epoch 00047: ReduceLROnPlateau reducing learning rate to 0.000375000003259629.
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Ep

2022-10-24 22:57:26.399171: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
1it [07:32, 452.57s/it]

Epoch 1/1000


2022-10-24 22:57:39.217024: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-10-24 22:57:44.078880: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.003000000026077032.
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.001500000013038516.
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.000750000006519258.
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000

Epoch 00053: ReduceLROnPlateau reducing

2022-10-24 23:05:16.740268: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2it [15:23, 463.21s/it]

Epoch 1/1000


2022-10-24 23:05:29.731950: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-10-24 23:05:35.038726: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.003000000026077032.
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.001500000013038516.
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.000750000006519258.
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000

Epoch 00034: ReduceLROnPlateau reducing learning rate to 0.000375000003259629.
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000

Epoch 00039: ReduceLROnPlateau reducing learning rate to 0.0001875000016298145.
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000

Epoc

2022-10-24 23:10:39.918683: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
3it [20:45, 415.12s/it]


((85444, 1), (299050, 1))

# MLP

In [56]:
def np_rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true-y_pred)/y_true)))

In [57]:
def rmspe(y_true, y_pred):
    return K.sqrt(K.mean(K.square((y_true-y_pred)/y_true)))

In [58]:
def MLP(X_train, y_train, X_val, y_val):

    model = tf.keras.models.Sequential()
    model.add(
        tf.keras.layers.Dense(
            units=198,
            input_dim=X_train.shape[1],
            kernel_initializer='glorot_uniform',
            activation='ReLU'
        ))

    model.add(
        tf.keras.layers.Dense(
            units=49,
            input_dim=198,
            kernel_initializer='glorot_uniform',
            activation='ReLU'
        ))
        
    model.add(
        tf.keras.layers.Dense(
            units=1,
            input_dim=49,
            kernel_initializer='glorot_uniform',
    ))
    model.summary()
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                loss=rmspe)

    rlr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_delta=1e-5, verbose=1)
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=11, restore_best_weights=True, verbose=1)
    callback_list = [rlr, es]
    history = model.fit(X_train, y_train,
                        batch_size=500, epochs=1000, verbose=1,
                        validation_data=(X_val, y_val), callbacks=callback_list
    )

    return model

In [59]:
X_train[:100,0].shape

(100,)

In [60]:
tmp = X_train[:100,0].reshape(-1,1)

In [61]:
tmp.reshape(-1).shape

(100,)

In [64]:
tf.random.set_seed(777)

n_folds = 3
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=0)
train_fold_predict = np.zeros((X_train.shape[0], 1))
test_predict = np.zeros((X_test.shape[0], n_folds))

for cv_num, (train_index, val_index) in tqdm(enumerate(kfold.split(X_train))):
    X_train_ = X_train[train_index,:]
    y_train_ = y_train.iloc[train_index]
    X_val_ = X_train[val_index,:]

    model = MLP(X_train_, y_train_, X_val, y_val)

    train_fold_predict[val_index,:] = model.predict(X_val_).reshape(-1,1)
    test_predict[:,cv_num] = model.predict(np.array(X_test)).reshape(-1)

mlp_test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
mlp_train_predict = train_fold_predict

0it [00:00, ?it/s]

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 198)               66528     
_________________________________________________________________
dense_46 (Dense)             (None, 49)                9751      
_________________________________________________________________
dense_47 (Dense)             (None, 1)                 50        
Total params: 76,329
Trainable params: 76,329
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000


2022-10-24 23:16:37.783978: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-10-24 23:16:42.529132: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000

Epoch 00024: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000

Epoch 00029: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000

Epoch 00043: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 44/1000
Ep

2022-10-24 23:19:47.060491: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
1it [03:18, 198.39s/it]

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_48 (Dense)             (None, 198)               66528     
_________________________________________________________________
dense_49 (Dense)             (None, 49)                9751      
_________________________________________________________________
dense_50 (Dense)             (None, 1)                 50        
Total params: 76,329
Trainable params: 76,329
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
  7/399 [..............................] - ETA: 3s - loss: 160.5647  

2022-10-24 23:19:55.002473: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-10-24 23:19:58.194683: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000

Epoch 00036: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000

Epoch 00040: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Ep

2022-10-24 23:22:36.144298: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2it [06:06, 180.84s/it]

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_51 (Dense)             (None, 198)               66528     
_________________________________________________________________
dense_52 (Dense)             (None, 49)                9751      
_________________________________________________________________
dense_53 (Dense)             (None, 1)                 50        
Total params: 76,329
Trainable params: 76,329
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000


2022-10-24 23:22:43.683257: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-10-24 23:22:46.884076: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000

Epoch 00038: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000

Epoch 00041: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Ep

2022-10-24 23:25:30.753202: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
3it [09:00, 180.28s/it]


In [65]:
new_X_train = np.concatenate((xgb_train_predict, cnn_train_predict, mlp_train_predict), axis=1)
new_X_test = np.concatenate((xgb_test_predict_mean, cnn_test_predict_mean, mlp_test_predict_mean), axis=1)

print(new_X_train.shape, new_X_test.shape)

(299050, 3) (85444, 3)


In [66]:
final_model = XGBRegressor()
final_model.fit(new_X_train, y_train)
y_pred_final = final_model.predict(new_X_test)

In [82]:
print(f'Final RMSPE: {rmspe(y_test, y_pred_final).numpy()}')

Final RMSPE: 0.2375852139618925
