In [1]:
import os
from math import erfc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras.layers as KL

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [2]:
path_raw = '../data/md_raw_dataset.csv'
path_target = '../data/md_target_dataset.csv'

In [3]:
df_raw = pd.read_csv(path_raw, sep=';')
df_target = pd.read_csv(path_target, sep=';')

In [4]:
# non numerical and dates columns
non_numerical = ['super_hero_group', 'crystal_type', 'Cycle']
dates = ['when', 'expected_start', 'start_process', 'start_subprocess1', 'start_critical_subprocess1', 'predicted_process_end', \
         'process_end', 'subprocess1_end', 'reported_on_tower', 'opened']

In [5]:
# merge the targets to the features
df = df_raw.join(df_target['target'], lsuffix='_caller', rsuffix='_other')
# get only data with target values
df = df[:len(df_target)]
# remove columns with no name
#df = df.drop([c for c in df.columns if 'Unnamed' in c], axis=1)

# convert string dates to pandas date and split year, month, day, hour, minute, seconds in different columns
for col in dates:
    df[col] = pd.to_datetime(df[col], infer_datetime_format=True, errors='coerce')
    #df[col+'_year']   = df[col].dt.year
    #df[col+'_month']  = df[col].dt.month
    #df[col+'_day']    = df[col].dt.day
    df[col+'_time']   = df[col].dt.hour*60**2 + df[col].dt.minute*60 + df[col].dt.second
    
## convert values to logarithm
#for col in df._get_numeric_data().columns:
#    if df[col].min()>0:
#        df[col] = np.log(df[col])

# handle non numerical data by assignin unique numerical values
for col in non_numerical:
    uniques = df[col].unique()
    mapping = dict(zip(uniques, range(len(uniques))))
    
    df[col] = df[col].apply(lambda x:mapping[x])

df

Unnamed: 0.1,Unnamed: 0,when,super_hero_group,tracking,place,tracking_times,crystal_type,Unnamed: 7,human_behavior_report,human_measure,...,when_time,expected_start_time,start_process_time,start_subprocess1_time,start_critical_subprocess1_time,predicted_process_end_time,process_end_time,subprocess1_end_time,reported_on_tower_time,opened_time
0,0,2020-09-07,0,84921,1,1,0,2,3,650,...,0.0,47400.0,47280.0,47460.0,47580.0,49260.0,48480.0,48420.0,49020.0,
1,1,2020-09-07,0,84941,1,1,1,1,4,700,...,0.0,54480.0,54660.0,54960.0,55080.0,56940.0,56340.0,56280.0,57180.0,
2,2,2020-09-07,0,84951,1,1,1,2,4,800,...,0.0,58500.0,58560.0,58800.0,58920.0,60840.0,60120.0,60060.0,60840.0,
3,3,2020-09-07,0,84971,1,1,1,7,3,700,...,0.0,66120.0,66240.0,66660.0,66780.0,68520.0,67620.0,67620.0,68100.0,68520.0
4,4,2020-09-07,0,84981,1,1,0,17,3,700,...,0.0,69240.0,69120.0,69360.0,,72780.0,70620.0,70620.0,71220.0,73200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9584,64,2019-09-11,3,546351,2,1,37,17,3,850,...,0.0,33300.0,33720.0,34020.0,34920.0,36480.0,36480.0,36240.0,37320.0,37680.0
9585,65,2019-09-11,3,546361,2,1,37,16,4,570,...,0.0,39120.0,38880.0,39180.0,39540.0,41220.0,41160.0,40980.0,41640.0,41880.0
9586,66,2019-09-11,3,546371,2,1,24,14,4,550,...,0.0,,42240.0,42480.0,42960.0,52380.0,46080.0,45960.0,54240.0,55320.0
9587,67,2019-09-11,3,546381,2,1,37,11,4,530,...,0.0,52200.0,52860.0,53640.0,,55620.0,54780.0,54480.0,60960.0,63360.0


In [6]:
# remove not numerical columns
df_num = df._get_numeric_data()
df_num = df_num.drop('place', axis=1)

In [7]:
# fill nan values with the previous value and then after value
df_num = df_num.fillna(method='ffill')
df_num = df_num.fillna(method='bfill')

In [8]:
# use chauvenet criterion to eliminate outliners
def chauvenet(array):
    N = len(array)
    criterion = 1.0/(3*N)
    norm = abs(array-array.mean())/array.std()
    prob = np.array([erfc(d) for d in norm])
    
    return prob < criterion

for col in df_num.columns:
    if col=='target':
        continue
    
    if len(df_num.loc[chauvenet(df_num[col]), col])>0:
        df_num.loc[chauvenet(df_num[col]), col] = np.nan

# fill nan values with the previous value and then after value
df_num = df_num.fillna(method='ffill')
df_num = df_num.fillna(method='bfill')

In [9]:
df_num

Unnamed: 0.1,Unnamed: 0,super_hero_group,tracking,tracking_times,crystal_type,Unnamed: 7,human_behavior_report,human_measure,crystal_weight,expected_factor_x,...,when_time,expected_start_time,start_process_time,start_subprocess1_time,start_critical_subprocess1_time,predicted_process_end_time,process_end_time,subprocess1_end_time,reported_on_tower_time,opened_time
0,0,0,84921,1.0,0.0,2,3,650.0,345.2483,1616.0,...,0.0,47400.0,47280.0,47460.0,47580.0,49260.0,48480.0,48420.0,49020.0,68520.0
1,1,0,84941,1.0,1.0,1,4,700.0,350.6301,1610.0,...,0.0,54480.0,54660.0,54960.0,55080.0,56940.0,56340.0,56280.0,57180.0,68520.0
2,2,0,84951,1.0,1.0,2,4,800.0,347.4298,1609.0,...,0.0,58500.0,58560.0,58800.0,58920.0,60840.0,60120.0,60060.0,60840.0,68520.0
3,3,0,84971,1.0,1.0,7,3,700.0,333.1576,1622.0,...,0.0,66120.0,66240.0,66660.0,66780.0,68520.0,67620.0,67620.0,68100.0,68520.0
4,4,0,84981,1.0,0.0,17,3,700.0,362.3764,1620.0,...,0.0,69240.0,69120.0,69360.0,66780.0,72780.0,70620.0,70620.0,71220.0,73200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9584,64,3,546351,1.0,37.0,17,3,850.0,350.2309,1656.0,...,0.0,33300.0,33720.0,34020.0,34920.0,36480.0,36480.0,36240.0,37320.0,37680.0
9585,65,3,546361,1.0,37.0,16,4,570.0,344.8642,1627.0,...,0.0,39120.0,38880.0,39180.0,39540.0,41220.0,41160.0,40980.0,41640.0,41880.0
9586,66,3,546371,1.0,24.0,14,4,550.0,344.8859,1629.0,...,0.0,39120.0,42240.0,42480.0,42960.0,52380.0,46080.0,45960.0,54240.0,55320.0
9587,67,3,546381,1.0,37.0,11,4,530.0,356.2083,1656.0,...,0.0,52200.0,52860.0,53640.0,42960.0,55620.0,54780.0,54480.0,60960.0,63360.0


In [10]:
# filters by quasi-constant variance threshold
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(df_num)
df_num = df_num[df_num.columns[constant_filter.get_support()]]

In [11]:
# fill nan values with the previous value and then after value
df_num = df_num.fillna(method='ffill')
df_num = df_num.fillna(method='bfill')

In [12]:
# remove low correlated features with the targets
threshold = 0.01
correlated_features = set()
correlation_matrix = df_num.corr()
df_corr = df_num.copy()

df_corr = df_corr[df_corr.columns[correlation_matrix['target'].abs() > threshold]]

In [13]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(correlation_matrix['target'])

Unnamed: 0                         0.843047
super_hero_group                   0.012475
tracking                           0.009840
crystal_type                       0.027164
Unnamed: 7                         0.026970
human_behavior_report              0.005532
human_measure                     -0.029758
crystal_weight                    -0.016181
expected_factor_x                 -0.021367
previous_factor_x                 -0.029191
first_factor_x                    -0.034596
expected_final_factor_x           -0.031521
final_factor_x                    -0.052043
previous_adamantium               -0.018024
Unnamed: 17                       -0.034404
chemical_x                         0.032057
raw_kryptonite                     0.002776
argon                              0.021545
pure_seastone                      0.014700
Cycle                              0.043598
groups                            -0.044000
target                             1.000000
expected_start_time             

In [14]:
# normalize data
df_norm = df_corr.copy()
for col in df_norm.columns:
    # output to [-1,1]
    if col=='target':
        df_norm[col] = 2*(df_corr[col]-df_corr[col].min())/(df_corr[col].max()-df_corr[col].min())-1
    # inputs to mean=0 and std=1
    else:
        df_norm[col] = (df_corr[col]-df_corr[col].mean())/df_corr[col].std()

#df_norm = 2*(df_corr-df_corr.min())/(df_corr.max()-df_corr.min())-1

In [15]:
#for col in df_norm.columns:
#    plt.figure(figsize=(15,10))
#    plt.plot(df_norm[col])
#    plt.title(col)
#    plt.show()

In [16]:
X = np.array(df_norm.drop('target', axis=1))
y = np.array(df_norm['target'])

X.shape, y.shape

((9589, 26), (9589,))

In [17]:
# define small NN
def get_model():
    model = tf.keras.Sequential([
        KL.Dense(128, input_shape=(X.shape[-1],)),
        KL.PReLU(),
        KL.Dense(256),
        KL.PReLU(),
        KL.Dense(128),
        KL.PReLU(),
        KL.Dense(1, activation='tanh')
    ])

    model.compile(tf.keras.optimizers.Adam(learning_rate=3e-3), loss='huber')
    return model

In [18]:
skf = KFold(n_splits=10)
skf.get_n_splits(X, y)

scores = []
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = get_model()
    model.fit(
        x=X_train,
        y=y_train,
        shuffle=True,
        batch_size=64,
        epochs=100,
        callbacks = [
            tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
            tf.keras.callbacks.ReduceLROnPlateau(factor=0.5,patience=2),
            tf.keras.callbacks.ModelCheckpoint(f'nn_model_{i}.h5', save_best_only=True),
        ],
        validation_data = (X_test, y_test),
        verbose=0
    )
    
    model.load_weights(f'nn_model_{i}.h5')
    pred = model.predict(X_test)
    scores.append(r2_score(y_test, pred))
    
    print(scores[-1])
    del model

print('mean value: ', np.mean(scores))

0.7514698811362043
0.6971932969503969
0.6561059872564254
0.7605021390691483
0.7577353661576604
0.6619560069868842
0.7295769874575626
0.6860474508785137
0.6556093838300774
0.6860436560880052
mean value:  0.7042240155810878


In [22]:
from glob import glob
from tensorflow.keras.models import load_model

path_models = glob('./*.h5')
models = [load_model(path) for path in path_models]

In [31]:
base = get_model()
weights = [m.weights for m in models]

In [27]:
# mean for all models
pred = [m.predict(X) for m in models]
pred = np.mean(pred, axis=0)
r2_score(y, pred)

0.7948583445143478

In [26]:
pred.shape

(9589, 1)

| estimator              | score              |
|------------------------|--------------------|
|SVR(C=1.0, epsilon=0.1) | 0.6953288562363568 |
|SGDRegressor(loss='huber', penalty='l1', average=True) | 0.6977306996875166 |
|Ridge(alpha=2.0) | 0.6919560554210303 |
|NN no batch + mean/std + PReLU | 0.7129820329488472 |
|NN batch + [-1,1] + PReLU | 0.722115513271296 |
|NN no batch + [-1,1] + PReLU | 0.740501847364801 |
|NN no batch + [-1,1] + ReLU | 0.7293876523359863 |
|NN no batch + [-1,1] + PReLU + time | 0.7116691856520748 |
|NN no batch + [-1,1] + PReLU + time + log | 0.6194547509831078 |
|NN no batch + + mean/std + gt[-1,1] + PReLU + time + log | 0.6946419211535133 |