In [286]:
# import general modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.svm import OneClassSVM
import xgboost as xgb
from helpers import *
import pandas as pd
import numpy as np

# import specialised modules
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Features are calculated for the whole data. This only shows how it was done, the results are already saved as csv.

In [287]:
new_features = False

if new_features:
    # prepare the X data for analysis
    X_ = pd.read_csv('X_train.csv', engine='c')
    X_.drop(columns='id', inplace=True)
    col_names = X_.index
    # transform the data
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_processed = pd.DataFrame(scaler.fit_transform(X_.transpose()).transpose())
    # convert the frame to np arrays and remove the nans
    X_processed = [item[~np.isnan(item)] for item in X_processed.to_numpy()]
    #extracting noise features
    noise_df = extract_noise_features(X_processed)
    X_pp = preprocess(X_processed)
    #extract features based on temporal and frequental things
    df = extract_features(X_pp)
    df = pd.concat([df, noise_df], axis=1)
    df.to_csv('train_features.csv')
    X_ = pd.read_csv('X_test.csv', engine='c')
    X_.drop(columns='id', inplace=True)
    col_names = X_.index
    # transform the data
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_processed = pd.DataFrame(scaler.fit_transform(X_.transpose()).transpose())
    # convert the frame to np arrays and remove the nans
    X_processed = [item[~np.isnan(item)] for item in X_processed.to_numpy()]
    #extracting noise features
    noise_df = extract_noise_features(X_processed)
    X_pp = preprocess(X_processed)
    #extract features based on temporal and frequental things
    df = extract_features(X_pp)
    df = pd.concat([df, noise_df], axis=1)
    df.to_csv('test_features.csv')

In [288]:
#print general description
df = pd.read_csv('train_features.csv').drop(['Unnamed: 0'],axis=1)
df.describe(include='all')

Unnamed: 0,mean,var,min,max,delta,delta_hr,var_hr,mean_rp,min_rp,max_rp,...,fbin4,fbin5,fbin6,fbin7,fbin8,fbin9,fbin10,fmean,fstd,snr
count,5117.0,5117.0,5117.0,5117.0,5117.0,5112.0,5112.0,5117.0,5117.0,5117.0,...,5117.0,5117.0,5117.0,5117.0,5117.0,5117.0,5117.0,5117.0,5117.0,5117.0
mean,-0.002463,0.070645,-0.457473,0.361939,0.819412,24.32378,78.459586,251.014966,186.859683,319.230995,...,5.690467,8.057255,43.461169,264.264152,742.820371,2034.227951,4367.077673,15.493249,11.017222,1.171283
std,0.020138,0.02686,0.181384,0.13233,0.253523,24.526138,165.437718,53.370393,73.384114,110.647062,...,7.80665,7.998656,30.528956,187.327562,516.488174,1399.88977,3064.881309,6.534356,5.221419,9.092662
min,-0.088494,0.007019,-0.863601,0.003723,0.133684,0.128576,0.002755,84.484211,1.0,88.0,...,0.18034,0.29257,3.644086,10.171169,27.114916,96.757229,187.310408,1.44383,0.919977,-86.63999
25%,-0.013215,0.049633,-0.610884,0.272008,0.63602,6.819845,2.950418,217.868421,128.0,258.0,...,1.955814,3.711286,24.938766,148.433714,420.051147,1125.084032,2329.188564,10.284164,7.075823,-2.190189
50%,-0.005024,0.074205,-0.468133,0.352294,0.861386,14.43081,12.749544,249.625,196.0,298.0,...,3.09454,5.549801,35.357642,219.707239,641.945791,1818.335679,3956.120848,15.793983,10.90385,3.618137
75%,0.006963,0.091962,-0.308277,0.437741,1.034896,34.511353,71.124028,282.285714,240.0,352.0,...,5.675709,9.011174,51.986088,319.650996,885.903092,2427.495005,5274.26319,20.209531,14.298418,6.966429
max,0.100316,0.144354,-0.02236,0.834691,1.373629,139.490994,2191.155354,794.888889,638.0,2697.0,...,91.62478,95.006648,352.761849,1923.670048,4348.883447,10595.43755,19144.804254,37.314805,46.819868,21.845221


In [289]:
# replace inf with nan
df.replace([np.inf, -np.inf], np.NaN, inplace=True)
imp = KNNImputer(n_neighbors=4, weights='distance')
# imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
df_X = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

In [290]:
y = pd.read_csv('y_train.csv')
y.drop(columns='id', inplace=True)

Remove outliers

In [291]:
samples_before = df_X.shape[0]

# trans = ExperimentalTransformer(OneClassSVM(nu=0.995))
model = IsolationForest(contamination="auto")
outl_pred = model.fit_predict(df_X)
mask = outl_pred != -1

X_selection, y = df_X[mask], y[mask]

samples_after = X_selection.shape[0]

print(f'Data size reduced from {samples_before} to {samples_after}')
df_X = X_selection

data_train_backup = df_X.copy()

Data size reduced from 5117 to 4726


Find most important features with a random forest, since different libraries are used and they showed very inconsistant performance.
The 40 best features are used.

In [292]:
# feature importance (optional)
from sklearn.feature_selection import RFE

rfe = RFE(estimator=RandomForestClassifier(n_estimators=200, n_jobs=-1), n_features_to_select=40, step=0.05)
rfe.fit(df_X, y.values.ravel())
df_X = rfe.transform(df_X)

Standard train test split

In [293]:
# X_train, X_test, y_train, y_test = train_test_split(X_selection, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(df_X, y, random_state=42)

## Model

XGB is often used for tabular data for many reasons and it's potential was again showed in the recent paper [Tabular Data: Deep Learning is Not All You Need](https://arxiv.org/abs/2106.03253).

The hyperopt package is used to maximize the AUC while finding the best hyperparameters.

In [294]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import warnings
warnings.filterwarnings('ignore')

space = {'max_depth': hp.quniform("max_depth", 3, 12, 1),
         'gamma': hp.uniform('gamma', 1, 9),
         'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
         'reg_lambda': hp.uniform('reg_lambda', 0, 1),
         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
         'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
         'n_estimators': 1000,
         'seed': 0
         }

def objective(space):
    clf = xgb.XGBClassifier(
        use_label_encoder=False,n_jobs=-1,
        n_estimators=space['n_estimators'], max_depth=int(space['max_depth']), gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']), min_child_weight=int(space['min_child_weight']),
        colsample_bytree=int(space['colsample_bytree']))

    evaluation = [(X_train, y_train.values.ravel()),
                  (X_test, y_test.values.ravel())]

    clf.fit(X_train, y_train.values.ravel(),
            eval_set=evaluation, eval_metric="mlogloss",
            early_stopping_rounds=10, verbose=False)

    pred = clf.predict(X_test)
    accuracy = f1_score(y_test.values.ravel(), pred, average='micro')
    return {'loss': -accuracy, 'status': STATUS_OK}

In [295]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 500,
                        trials = trials)

best_xgb = xgb.XGBClassifier(best_hyperparams)
best_xgb.fit(X_train, y_train.values.ravel(), verbose=False)

100%|█████████████████████████████████████████████| 500/500 [05:41<00:00,  1.46trial/s, best loss: -0.7428087986463621]


In [296]:
print(f1_score(y_test, best_xgb.predict(X_test), average=None))
print(f1_score(y_test, best_xgb.predict(X_test), average='micro'))

[0.88740839 0.62569832 0.70170015 0.48648649]
0.8104906937394247


Before using the model on the submission, it was finetuned on the whole data, since we should make use of all data we have.

In [300]:
data_test = pd.read_csv('test_features.csv').drop(['Unnamed: 0'],axis=1)
data_test.replace([np.inf, -np.inf], np.NaN, inplace=True)
# print(len(data_test.columns), len(data_train_backup.columns))
data_test = pd.DataFrame(imp.fit(data_train_backup).transform(data_test), columns=data_train_backup.columns)
data_test = rfe.transform(data_test)

y_prob = None
n_rounds = 1
for r in range(n_rounds):
    best_xgb = xgb.XGBClassifier(best_hyperparams)
    best_xgb.fit(df_X, y.values.ravel(), verbose=False, eval_metric='mlogloss')
    y_test_prob = best_xgb.predict_proba(data_test)
    if y_prob is None:
        y_prob = np.zeros_like(y_test_prob)
    y_prob = y_prob + y_test_prob
y_test = np.argmax(y_prob, axis=1)
data_out = {"id" : np.arange(len(y_test)), "y": y_test}
df_out = pd.DataFrame(data_out)
df_out.to_csv("submission.csv", index=False)

In [301]:
print(best_hyperparams)

{'colsample_bytree': 0.8623811480223281, 'gamma': 1.2017606607223974, 'max_depth': 4.0, 'min_child_weight': 0.0, 'reg_alpha': 40.0, 'reg_lambda': 0.19101490078046782}
