In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn import metrics
import pickle

In [None]:
def dataset(filename, window_size, label):
    """Build sliding-window features and constant labels from one CSV file.

    The file is read with ``pd.read_csv`` and thinned by keeping every 10th
    row. Column 0 of the thinned data is treated as the ``y`` signal and
    column 1 as the ``z`` signal. One feature row is produced for every
    window start ``i`` in ``range(n_rows - window_size)``, using rows
    ``i .. i + window_size - 1``.

    Parameters
    ----------
    filename : str or path-like
        CSV file passed straight to ``pd.read_csv``.
    window_size : int
        Number of (thinned) rows per window.
    label : scalar
        Value written to the ``'label'`` column for every window of this file.

    Returns
    -------
    features : pandas.DataFrame
        One float64 row per window with the twelve feature columns, in the
        same order as they are listed in the record below. Empty (no columns)
        when the file yields no window.
    labels : pandas.DataFrame
        Single column ``'label'`` with one entry per window. Integer labels
        are stored as float64.

    Notes
    -----
    * ``range(n_rows - window_size)`` stops one start short of the final full
      window, so the very last row of the file is never part of any window.
      NOTE(review): this looks like an off-by-one, but it is kept as-is
      because changing it would change the number of samples.
    * The ``log_MeanAbsoluteDiff_*`` columns hold the log of the *median*
      absolute difference. The names are kept because they are the feature
      schema any fitted model sees.
    * A window whose median absolute difference is zero yields ``-inf`` in
      the log columns, and large inputs to ``np.exp`` yield ``inf``.
    """
    raw_data = np.array(pd.read_csv(filename)[::10])
    n_windows = max(raw_data.shape[0] - window_size, 0)

    # Collect one record per window and build the frame once at the end;
    # assigning cell-by-cell with .loc enlarges the frame on every write.
    records = []
    for start in range(n_windows):
        seq = raw_data[start:start + window_size]
        y = seq[:, 0]
        z = seq[:, 1]

        # First differences and their median magnitude feed several features,
        # so compute each of them once per window.
        diff_y = np.diff(y)
        diff_z = np.diff(z)
        median_abs_diff_y = np.median(np.absolute(diff_y))
        median_abs_diff_z = np.median(np.absolute(diff_z))

        records.append({
            'MedianAbsoluteDiff_z': median_abs_diff_z,
            'log_MeanAbsoluteDiff_z': np.log(median_abs_diff_z),
            'log_MeanAbsoluteDiff_y': np.log(median_abs_diff_y),
            'MedianAbsoluteDiff_y': median_abs_diff_y,
            'max_y': np.max(y),
            'e_max_z': np.exp(np.max(z)),
            'e_min_y': np.exp(np.min(y)),
            'min_z': np.min(z),
            'MedianDiff_y': np.median(diff_y),
            'MedianDiff_z': np.median(diff_z),
            'e_kurt_y': np.exp(stats.kurtosis(y)),
            'median_z': np.median(z),
        })

    # dtype=float keeps every column float64 even when the CSV holds integers
    # (max_y / min_z would otherwise come out as int64).
    features = pd.DataFrame(records, dtype=float)

    # Integer labels are promoted to float64, the dtype that row-wise .loc
    # enlargement produces, so downstream reports and models are unchanged.
    label_values = np.full(n_windows, label)
    if label_values.dtype.kind in 'iu':
        label_values = label_values.astype(float)
    labels = pd.DataFrame({'label': label_values})

    return features, labels

# Window length (in rows after dataset()'s own thinning) shared by every file.
WINDOW_SIZE = 80

# Files from the CriticalSpeed140 folder.
# NOTE(review): presumably label 1 marks runs at or above the critical speed
# named by the folder and label 0 runs below it -- confirm with the data owner.
X140_80, Y140_80 = dataset("../dataset/CriticalSpeed140/v80.csv", WINDOW_SIZE, 0)
X140_130, Y140_130 = dataset("../dataset/CriticalSpeed140/v130.csv", WINDOW_SIZE, 0)
X140_135, Y140_135 = dataset("../dataset/CriticalSpeed140/v135.csv", WINDOW_SIZE, 0)
X140_139, Y140_139 = dataset("../dataset/CriticalSpeed140/v139.csv", WINDOW_SIZE, 0)
X140_140, Y140_140 = dataset("../dataset/CriticalSpeed140/v140.csv", WINDOW_SIZE, 1)
X140_145, Y140_145 = dataset("../dataset/CriticalSpeed140/v145.csv", WINDOW_SIZE, 1)
X140_150, Y140_150 = dataset("../dataset/CriticalSpeed140/v150.csv", WINDOW_SIZE, 1)
X140_200, Y140_200 = dataset("../dataset/CriticalSpeed140/v200.csv", WINDOW_SIZE, 1)

# Files from the CriticalSpeed68 folder.
X68_55, Y68_55 = dataset("../dataset/CriticalSpeed68/v55.csv", WINDOW_SIZE, 0)
X68_60, Y68_60 = dataset("../dataset/CriticalSpeed68/v60.csv", WINDOW_SIZE, 0)
X68_67, Y68_67 = dataset("../dataset/CriticalSpeed68/v67.csv", WINDOW_SIZE, 0)
X68_70, Y68_70 = dataset("../dataset/CriticalSpeed68/v70.csv", WINDOW_SIZE, 1)
X68_75, Y68_75 = dataset("../dataset/CriticalSpeed68/v75.csv", WINDOW_SIZE, 1)
X68_80, Y68_80 = dataset("../dataset/CriticalSpeed68/v80.csv", WINDOW_SIZE, 1)

# Keep the two lists in the same order so row k of the merged features
# lines up with row k of the merged labels.
feature_frames = [
    X140_80, X140_130, X140_135, X140_139,
    X140_140, X140_145, X140_150, X140_200,
    X68_55, X68_60, X68_67,
    X68_70, X68_75, X68_80,
]
label_frames = [
    Y140_80, Y140_130, Y140_135, Y140_139,
    Y140_140, Y140_145, Y140_150, Y140_200,
    Y68_55, Y68_60, Y68_67,
    Y68_70, Y68_75, Y68_80,
]

features_all = pd.concat(feature_frames, ignore_index=True)
labels_all = pd.concat(label_frames, ignore_index=True)

# Drop object-typed columns, replace NaN with 0, then bound every value to
# [-1e8, 1e8]; the clip is also what turns +/-inf into finite numbers.
numerical_cols = features_all.select_dtypes(exclude='object').columns
x = features_all[numerical_cols].fillna(0).clip(lower=-1e8, upper=1e8)
y = labels_all['label'].fillna(0)

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
# NOTE(review): dataset() emits one window per consecutive start index, so
# neighbouring windows share most of their rows. A row-level random split
# therefore puts near-duplicate windows on both sides -- confirm whether a
# per-file split was intended.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit a gradient-boosted tree classifier on the training split
# (random_state=1 pins the model's seed).
xgb = XGBClassifier(random_state=1, use_label_encoder=False)
xgb.fit(x_train, y_train)

# Accuracy on the data the model was fitted on versus the held-out rows.
print("XGBoost-accuracy-trainSet: ", xgb.score(x_train, y_train))
print("XGBoost-accuracy-testSet: ", xgb.score(x_test, y_test))

# Hard class predictions feed the classification report below.
y_hat_xgb = xgb.predict(x_test)

# ROC AUC needs a continuous score; use the predicted probability of the
# second class (classes_[1], i.e. label 1 for the 0/1 labels used here).
# Scoring the hard 0/1 predictions would collapse the curve to one threshold.
y_score_xgb = xgb.predict_proba(x_test)[:, 1]
xgb_roc_auc = roc_auc_score(y_test, y_score_xgb)

# str.format is required for the "{:.3f}" placeholder; applying "%" to this
# string raises "TypeError: not all arguments converted during string
# formatting" and aborts the cell before the report is printed.
print("XGBoost AUC =  {:.3f}".format(xgb_roc_auc))
print(metrics.classification_report(y_test, y_hat_xgb))

In [None]:
# Write the fitted classifier to disk as a pickle.
# Pickle files can execute code when loaded, so only load this file from a
# trusted location.
with open('../weights/xgb.pickle', 'wb') as model_file:
    pickle.dump(xgb, model_file)