In [153]:
#load libraries
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv, os
from glob import glob
#Import scikit-learn dataset library
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import dump_svmlight_file
from random import shuffle


#load the data
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/14.feature_label_intact_5min/label_fe/combine_fe/training_data_undersample_binary.csv"
df = pd.read_csv(PATH, sep=",", header=0,
                       parse_dates=[0], index_col=0)
#add coefficient of variance
df["cv"] = df["std"]/df["mean"]
df["cv.1"] = df["std.1"]/df["mean.1"]
cols = df.columns.tolist()
cols = cols[-2:] + cols[:-2]
df = df[cols]

In [154]:
# Split the training and testing dataseet
#df = shuffle(df,random_state=42)
X=df[df.columns[:-2]]  # Features
y=df[df.columns[-2]]  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 70% training and 30% test
#df.iloc[X_test.index]["label"].value_counts()
#df.iloc[X_train.index]["label"].value_counts()

# Random forest

In [155]:
#Create a RF Classifier
clf=RandomForestClassifier(n_estimators=20)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

#apply the model
y_pred=clf.predict(X_test)

# Model Accuracy, Precisiono and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, y_pred))
print("Accuracy",metrics.accuracy_score(y_test, y_pred))
print("Classification_report:\n",metrics.classification_report(y_test, y_pred))


#Train the model using the entire training sets 
#clf.fit(X,y)


Confusion matrix:
 [[5731  460]
 [ 933 2182]]
Accuracy 0.8503116269073716
Classification_report:
               precision    recall  f1-score   support

         0.0       0.86      0.93      0.89      6191
         1.0       0.83      0.70      0.76      3115

    accuracy                           0.85      9306
   macro avg       0.84      0.81      0.82      9306
weighted avg       0.85      0.85      0.85      9306



# XGBoost

In [156]:
# use DMatrix for xgbosot
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


# use svmlight file for xgboost
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

# set xgboost params
param = {
    'max_depth': 6,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 10  # the number of training iterations

  if getattr(data, 'base', None) is not None and \


[16:25:50] 21711x16 matrix with 347376 entries loaded from dtrain.svm
[16:25:50] 9306x16 matrix with 148896 entries loaded from dtest.svm


In [157]:
# training and testing - numpy matrices
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])

#model evaluation: Model Accuracy, Precisiono and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, best_preds))
print("Accuracy",metrics.accuracy_score(y_test, best_preds))
print("Classification_report:\n",metrics.classification_report(y_test, best_preds))

Confusion matrix:
 [[5587  604]
 [1312 1803]]
Accuracy 0.7941113260262196
Classification_report:
               precision    recall  f1-score   support

         0.0       0.81      0.90      0.85      6191
         1.0       0.75      0.58      0.65      3115

    accuracy                           0.79      9306
   macro avg       0.78      0.74      0.75      9306
weighted avg       0.79      0.79      0.79      9306



## Support Vector Machine
* The running time is approximately 2h

In [None]:
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)

#apply the model
y_pred = svclassifier.predict(X_test)

# Model Accuracy, Precisiono and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, y_pred))
print("Accuracy",metrics.accuracy_score(y_test, y_pred))
print("Classification_report:\n",metrics.classification_report(y_test, y_pred))

# Logistic Regression

In [267]:
#train the model
model = LogisticRegression(solver='saga', multi_class="multinomial",random_state=42).fit(X_train, y_train)

#apply the model
y_pred=model.predict(X_test)

# Model Accuracy, Precisiono and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, y_pred))
print("Accuracy",metrics.accuracy_score(y_test, y_pred))
print("Classification_report:\n",metrics.classification_report(y_test, y_pred))

Confusion matrix:
 [[2314  862   43]
 [ 955 1453   39]
 [ 243  416  109]]
Accuracy 0.6024246192104445
Classification_report:
               precision    recall  f1-score   support

         0.0       0.66      0.72      0.69      3219
         1.0       0.53      0.59      0.56      2447
         2.0       0.57      0.14      0.23       768

    accuracy                           0.60      6434
   macro avg       0.59      0.48      0.49      6434
weighted avg       0.60      0.60      0.58      6434





# External evaluation

In [159]:
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/14.feature_label_intact_5min/label_fe/combine_fe/testing_data_binary.csv"
dftest = pd.read_csv(PATH, sep=",", header=0,
                       parse_dates=[0], index_col=0)

#add coefficient of variance
df["cv"] = df["std"]/df["mean"]
df["cv.1"] = df["std.1"]/df["mean.1"]
cols = df.columns.tolist()
cols = cols[-2:] + cols[:-2]
df = df[cols]


X_test=dftest[dftest.columns[:-2]]  # Features
y_test=dftest[dftest.columns[-2]]  # Labels

In [160]:
#RF
#apply the model
y_pred=clf.predict(X_test)

# Model Accuracy, Precisiono and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, y_pred))
print("Accuracy",metrics.accuracy_score(y_test, y_pred))
print("Classification_report:\n",metrics.classification_report(y_test, y_pred))

Confusion matrix:
 [[22069  5081]
 [ 1847  2443]]
Accuracy 0.7796437659033079
Classification_report:
               precision    recall  f1-score   support

         0.0       0.92      0.81      0.86     27150
         1.0       0.32      0.57      0.41      4290

    accuracy                           0.78     31440
   macro avg       0.62      0.69      0.64     31440
weighted avg       0.84      0.78      0.80     31440



In [104]:
# use DMatrix for xgbosot
dtest = xgb.DMatrix(X_test, label=y_test)


# use svmlight file for xgboost
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtest_svm = xgb.DMatrix('dtest.svm')

# set xgboost params
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 10  # the number of training iterations

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[01:45:11] 30795x16 matrix with 492720 entries loaded from dtest.svm


In [105]:
# training and testing - numpy matrices
preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])

#model evaluation: Model Accuracy, Precisiono and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, best_preds))
print("Accuracy",metrics.accuracy_score(y_test, best_preds))
print("Classification_report:\n",metrics.classification_report(y_test, best_preds))

Confusion matrix:
 [[20444  5372   794]
 [ 1151  1283   119]
 [  394  1144    94]]
Accuracy 0.708589056665043
Classification_report:
               precision    recall  f1-score   support

         0.0       0.93      0.77      0.84     26610
         1.0       0.16      0.50      0.25      2553
         2.0       0.09      0.06      0.07      1632

    accuracy                           0.71     30795
   macro avg       0.40      0.44      0.39     30795
weighted avg       0.82      0.71      0.75     30795

