In [93]:
#load libraries
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv, os
from glob import glob
#Import scikit-learn dataset library
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import dump_svmlight_file


# Load the training data.
# The location defaults to the original local file; set the TRAIN_DATA_PATH
# environment variable to point the notebook at a copy stored elsewhere
# without having to edit this cell.
PATH = os.environ.get(
    "TRAIN_DATA_PATH",
    "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/12.feature_label_4h_control/combine_fe/train_balance_data.csv",
)
# Row 0 is the header; column 0 is parsed as dates and becomes the index.
df = pd.read_csv(PATH, sep=",", header=0,
                       parse_dates=[0], index_col=0)


In [94]:
# Separate predictors from the target, then split into train and test sets.
# Every column except the last two is a feature; the second-to-last column is
# the label (the final column is used for neither).
feature_columns = df.columns[:-2]
label_column = df.columns[-2]
X = df[feature_columns]  # Features
y = df[label_column]  # Labels

# Hold out 30% of the rows for testing (70% training); the fixed seed keeps
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Random forest

In [95]:
# Create a random-forest classifier with 100 trees.
# random_state is pinned (same seed as the train/test split) so that the
# fitted forest, and therefore the metrics printed below, are reproducible
# from run to run; without it each fit draws different bootstrap samples.
clf=RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using the training set
clf.fit(X_train,y_train)

# Apply the model to the held-out test set
y_pred=clf.predict(X_test)

# Model accuracy, precision and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, y_pred))
print("Accuracy",metrics.accuracy_score(y_test, y_pred))
print("Classification_report:\n",metrics.classification_report(y_test, y_pred))

Confusion matrix:
 [[2863  322   34]
 [ 208 2226   13]
 [  26   59  683]]
Accuracy 0.89710910786447
Classification_report:
               precision    recall  f1-score   support

         0.0       0.92      0.89      0.91      3219
         1.0       0.85      0.91      0.88      2447
         2.0       0.94      0.89      0.91       768

    accuracy                           0.90      6434
   macro avg       0.90      0.90      0.90      6434
weighted avg       0.90      0.90      0.90      6434



# XGBoost

In [96]:
# Use DMatrix for xgboost, built directly from the in-memory train/test splits
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


# Use svmlight files for xgboost: write both splits to the working directory
# and load them back as DMatrix objects.
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
# The "?format=libsvm" suffix states the text format explicitly; recent
# xgboost releases no longer infer it from a bare file path.
dtrain_svm = xgb.DMatrix('dtrain.svm?format=libsvm')
dtest_svm = xgb.DMatrix('dtest.svm?format=libsvm')

# Set xgboost params
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'verbosity': 0,  # logging mode - quiet (replaces the deprecated 'silent': 1)
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 10  # the number of training iterations

  if getattr(data, 'base', None) is not None and \


[20:08:36] 15010x16 matrix with 240160 entries loaded from dtrain.svm
[20:08:36] 6434x16 matrix with 102944 entries loaded from dtest.svm


In [97]:
# Train the booster on the in-memory DMatrix and score the held-out split
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
# Each row of preds holds one score per class; the predicted label is the
# column index of the largest score. A single vectorised argmax replaces the
# per-row Python loop and yields the same integer labels.
best_preds = np.argmax(preds, axis=1)

# Model evaluation: accuracy, precision and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, best_preds))
print("Accuracy",metrics.accuracy_score(y_test, best_preds))
print("Classification_report:\n",metrics.classification_report(y_test, best_preds))

Confusion matrix:
 [[2633  532   54]
 [ 954 1396   97]
 [  89  433  246]]
Accuracy 0.6644389182468138
Classification_report:
               precision    recall  f1-score   support

         0.0       0.72      0.82      0.76      3219
         1.0       0.59      0.57      0.58      2447
         2.0       0.62      0.32      0.42       768

    accuracy                           0.66      6434
   macro avg       0.64      0.57      0.59      6434
weighted avg       0.66      0.66      0.65      6434



## Support Vector Machine
* Running this cell (fitting and applying the SVM) takes approximately 2 hours.

In [None]:
# Fit a support vector classifier with an RBF kernel on the training split
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)

# Predict labels for the test split
y_pred = svclassifier.predict(X_test)

# Compute the evaluation metrics first, then print them in a fixed order:
# confusion matrix, accuracy, per-class precision/recall report.
svm_results = (
    ("Confusion matrix:\n", metrics.confusion_matrix(y_test, y_pred)),
    ("Accuracy", metrics.accuracy_score(y_test, y_pred)),
    ("Classification_report:\n", metrics.classification_report(y_test, y_pred)),
)
for svm_heading, svm_value in svm_results:
    print(svm_heading, svm_value)

# External evaluation

In [98]:
# Load the external evaluation data.
# The location defaults to the original local file; set the TEST_DATA_PATH
# environment variable to point the notebook at a copy stored elsewhere
# without having to edit this cell.
PATH = os.environ.get(
    "TEST_DATA_PATH",
    "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/12.feature_label_4h_control/combine_fe/test_data.csv",
)
# Row 0 is the header; column 0 is parsed as dates and becomes the index.
dftest = pd.read_csv(PATH, sep=",", header=0,
                       parse_dates=[0], index_col=0)

# NOTE: from here on X_test / y_test refer to the external data set and no
# longer to the 30% hold-out produced by train_test_split.
X_test=dftest[dftest.columns[:-2]]  # Features: all but the last two columns
y_test=dftest[dftest.columns[-2]]  # Labels: second-to-last column

In [99]:
# Random forest: score the already-fitted classifier on the current
# X_test / y_test.
y_pred = clf.predict(X_test)

# Compute accuracy, precision and recall up front, then report them.
rf_confusion = metrics.confusion_matrix(y_test, y_pred)
rf_accuracy = metrics.accuracy_score(y_test, y_pred)
rf_report = metrics.classification_report(y_test, y_pred)

print("Confusion matrix:\n", rf_confusion)
print("Accuracy", rf_accuracy)
print("Classification_report:\n", rf_report)

Confusion matrix:
 [[26284  2745   204]
 [ 1483   824   147]
 [  312   512   126]]
Accuracy 0.8344516959279346
Classification_report:
               precision    recall  f1-score   support

         0.0       0.94      0.90      0.92     29233
         1.0       0.20      0.34      0.25      2454
         2.0       0.26      0.13      0.18       950

    accuracy                           0.83     32637
   macro avg       0.47      0.46      0.45     32637
weighted avg       0.86      0.83      0.85     32637



In [100]:
# Use DMatrix for xgboost, rebuilt from the current X_test / y_test
dtest = xgb.DMatrix(X_test, label=y_test)


# Use svmlight file for xgboost: overwrite dtest.svm in the working directory
# and load it back as a DMatrix.
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
# The "?format=libsvm" suffix states the text format explicitly; recent
# xgboost releases no longer infer it from a bare file path.
dtest_svm = xgb.DMatrix('dtest.svm?format=libsvm')

# Set xgboost params.
# NOTE(review): the prediction cell that follows only calls bst.predict, so
# these values do not appear to affect its results -- confirm before relying
# on them.
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'verbosity': 0,  # logging mode - quiet (replaces the deprecated 'silent': 1)
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 10  # the number of training iterations

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[20:12:01] 32637x16 matrix with 522192 entries loaded from dtest.svm


In [101]:
# Testing only: apply the previously trained booster (bst) to the current
# dtest; no training happens in this cell.
preds = bst.predict(dtest)
# Each row of preds holds one score per class; the predicted label is the
# column index of the largest score. A single vectorised argmax replaces the
# per-row Python loop and yields the same integer labels.
best_preds = np.argmax(preds, axis=1)

# Model evaluation: accuracy, precision and recall
print("Confusion matrix:\n",metrics.confusion_matrix(y_test, best_preds))
print("Accuracy",metrics.accuracy_score(y_test, best_preds))
print("Classification_report:\n",metrics.classification_report(y_test, best_preds))

Confusion matrix:
 [[25477  3584   172]
 [ 1314  1017   123]
 [  130   743    77]]
Accuracy 0.8141373287986028
Classification_report:
               precision    recall  f1-score   support

         0.0       0.95      0.87      0.91     29233
         1.0       0.19      0.41      0.26      2454
         2.0       0.21      0.08      0.12       950

    accuracy                           0.81     32637
   macro avg       0.45      0.46      0.43     32637
weighted avg       0.87      0.81      0.84     32637

