# Modelovanie č. 0 - klasifikačné metódy - bez predspracovaných dát

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

## Načítanie dát

In [2]:
# Read the training and test CSV files (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Train_data.csv', 'data/Test_data.csv')
)

In [3]:
# Encode the target column as integers: 'normal' -> 0, 'anomaly' -> 1.
class_codes = {'normal': 0, 'anomaly': 1}

# Guard against re-running the cell: a bare .map() applied to labels that are
# already 0/1 would replace every value with NaN.
if not train_df['class'].isin(list(class_codes.values())).all():
    # Fail loudly on labels outside the mapping instead of silently turning
    # them into NaN (missing values are left as they are, as before).
    unexpected = set(train_df['class'].dropna().unique()) - set(class_codes)
    if unexpected:
        raise ValueError(f"Unexpected values in 'class' column: {sorted(unexpected, key=str)}")
    train_df['class'] = train_df['class'].map(class_codes)

## One Hot Encoding

In [4]:
# One-hot encode the training frame; drop_first=True drops the first level of
# every encoded column.
train_df = pd.get_dummies(data=train_df, drop_first=True)

## Rozdelenie dát na trénovacie a validačné

In [5]:
# Separate the predictors from the target column before splitting.
features = train_df.drop(columns=['class'])
target = train_df['class']

# Hold out 30 % of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=123,
)

## Štandardizácia trénovacích a validačných dát pomocou StandardScaler

In [6]:
# Columns with a numeric dtype; only these are passed through the scaler.
quantitative = X_train.select_dtypes(include=[np.number]).columns

# Fit on the training split only, so the scaling parameters are not
# influenced by the validation rows.
scaler = StandardScaler().fit(X_train[quantitative])

# train_test_split returns frames derived from train_df; assigning columns
# into them directly can raise pandas' SettingWithCopyWarning. Taking explicit
# copies makes the ownership of the data unambiguous.
X_train = X_train.copy()
X_val = X_val.copy()

# Apply the same fitted transformation to both splits.
X_train[quantitative] = scaler.transform(X_train[quantitative])
X_val[quantitative] = scaler.transform(X_val[quantitative])

In [7]:
# Candidate classifiers as (display name, estimator) pairs.
# Estimators that use randomness get a fixed random_state (the same seed as
# the train/validation split) so that repeated runs give the same scores.
models = (
    ('Logistic Regression', LogisticRegression(max_iter=4000)),
    ('Random Forest', RandomForestClassifier(random_state=123)),
    ('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=123)),
    ('AdaBoost Classifier', AdaBoostClassifier(algorithm='SAMME', random_state=123)),
    ('Bagging Classifier', BaggingClassifier(random_state=123)),
    ('Extra Trees Classifier', ExtraTreesClassifier(random_state=123)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=123)),
    ('Naive Bayes', GaussianNB()),
    ('Support Vector Machine', SVC()),
    ('XGBoost Classifier', XGBClassifier(random_state=123))
)

## Modelovanie

In [8]:
def fit_and_report(title, model):
    """Fit ``model`` on the training split and print its validation report.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    title : str
        Name printed in the report header.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple
        ``(model, predictions)``: the fitted estimator and its predictions
        for ``X_val``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    print(f"-- {title} --")
    print(classification_report(y_val, predictions))
    print(15*'-')
    print(confusion_matrix(y_val, predictions))
    print(54*'-')
    return model, predictions


# Estimators that use randomness get a fixed random_state (the same seed as
# the train/validation split) so the printed reports are reproducible.
# The fitted models and their predictions keep their original variable names.

# Gradient Boosting Classifier
gbc, gbc_pred = fit_and_report(
    "Gradient Boosting Classifier", GradientBoostingClassifier(random_state=123))

# K-Nearest Neighbors (KNN)
knn, knn_pred = fit_and_report("K-Nearest Neighbors", KNeighborsClassifier())

# Decision Tree Classifier
dt, dt_pred = fit_and_report(
    "Decision Tree Classifier", DecisionTreeClassifier(random_state=123))

# Naive Bayes (GaussianNB)
gnb, gnb_pred = fit_and_report("Naive Bayes", GaussianNB())

# Support Vector Machine (SVM)
svm, svm_pred = fit_and_report("Support Vector Machine", SVC())

# AdaBoost Classifier
ada, ada_pred = fit_and_report(
    "AdaBoost Classifier", AdaBoostClassifier(algorithm='SAMME', random_state=123))

# Bagging Classifier
bagging, bagging_pred = fit_and_report(
    "Bagging Classifier", BaggingClassifier(random_state=123))

# Extra Trees Classifier
extra_trees, extra_trees_pred = fit_and_report(
    "Extra Trees Classifier", ExtraTreesClassifier(random_state=123))

# XGBoost Classifier
xgb, xgb_pred = fit_and_report("XGBoost Classifier", XGBClassifier(random_state=123))

# Random Forest Classifier
rf, rf_pred = fit_and_report(
    "Random Forest Classifier", RandomForestClassifier(random_state=123))

# Logistic Regression
lr, lr_pred = fit_and_report("Logistic Regression", LogisticRegression(max_iter=4000))

-- Gradient Boosting Classifier --
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4022
           1       1.00      0.99      1.00      3536

    accuracy                           1.00      7558
   macro avg       1.00      1.00      1.00      7558
weighted avg       1.00      1.00      1.00      7558

---------------
[[4009   13]
 [  21 3515]]
------------------------------------------------------
-- K-Nearest Neighbors --
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4022
           1       0.99      0.99      0.99      3536

    accuracy                           0.99      7558
   macro avg       0.99      0.99      0.99      7558
weighted avg       0.99      0.99      0.99      7558

---------------
[[4000   22]
 [  35 3501]]
------------------------------------------------------
-- Decision Tree Classifier --
              precision    recall  f1-score   support

   

In [9]:
# 10-fold cross-validated accuracy on the training split for every candidate
# model. cross_val_score is imported in the notebook's import cell.
for name, model in models:
    scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    # Report the mean accuracy and twice the standard deviation across folds.
    print(f"{name} Cross Validation Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

Logistic Regression Cross Validation Accuracy: 0.9723 (+/- 0.0108)
Random Forest Cross Validation Accuracy: 0.9970 (+/- 0.0033)
Gradient Boosting Classifier Cross Validation Accuracy: 0.9951 (+/- 0.0034)
AdaBoost Classifier Cross Validation Accuracy: 0.9761 (+/- 0.0066)
Bagging Classifier Cross Validation Accuracy: 0.9967 (+/- 0.0038)
Extra Trees Classifier Cross Validation Accuracy: 0.9970 (+/- 0.0031)
K-Nearest Neighbors Cross Validation Accuracy: 0.9917 (+/- 0.0051)
Decision Tree Classifier Cross Validation Accuracy: 0.9957 (+/- 0.0030)
Naive Bayes Cross Validation Accuracy: 0.8603 (+/- 0.0086)
Support Vector Machine Cross Validation Accuracy: 0.9904 (+/- 0.0048)
XGBoost Classifier Cross Validation Accuracy: 0.9973 (+/- 0.0030)
