# Modelovanie č. 0 - klasifikačné metódy - bez predspracovaných dát

In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

## Načítanie dát

In [18]:
# Load the raw training and test sets; both paths are relative to the
# notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Train_data.csv', 'data/Test_data.csv')
)

## Predspracovanie dát

In [19]:
# Standardise every numeric column of the training frame (zero mean, unit
# variance). The 'class' column still holds string labels at this point, so
# select_dtypes leaves it out.
numeric_cols = train_df.select_dtypes(include=[np.number])

# Keep the fitted scaler in a variable instead of discarding it, so the exact
# same transformation can later be applied to other data (e.g. test_df) via
# scaler.transform(...).
# NOTE(review): the scaler is fitted on the whole training frame before the
# train/validation split, so validation rows contribute to the scaling
# statistics. Consider fitting it on the training part only.
scaler = StandardScaler()
train_df[numeric_cols.columns] = scaler.fit_transform(numeric_cols)

## Kódovanie cieľovej premennej a one-hot encoding

In [20]:
# Encode the target as integers: 'normal' -> 0, 'anomaly' -> 1.
CLASS_CODES = {'normal': 0, 'anomaly': 1}

# Only map while the labels are still non-numeric. Without this guard,
# re-running the cell would map the already-encoded 0/1 values to NaN.
if not pd.api.types.is_numeric_dtype(train_df['class']):
    # Series.map silently turns labels missing from the mapping into NaN;
    # fail early with a clear message instead.
    unexpected = set(train_df['class'].unique()) - set(CLASS_CODES)
    if unexpected:
        raise ValueError(
            f"Unexpected values in 'class': {sorted(unexpected, key=str)}"
        )
    train_df['class'] = train_df['class'].map(CLASS_CODES)

In [21]:
# One-hot encode the remaining categorical columns. drop_first=True drops the
# first level of each encoded column to avoid a redundant indicator.
train_df = pd.get_dummies(data=train_df, drop_first=True)

# Preview the first five rows of the encoded frame.
train_df.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,-0.113551,-0.009889,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,...,False,False,False,False,False,False,False,False,True,False
1,-0.113551,-0.010032,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,...,False,False,False,False,False,False,False,False,True,False
2,-0.113551,-0.010093,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,...,False,False,False,False,True,False,False,False,False,False
3,-0.113551,-0.009996,0.052473,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,1.238197,-0.021873,...,False,False,False,False,False,False,False,False,True,False
4,-0.113551,-0.01001,-0.034582,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,1.238197,-0.021873,...,False,False,False,False,False,False,False,False,True,False


## Rozdelenie dát na trénovacie a validačné

In [22]:
# Hold out 35 % of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
features = train_df.drop(columns=['class'])
target = train_df['class']
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.35,
    random_state=123,
)

## Modelovanie

In [23]:
# Seed passed to every estimator that uses randomness, so the reported scores
# are the same on every Restart & Run All. The value matches the random_state
# used for the train/validation split.
MODEL_SEED = 123


def fit_and_report(title, model):
    """Fit ``model`` on the training split and print its validation metrics.

    Parameters
    ----------
    title : str
        Model name printed in the ``-- title --`` banner.
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict``. It is fitted
        in place on ``X_train`` / ``y_train``.

    Returns
    -------
    The fitted model's predictions for ``X_val``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    print(f"-- {title} --")
    print(classification_report(y_val, predictions))
    print(confusion_matrix(y_val, predictions))
    return predictions


# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=MODEL_SEED)
gbc_pred = fit_and_report("Gradient Boosting Classifier", gbc)

# K-Nearest Neighbors (KNN) - deterministic, no seed needed
knn = KNeighborsClassifier()
knn_pred = fit_and_report("K-Nearest Neighbors", knn)

# Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=MODEL_SEED)
dt_pred = fit_and_report("Decision Tree Classifier", dt)

# Naive Bayes (GaussianNB) - deterministic, no seed needed
gnb = GaussianNB()
gnb_pred = fit_and_report("Naive Bayes", gnb)

# Support Vector Machine (SVM) - random_state only matters when
# probability=True, so it is left at its default here
svm = SVC()
svm_pred = fit_and_report("Support Vector Machine", svm)

# AdaBoost Classifier
ada = AdaBoostClassifier(algorithm='SAMME', random_state=MODEL_SEED)
ada_pred = fit_and_report("AdaBoost Classifier", ada)

# Bagging Classifier
bagging = BaggingClassifier(random_state=MODEL_SEED)
bagging_pred = fit_and_report("Bagging Classifier", bagging)

# Extra Trees Classifier
extra_trees = ExtraTreesClassifier(random_state=MODEL_SEED)
extra_trees_pred = fit_and_report("Extra Trees Classifier", extra_trees)

# XGBoost Classifier
xgb = XGBClassifier(random_state=MODEL_SEED)
xgb_pred = fit_and_report("XGBoost Classifier", xgb)

# Random Forest Classifier
rf = RandomForestClassifier(random_state=MODEL_SEED)
rf_pred = fit_and_report("Random Forest Classifier", rf)

# Logistic Regression - the default solver does not use random_state;
# max_iter is raised so the optimiser has room to converge
lr = LogisticRegression(max_iter=4000)
lr_pred = fit_and_report("Logistic Regression", lr)

-- Gradient Boosting Classifier --
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4706
           1       1.00      0.99      0.99      4112

    accuracy                           1.00      8818
   macro avg       1.00      0.99      0.99      8818
weighted avg       1.00      1.00      1.00      8818

[[4687   19]
 [  25 4087]]
-- K-Nearest Neighbors --
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4706
           1       0.99      0.99      0.99      4112

    accuracy                           0.99      8818
   macro avg       0.99      0.99      0.99      8818
weighted avg       0.99      0.99      0.99      8818

[[4679   27]
 [  43 4069]]
-- Decision Tree Classifier --
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4706
           1       1.00      1.00      1.00      4112

    accuracy                        