# Model Comparison

## Imports

In [2]:
import os
import warnings
import pandas as pd
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from utils.constant import ATTACKS, FEATURES, LABELS, DATASET_DIRECTORY
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Install a catch-all "ignore" filter: from here on every warning raised in
# this kernel session is silenced (including any emitted by the libraries used below).
warnings.filterwarnings(action='ignore')

## Dataset

In [3]:
# CSV file names found directly under DATASET_DIRECTORY, in sorted order
df_sets = sorted(
    file_name
    for file_name in os.listdir(DATASET_DIRECTORY)
    if file_name.endswith('.csv')
)

# Positional split: the first 15 files are used for training, the next 5 for testing
training_sets, test_sets = df_sets[:15], df_sets[15:20]

# Feature scaler shared by the preprocessing and evaluation cells
scaler = StandardScaler()

### Loading and Preprocessing

In [None]:
# Load every training file and combine them with a single concat.
# (Growing a frame with the private DataFrame._append inside a loop re-copies
# all previously loaded rows on every iteration.)
df = pd.concat(
    (
        pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
        for train_set in tqdm(training_sets)
    ),
    ignore_index=True,
)

# Clean data: drop rows with missing values, drop exact duplicates, renumber rows
df = df.dropna().drop_duplicates().reset_index(drop=True)

# Fit the scaler ONCE on the complete, cleaned training set and scale with it.
# StandardScaler.fit() discards any previous statistics, so fitting inside the
# per-file loop would leave a scaler that only reflects the last file loaded.
df[FEATURES] = scaler.fit_transform(df[FEATURES])

# Encode labels: replace each label value with its entry in ATTACKS
df[LABELS] = df[LABELS].apply(lambda x: ATTACKS[x])

## Models

In [4]:
# Candidate classifiers keyed by a short name; both use their library defaults
models = dict(
    log_reg=LogisticRegression(),
    xgb=XGBClassifier(),
)

### Training

In [5]:
# Fit every candidate model on the same preprocessed training features and labels
X_train, y_train = df[FEATURES], df[LABELS]
for estimator in tqdm(models.values()):
    estimator.fit(X_train, y_train)

100%|██████████| 2/2 [00:00<?, ?it/s]

log_reg
xgb





## Evaluation

In [None]:
# Load every test file and combine them with a single concat
# (avoids the private DataFrame._append API and repeated copying).
df_test = pd.concat(
    (
        pd.read_csv(os.path.join(DATASET_DIRECTORY, test_set))
        for test_set in test_sets
    ),
    ignore_index=True,
)

# NOTE(review): unlike the training data, test rows with missing values are not
# dropped here — confirm the test files are NaN-free or decide how to handle them.

# Scale with the already-fitted scaler (transform only, never re-fit on test data)
df_test[FEATURES] = scaler.transform(df_test[FEATURES])

# Encode labels: replace each label value with its entry in ATTACKS
df_test[LABELS] = df_test[LABELS].apply(lambda x: ATTACKS[x])

# Inputs and ground truth are identical for every model, so build them once
X_test = df_test[FEATURES]
y_test = df_test[LABELS]

# Predict and evaluate each model
for name, estimator in tqdm(models.items()):
    y_pred = estimator.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps the reported macro precision and macro recall.
    print(name)
    print('accuracy_score = ', accuracy_score(y_test, y_pred))
    print('recall_score = ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score = ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score = ', f1_score(y_test, y_pred, average='macro'))