# Model Comparison

## Imports

In [1]:
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Custom
from utils.constant import ATTACKS, FEATURES, LABELS, DATASET_DIRECTORY

# Other
import os
import warnings
import pandas as pd
from tqdm import tqdm

# Ignore warnings
warnings.filterwarnings('ignore')

## Dataset

In [2]:
# File Paths
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# Split
training_sets = df_sets[:1]
test_sets = df_sets[1:2]

# Preprocessing Functions
scaler = StandardScaler()

### Loading and Preprocessing

In [3]:
# Create dataframe
df = pd.DataFrame()
for train_set in tqdm(training_sets):

    # Load data into a single dataframe
    df_set = pd.read_csv(DATASET_DIRECTORY + train_set)
    df = df._append(df_set, ignore_index=True)

    # Fit scaler
    scaler.fit(df_set[FEATURES])

# Clean data
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)

# Scale
df[FEATURES] = scaler.transform(df[FEATURES])

# Encode labels
df[LABELS] = df[LABELS].apply(lambda x: ATTACKS[x])

100%|██████████| 1/1 [00:00<00:00,  1.31it/s]


## Models

In [4]:
models = {
    'log_reg': LogisticRegression(),
    'xgb': XGBClassifier(),
    'svm': SVC(),
}

### Training

In [5]:
for model in tqdm(models):
    models[model].fit(df[FEATURES], df[LABELS])

100%|██████████| 3/3 [12:50<00:00, 256.85s/it]


## Evaluation

In [6]:
df_test = pd.DataFrame()
for test_set in test_sets:
    
    # Load data into a single dataframe
    df_set = pd.read_csv(DATASET_DIRECTORY + test_set)
    df_test = df_test._append(df_set, ignore_index=True)

# Scale
df_test[FEATURES] = scaler.transform(df_test[FEATURES])

# Encode labels
df_test[LABELS] = df_test[LABELS].apply(lambda x: ATTACKS[x])

# Predict
for model in tqdm(models):
    y_pred = list(models[model].predict(df_test[FEATURES]))

    # Evaluate
    y_test = list(df_test[LABELS])
    print('Model: ', model)
    print('  accuracy_score = ', accuracy_score(y_pred, y_test))
    print('  recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('  precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('  f1_score = ', f1_score(y_pred, y_test, average='macro'))

  0%|          | 0/3 [00:00<?, ?it/s]

Model:  log_reg
  accuracy_score =  0.8043646168963232
  recall_score =  0.5946313957995658
  precision_score =  0.4908326519822142


 33%|███▎      | 1/3 [00:00<00:01,  1.16it/s]

  f1_score =  0.4961079750698912
Model:  xgb
  accuracy_score =  0.9920111514819131
  recall_score =  0.7691804011120529
  precision_score =  0.7107762528852515


 67%|██████▋   | 2/3 [00:02<00:01,  1.35s/it]

  f1_score =  0.7208170814552072
