File processing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

column_headers = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes', 'dbytes',
    'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts',
    'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len',
    'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm',
    'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label'
]

def merge_csv_files(file_list, headers):
    data_frames = []
    for path in file_list:
        print(f"Loading {path}...")
        data = pd.read_csv(path, names=headers, low_memory=False)
        data_frames.append(data)
    merged_data = pd.concat(data_frames, ignore_index=True)
    print(f"Combined dataset shape: {merged_data.shape}")
    return merged_data

def create_balanced_sample(data, sample_size=100000):
    if 'Label' not in data.columns or 'attack_cat' not in data.columns:
        raise ValueError("Required columns missing")

    print(f"Attack category distribution:\n{data['attack_cat'].value_counts()}")
    
    categories = data['attack_cat'].unique()
    samples_per_category = sample_size // len(categories)
    
    balanced_samples = []
    for category in categories:
        subset = data[data['attack_cat'] == category]
        n = min(samples_per_category, len(subset))
        sampled_subset = subset.sample(n=n, random_state=42)
        balanced_samples.append(sampled_subset)
    
    balanced_data = pd.concat(balanced_samples, ignore_index=True)
    
    if len(balanced_data) < sample_size:
        additional_needed = sample_size - len(balanced_data)
        extra_samples = data.sample(n=additional_needed, random_state=42)
        balanced_data = pd.concat([balanced_data, extra_samples], ignore_index=True)
    
    print(f"Balanced dataset shape: {balanced_data.shape}")
    print(f"Sampled attack category distribution:\n{balanced_data['attack_cat'].value_counts()}")
    return balanced_data

def prepare_data_for_modeling(data):
    columns_to_remove = ['srcip', 'dstip', 'sport', 'dsport', 'stcpb', 'dtcpb', 'Stime', 'Ltime']
    data = data.drop(columns=columns_to_remove, errors='ignore')
    
    numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
    for feature in numeric_features:
        data[feature] = data[feature].fillna(data[feature].median())
    
    categorical_features = data.select_dtypes(include=['object']).columns
    for feature in categorical_features:
        data[feature] = data[feature].fillna(data[feature].mode()[0])
        data[feature] = data[feature].astype(str)
    
    encoders = {}
    for feature in categorical_features:
        encoder = LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
        encoders[feature] = encoder
    
    scaler = StandardScaler()
    numeric_features = numeric_features.drop('Label', errors='ignore')
    data[numeric_features] = scaler.fit_transform(data[numeric_features])
    
    if data.isnull().sum().sum() > 0:
        print("Warning: NaN values detected after preprocessing")
    if np.isinf(data[numeric_features]).sum().sum() > 0:
        print("Warning: Infinite values detected after preprocessing")
    
    return data, encoders, scaler

if __name__ == "__main__":
    input_files = [
        'UNSW-NB15_1.csv',
        'UNSW-NB15_2.csv',
        'UNSW-NB15_3.csv',
        'UNSW-NB15_4.csv'
    ]
    
    full_dataset = merge_csv_files(input_files, column_headers)
    balanced_dataset = create_balanced_sample(full_dataset)
    processed_dataset, encoders, scaler = prepare_data_for_modeling(balanced_dataset)
    processed_dataset.to_csv('unsw_nb15_preprocessed.csv', index=False)
    print("Preprocessed dataset saved as 'unsw_nb15_preprocessed.csv'")
    print(f"Final dataset columns: {processed_dataset.columns.tolist()}")


Loading UNSW-NB15_1.csv...
Loading UNSW-NB15_2.csv...
Loading UNSW-NB15_3.csv...
Loading UNSW-NB15_4.csv...
Combined dataset shape: (2540047, 49)
Attack category distribution:
attack_cat
Generic             215481
Exploits             44525
 Fuzzers             19195
DoS                  16353
 Reconnaissance      12228
 Fuzzers              5051
Analysis              2677
Backdoor              1795
Reconnaissance        1759
 Shellcode            1288
Backdoors              534
Shellcode              223
Worms                  174
Name: count, dtype: int64
Balanced dataset shape: (100000, 49)
Sampled attack category distribution:
attack_cat
Generic             11411
Exploits             8024
 Fuzzers             7517
DoS                  7464
 Reconnaissance      7400
 Fuzzers             5160
Analysis             2726
Backdoor             1833
Reconnaissance       1790
 Shellcode           1310
Backdoors             549
Shellcode             228
Worms                 177
Name: count,

Cleaning and Feature Engineering

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

def read_data(file_path):
    data = pd.read_csv(file_path)
    print(f"Dataset shape: {data.shape}")
    print(f"Columns in dataset: {data.columns.tolist()}")
    return data

def validate_data(data):
    print("Checking for missing values:")
    print(data.isnull().sum())
    print("\nChecking for infinite values:")
    print(np.isinf(data.select_dtypes(include=['float64', 'int64'])).sum())
    print("\nData types:")
    print(data.dtypes)

def limit_outliers(data, numeric_columns):
    for column in numeric_columns:
        q1 = data[column].quantile(0.25)
        q3 = data[column].quantile(0.75)
        iqr = q3 - q1
        min_val = q1 - 1.5 * iqr
        max_val = q3 + 1.5 * iqr
        data[column] = data[column].clip(lower=min_val, upper=max_val)
    return data

def reduce_features(data, target_column='Label', corr_cutoff=0.8):
    numeric_data = data.select_dtypes(include=['float64', 'int64']).columns.drop(target_column, errors='ignore')
    corr_matrix = data[numeric_data].corr().abs()
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    redundant_features = [col for col in upper_triangle.columns if any(upper_triangle[col] > corr_cutoff)]
    print(f"Highly correlated features to drop: {redundant_features}")
    
    features = data.drop(columns=[target_column, 'attack_cat'])
    labels = data[target_column]
    model = RandomForestClassifier(random_state=42, n_jobs=-1)
    model.fit(features, labels)
    
    importance_scores = pd.Series(model.feature_importances_, index=features.columns)
    key_features = importance_scores.nlargest(25).index.tolist()
    print(f"Top 25 features by importance: {key_features}")
    
    selected = key_features + ['attack_cat', target_column]
    trimmed_data = data[selected]
    print(f"Columns after feature selection: {trimmed_data.columns.tolist()}")
    return trimmed_data, redundant_features

def create_additional_features(data):
    required = ['sbytes', 'dbytes', 'Sload', 'Dload']
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
    
    data['sbytes_dbytes_ratio'] = data['sbytes'] / (data['dbytes'] + 1e-6)
    data['sload_dload_ratio'] = data['Sload'] / (data['Dload'] + 1e-6)
    
    new_features = ['sbytes_dbytes_ratio', 'sload_dload_ratio']
    scaler = StandardScaler()
    data[new_features] = scaler.fit_transform(data[new_features])
    
    return data

if __name__ == "__main__":
    input_file = 'unsw_nb15_preprocessed.csv'
    dataset = read_data(input_file)
    
    validate_data(dataset)
    
    numeric_cols = dataset.select_dtypes(include=['float64', 'int64']).columns.drop('Label', errors='ignore')
    dataset = limit_outliers(dataset, numeric_cols)
    
    dataset, removed = reduce_features(dataset)
    print(f"Dropped correlated features: {removed}")
    
    dataset = create_additional_features(dataset)
    print(f"Dataset shape after feature engineering: {dataset.shape}")
    
    dataset.to_csv('unsw_nb15_engineered.csv', index=False)
    print("Enhanced dataset saved as 'unsw_nb15_engineered.csv'")
    print(f"Final dataset columns: {dataset.columns.tolist()}")


Dataset shape: (100000, 41)
Columns in dataset: ['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label']
Checking for missing values:
proto               0
state               0
dur                 0
sbytes              0
dbytes              0
sttl                0
dttl                0
sloss               0
dloss               0
service             0
Sload               0
Dload               0
Spkts               0
Dpkts               0
swin                0
dwin                0
smeansz             0
dmeansz             0
trans_depth         0
res_bdy_len         

EDA

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
import os

warnings.filterwarnings('ignore')

sns.set_context('notebook', font_scale=1.2)
sns.set_style('whitegrid')

try:
    dataset = pd.read_csv('unsw_nb15_engineered.csv')
except FileNotFoundError:
    print("Error: 'unsw_nb15_engineered.csv' not found. Please ensure the file is in the working directory.")
    exit(1)

attack_categories = {
    0: 'Analysis', 1: 'Backdoor', 2: 'DoS', 3: 'Exploits', 4: 'Fuzzers',
    5: 'Generic', 6: 'Reconnaissance', 7: 'Shellcode', 8: 'Worms', 9: 'Normal',
    10: 'Class_10', 11: 'Class_11', 12: 'Class_12'
}
dataset['attack_cat_label'] = dataset['attack_cat'].map(attack_categories)

if not os.path.exists('eda_plots'):
    os.makedirs('eda_plots')

plt.figure(figsize=(10, 6))
sns.countplot(x='attack_cat_label', data=dataset, order=dataset['attack_cat_label'].value_counts().index)
plt.title('Distribution of Attack Categories')
plt.xlabel('Attack Category')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('eda_plots/class_distribution.png')
plt.close()

features_to_plot = ['dur', 'spkts', 'dpkts', 'sbytes']
valid_features = [feature for feature in features_to_plot if feature in dataset.columns]

if valid_features:
    plt.figure(figsize=(12, 8))
    for index, column in enumerate(valid_features[:4]):
        plt.subplot(2, 2, index + 1)
        sns.boxplot(x='attack_cat_label', y=column, data=dataset)
        plt.title(f'Distribution of {column} by Attack Category')
        plt.xlabel('Attack Category')
        plt.ylabel(column)
        plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('eda_plots/feature_distributions.png')
    plt.close()
else:
    print("Warning: None of the specified numerical features (dur, spkts, dpkts, sbytes) found in dataset.")

numeric_columns = dataset.select_dtypes(include=['float64', 'int64']).columns
numeric_columns = [col for col in numeric_columns if col not in ['attack_cat', 'Label']]

if numeric_columns:
    correlation_data = dataset[numeric_columns].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_data, annot=False, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix of Numerical Features')
    plt.tight_layout()
    plt.savefig('eda_plots/correlation_matrix.png')
    plt.close()
else:
    print("Warning: No numerical columns available for correlation matrix.")

if 'proto' in dataset.columns:
    plt.figure(figsize=(12, 6))
    sns.countplot(x='proto', hue='attack_cat_label', data=dataset, order=dataset['proto'].value_counts().index[:10])
    plt.title('Attack Categories by Network Protocol')
    plt.xlabel('Protocol')
    plt.ylabel('Count')
    plt.legend(title='Attack Category', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('eda_plots/attack_by_protocol.png')
    plt.close()
else:
    print("Warning: Column 'proto' not found in dataset.")

if 'time' in dataset.columns:
    try:
        dataset['time'] = pd.to_datetime(dataset['time'], errors='coerce')
        dataset['week'] = dataset['time'].dt.isocalendar().week
        weekly_attacks = dataset.groupby(['week', 'attack_cat_label']).size().unstack(fill_value=0)

        plt.figure(figsize=(12, 6))
        for attack_type in weekly_attacks.columns:
            plt.plot(weekly_attacks.index, weekly_attacks[attack_type], label=attack_type)
        plt.title('Weekly Attack Frequency')
        plt.xlabel('Week of Year')
        plt.ylabel('Number of Attacks')
        plt.legend(title='Attack Category', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('eda_plots/temporal_attack_patterns.png')
        plt.close()
    except Exception as e:
        print(f"Warning: Error processing temporal data: {e}")
else:
    print("Warning: Timestamp column 'time' not found in dataset.")

try:
    with open('tuned_models.pkl', 'rb') as file:
        models = pickle.load(file)

    model_features = dataset.drop(columns=['attack_cat', 'Label', 'attack_cat_label']).columns
    xgb = models.get('XGBoost')

    if xgb:
        importance_scores = xgb.feature_importances_
        top_features = np.argsort(importance_scores)[-10:]
        plt.figure(figsize=(10, 6))
        plt.barh(range(len(top_features)), importance_scores[top_features], color='teal', align='center')
        plt.yticks(range(len(top_features)), [model_features[i] for i in top_features])
        plt.xlabel('Feature Importance')
        plt.title('XGBoost Feature Importance')
        plt.tight_layout()
        plt.savefig('eda_plots/feature_importance_xgboost.png')
        plt.close()
    else:
        print("Warning: XGBoost model not found in tuned_models.pkl.")
except FileNotFoundError:
    print("Warning: 'tuned_models.pkl' not found. Skipping XGBoost feature importance plot.")

print("EDA plots generated in 'eda_plots' directory.")


EDA plots generated in 'eda_plots' directory.


Model Training

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, roc_curve
)
from sklearn.preprocessing import LabelBinarizer

warnings.filterwarnings('ignore')

def load_data(path):
    data = pd.read_csv(path)
    print(f"Dataset shape: {data.shape}")
    print(f"Columns: {data.columns.tolist()}")
    print(f"Target distribution:\n{data['attack_cat'].value_counts()}")

    if len(data['attack_cat'].unique()) < 2:
        raise ValueError("Insufficient classes for classification.")

    return data

def split_data(data, target='attack_cat'):
    features = data.drop(columns=[target, 'Label'])
    labels = data[target]
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

    label_names = {
        0: 'Analysis', 1: 'Backdoor', 2: 'DoS', 3: 'Exploits', 4: 'Fuzzers',
        5: 'Generic', 6: 'Reconnaissance', 7: 'Shellcode', 8: 'Worms', 9: 'Normal',
        10: 'Class_10', 11: 'Class_11', 12: 'Class_12'
    }
    class_names = [label_names.get(i, f'Class_{i}') for i in np.unique(labels)]

    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    print(f"Classes: {class_names}")

    return X_train, X_test, y_train, y_test, features.columns, class_names

def optimize_model(model, params, X, y):
    grid = GridSearchCV(model, params, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)
    grid.fit(X, y)
    print(f"Best for {model.__class__.__name__}: {grid.best_params_}, Score: {grid.best_score_:.4f}")
    return grid.best_estimator_

def show_confusion(y_true, y_pred, title, labels):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(f'{title} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{title.lower().replace(" ", "_")}.png')
    plt.close()

def show_roc(models, X, y, labels):
    binarizer = LabelBinarizer()
    y_bin = binarizer.fit_transform(y)
    plt.figure(figsize=(10, 8))
    colors = sns.color_palette("husl", len(models))

    for i, (name, model) in enumerate(models.items()):
        if hasattr(model, "predict_proba"):
            probs = model.predict_proba(X)
            fpr, tpr, _ = roc_curve(y_bin.ravel(), probs.ravel())
            auc = roc_auc_score(y_bin, probs, multi_class='ovr', average='macro')
            plt.plot(fpr, tpr, color=colors[i], label=f'{name} (AUC = {auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Macro-Average ROC Curves')
    plt.legend(loc='best', bbox_to_anchor=(1.05, 1))
    plt.tight_layout()
    plt.savefig('macro_roc_curves.png', bbox_inches='tight')
    plt.close()

def show_feature_importance(model, name, features):
    if hasattr(model, 'feature_importances_'):
        values = model.feature_importances_
        top = np.argsort(values)[-10:]
        plt.figure(figsize=(10, 6))
        plt.barh(range(len(top)), values[top], align='center')
        plt.yticks(range(len(top)), [features[i] for i in top])
        plt.xlabel('Importance')
        plt.title(f'{name} Feature Importance')
        plt.tight_layout()
        plt.savefig(f'feature_importance_{name.lower().replace(" ", "_")}.png')
        plt.close()

def compare_models(df):
    metrics = ['Accuracy', 'Precision (Weighted)', 'Recall (Weighted)', 'F1-Score (Weighted)']
    plt.figure(figsize=(12, 8))
    for metric in metrics:
        plt.plot(df['Model'], df[metric], marker='o', label=metric)
    plt.xlabel('Model')
    plt.ylabel('Score')
    plt.title('Model Performance')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('metric_comparison.png')
    plt.close()

def run_models(X_train, X_test, y_train, y_test, features, labels):
    weights = {cls: len(y_train)/(len(np.unique(y_train)) * sum(y_train == cls)) for cls in np.unique(y_train)}

    candidates = {
        'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1, class_weight=weights),
        'Logistic Regression': LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000, class_weight=weights),
        'SVM': SVC(random_state=42, probability=True),
        'KNN': KNeighborsClassifier(),
        'XGBoost': XGBClassifier(random_state=42, eval_metric='mlogloss')
    }

    params = {
        'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5]},
        'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'saga']},
        'SVM': {'C': [0.1, 1], 'kernel': ['rbf', 'linear']},
        'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
        'XGBoost': {'n_estimators': [100, 200], 'max_depth': [3, 6], 'learning_rate': [0.01, 0.1]}
    }

    results = []
    final_models = {}

    for name, model in candidates.items():
        print(f"\n{name} tuning...")
        start = time.time()
        tuned = optimize_model(model, params[name], X_train, y_train)
        tuned.fit(X_train, y_train)
        pred = tuned.predict(X_test)

        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, pred),
            'Precision (Macro)': precision_score(y_test, pred, average='macro', zero_division=0),
            'Recall (Macro)': recall_score(y_test, pred, average='macro', zero_division=0),
            'F1-Score (Macro)': f1_score(y_test, pred, average='macro', zero_division=0),
            'Precision (Weighted)': precision_score(y_test, pred, average='weighted', zero_division=0),
            'Recall (Weighted)': recall_score(y_test, pred, average='weighted', zero_division=0),
            'F1-Score (Weighted)': f1_score(y_test, pred, average='weighted', zero_division=0),
            'CV F1-Weighted Mean': cross_val_score(tuned, X_train, y_train, cv=5, scoring='f1_weighted', n_jobs=-1).mean(),
            'Training Time (s)': time.time() - start
        })

        final_models[name] = tuned
        show_confusion(y_test, pred, name, labels)
        show_feature_importance(tuned, name, features)
        print(f"\n{name} Report:\n{classification_report(y_test, pred, target_names=labels, zero_division=0)}")

    show_roc(final_models, X_test, y_test, labels)
    df_results = pd.DataFrame(results)
    print("\nPerformance Summary:")
    print(df_results)
    compare_models(df_results)

    preds = {name: model.predict(X_test) for name, model in final_models.items()}
    pd.DataFrame(preds).to_csv('model_predictions.csv', index=False)
    pd.Series(y_test, name='actual').to_csv('y_test.csv', index=False)
    with open('tuned_models.pkl', 'wb') as f:
        pickle.dump(final_models, f)

    return df_results, final_models

def choose_best(df):
    best = df.loc[df['F1-Score (Weighted)'].idxmax()]
    print("\nBest Model:")
    print(f"Model: {best['Model']}")
    print(f"Weighted F1: {best['F1-Score (Weighted)']:.4f}")
    print(f"CV Mean: {best['CV F1-Weighted Mean']:.4f}, Time: {best['Training Time (s)']:.2f}s")
    return best

if __name__ == "__main__":
    df = load_data('unsw_nb15_engineered.csv')
    X_train, X_test, y_train, y_test, feature_names, class_labels = split_data(df)
    results, models = run_models(X_train, X_test, y_train, y_test, feature_names, class_labels)
    best = choose_best(results)
    results.to_csv('model_performance_multiclass.csv', index=False)
    print("\nPerformance saved as 'model_performance_multiclass.csv'")


Dataset shape: (100000, 29)
Columns: ['ct_state_ttl', 'sttl', 'Dload', 'dmeansz', 'dttl', 'dbytes', 'Dpkts', 'synack', 'Sload', 'ackdat', 'sbytes', 'smeansz', 'tcprtt', 'Spkts', 'dloss', 'Dintpkt', 'Sintpkt', 'dur', 'state', 'Djit', 'sloss', 'ct_dst_ltm', 'ct_dst_sport_ltm', 'ct_srv_src', 'Sjit', 'attack_cat', 'Label', 'sbytes_dbytes_ratio', 'sload_dload_ratio']
Target distribution:
attack_cat
9     55822
8      8024
1      7517
7      7464
2      7400
0      5160
4      2726
5      1833
10     1790
3      1310
6       549
11      228
12      177
Name: count, dtype: int64
Train shape: (80000, 27), Test shape: (20000, 27)
Classes: ['Analysis', 'Backdoor', 'DoS', 'Exploits', 'Fuzzers', 'Generic', 'Reconnaissance', 'Shellcode', 'Worms', 'Normal', 'Class_10', 'Class_11', 'Class_12']

Random Forest tuning...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best for RandomForestClassifier: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}, Score: 0.7864

Random For

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best for LogisticRegression: {'C': 10, 'solver': 'lbfgs'}, Score: 0.6808


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Logistic Regression Report:
                precision    recall  f1-score   support

      Analysis       0.45      0.29      0.35      1032
      Backdoor       0.53      0.31      0.39      1503
           DoS       0.50      0.16      0.24      1480
      Exploits       0.18      0.37      0.24       262
       Fuzzers       0.22      0.27      0.24       545
       Generic       0.11      0.34      0.16       367
Reconnaissance       0.10      0.73      0.17       110
     Shellcode       0.37      0.08      0.13      1493
         Worms       0.70      0.35      0.47      1605
        Normal       0.99      0.96      0.98     11164
      Class_10       0.14      0.61      0.23       358
      Class_11       0.02      0.35      0.05        46
      Class_12       0.05      0.80      0.09        35

      accuracy                           0.66     20000
     macro avg       0.33      0.43      0.29     20000
  weighted avg       0.75      0.66      0.68     20000


SVM tuning...
F

Model performance plots

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import os

# Load model performance data
performance = pd.read_csv('model_performance_multiclass.csv')

# Check available columns
print("Available columns in performance file:", performance.columns.tolist())

# Plot bar chart for weighted F1-Score
plt.figure(figsize=(10, 6))
sns.barplot(x='Model', y='F1-Score (Weighted)', data=performance)
plt.title('Weighted F1-Score Comparison')
plt.ylabel('Weighted F1-Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('visual_f1_score_comparison.png')
plt.close()

# Plot training time
plt.figure(figsize=(10, 6))
sns.barplot(x='Model', y='Training Time (s)', data=performance)
plt.title('Training Time by Model')
plt.ylabel('Training Time (s)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('visual_training_time_comparison.png')
plt.close()

# Plot CV F1-Weighted Mean if available
if 'CV F1-Weighted Mean' in performance.columns:
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Model', y='CV F1-Weighted Mean', data=performance)
    plt.title('Cross-Validation F1-Weighted Mean')
    plt.ylabel('CV F1-Weighted Mean')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('visual_cv_f1_weighted_mean.png')
    plt.close()
else:
    print("CV F1-Weighted Mean not found in CSV.")

# Optional violin plot to show variability (only if Std column exists)
if 'CV F1-Weighted Std' in performance.columns:
    cv_distributions = {}
    for model_name in performance['Model']:
        mean_score = performance.loc[performance['Model'] == model_name, 'CV F1-Weighted Mean'].values[0]
        std_dev = performance.loc[performance['Model'] == model_name, 'CV F1-Weighted Std'].values[0]
        # Simulate distribution
        cv_distributions[model_name] = np.random.normal(mean_score, std_dev, 1000)

    dist_df = pd.DataFrame(cv_distributions)
    dist_df = dist_df.melt(var_name='Model', value_name='CV F1 Score')

    plt.figure(figsize=(12, 6))
    sns.violinplot(x='Model', y='CV F1 Score', data=dist_df)
    plt.title('Simulated CV F1-Score Distribution')
    plt.ylabel('F1 Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('visual_violin_cv_f1_score_distribution.png')
    plt.close()



Available columns in performance file: ['Model', 'Accuracy', 'Precision (Macro)', 'Recall (Macro)', 'F1-Score (Macro)', 'Precision (Weighted)', 'Recall (Weighted)', 'F1-Score (Weighted)', 'CV F1-Weighted Mean', 'Training Time (s)']
