Part 1: Setup and Installations

In [4]:
# 1. Essential Pip Installs
!pip install deap scikit-learn matplotlib seaborn pandas numpy plotly requests

import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from deap import base, creator, tools, algorithms

# 2. Data Download Script (skips files that are already present)
# Each entry is (direct-download URL, local file name inside DATA_DIR).
file_urls = [
    ("https://drive.google.com/uc?export=download&id=1cictwnxUyu1vCa4H9iefIrQeVLCC3RCv", "benign-chrome.csv"),
    ("https://drive.google.com/uc?export=download&id=1cms99qEylyvesqcX3dQRZOUQRAONy2uS", "benign-firefox.csv"),
    ("https://drive.google.com/uc?export=download&id=1cqDL7A_kdOCL4Km4uUifRPllFmB3WaZ_", "mal-dns2tcp.csv"),
    ("https://drive.google.com/uc?export=download&id=1cxeTvXNV-OY_4T6xs4sUB98lmanROw3m", "mal-dnscat2.csv"),
    ("https://drive.google.com/uc?export=download&id=1czNRMpNyicFNYW2fbK_WjsoF77qB9_XA", "mal-iodine.csv")
]

DATA_DIR = "DoHBrw-2020"
DOWNLOAD_TIMEOUT_S = 60  # connect/read timeout so a stalled server cannot hang the cell

# exist_ok makes a separate existence check unnecessary.
os.makedirs(DATA_DIR, exist_ok=True)

print("--- Checking Data Files ---")
for url, filename in file_urls:
    file_path = os.path.join(DATA_DIR, filename)
    if os.path.exists(file_path):
        print(f"{filename} already exists.")
        continue

    # Stream into a temporary sibling file and rename only on success.
    # Otherwise an interrupted download would leave a truncated CSV that the
    # next run reports as "already exists".
    partial_path = file_path + ".part"
    try:
        print(f"Downloading {filename}...")
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT_S) as response:
            # Fail on 4xx/5xx instead of saving the error body as data.
            response.raise_for_status()
            # NOTE(review): the host may answer with an HTML page (for example
            # a confirmation or quota notice) instead of the file; refuse to
            # store that as a CSV.
            content_type = response.headers.get("Content-Type", "")
            if content_type.lower().startswith("text/html"):
                raise ValueError(f"expected CSV data but received an HTML page ({content_type})")
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, file_path)
        print(f"{filename} ready.")
    except (requests.RequestException, OSError, ValueError) as e:
        # Best effort: report and move on to the next file, but never leave a
        # partial download behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Error downloading {filename}: {e}")

--- Checking Data Files ---
benign-chrome.csv already exists.
benign-firefox.csv already exists.
mal-dns2tcp.csv already exists.
mal-dnscat2.csv already exists.
mal-iodine.csv already exists.


Part 2: Data Preprocessing (Exact Logic)

In [5]:
# --- Load the raw CSVs and attach integer class labels ---
# Label scheme: 0 = benign (chrome + firefox), 1 = iodine, 2 = dns2tcp, 3 = dnscat2.
benign_frames = [
    pd.read_csv('DoHBrw-2020/benign-chrome.csv'),
    pd.read_csv('DoHBrw-2020/benign-firefox.csv'),
]
df_benign = pd.concat(benign_frames, ignore_index=True)
df_benign['labels'] = 0

df1_malic = pd.read_csv('DoHBrw-2020/mal-iodine.csv')
df1_malic['labels'] = 1

df2_malic = pd.read_csv('DoHBrw-2020/mal-dns2tcp.csv')
df2_malic['labels'] = 2

df3_malic = pd.read_csv('DoHBrw-2020/mal-dnscat2.csv')
df3_malic['labels'] = 3

# Stack all classes into one frame, then shuffle rows with a fixed seed.
combined = pd.concat([df_benign, df1_malic, df2_malic, df3_malic], ignore_index=True)
data = shuffle(combined, random_state=1)

# --- Cleaning ---
# Columns holding a single distinct value carry no information for the model.
constant_columns = [name for name in data.columns if data[name].nunique() == 1]
data_dropped = data.drop(columns=constant_columns)
data_filled = data_dropped.fillna(0)

# Feature matrix: everything except the label and the identifier columns
# (errors='ignore' tolerates identifier columns that are already absent).
non_feature_columns = ["TimeStamp", "labels", "SourceIP", "DestinationIP"]
X = data_filled.drop(columns=non_feature_columns, errors='ignore')
y = data_filled['labels'].to_numpy()

# --- Imputation and scaling ---
# NOTE(review): NaNs were already replaced by 0 above, so the mean imputer has
# nothing left to fill; it is kept to mirror the original pipeline.
# NOTE(review): imputer and scaler are fit on the full dataset before the
# split below, so test rows contribute to the fitted statistics.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)

# --- Split: 50% test, then 25% of the remainder as validation ---
# Resulting proportions of the full data: 37.5% train / 12.5% val / 50% test.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_scaled, y, test_size=0.5, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=1
)

  data_filled = data_dropped.fillna(0)


Part 3: Replicate Experiment & Generate New Figures

In [6]:
# Per-generation traces that the GA loop appends to and the figures plot.
best_fitness_history = list()   # fitness value of each generation's best individual
feature_count_history = list()  # sum of that individual's bits (selected feature count)

def evaluate(individual):
    """Score one feature mask for the genetic algorithm.

    Parameters
    ----------
    individual : sequence of 0/1 values
        Position ``i`` set to 1 means column ``i`` of ``X_train`` / ``X_val``
        is used.

    Returns
    -------
    tuple
        One-element tuple holding the weighted F1 score on the validation
        split minus ``0.01 * (fraction of features selected)``. An individual
        with no selected feature scores ``(0,)`` without training a model.
    """
    chosen_columns = [idx for idx, gene in enumerate(individual) if gene == 1]
    if len(chosen_columns) == 0:
        return (0,)

    forest = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
    forest.fit(X_train[:, chosen_columns], y_train)
    val_predictions = forest.predict(X_val[:, chosen_columns])
    weighted_f1 = f1_score(y_val, val_predictions, average='weighted')

    # Small penalty proportional to subset size, so smaller subsets win ties.
    size_penalty = 0.01 * (sum(individual) / len(individual))
    return (weighted_f1 - size_penalty,)

# Execute GA
import random

# Seed both generators so the run is reproducible: the initial bits come from
# np.random.randint, while DEAP's variation and selection operators draw from
# the standard-library `random` module.
GA_SEED = 42
random.seed(GA_SEED)
np.random.seed(GA_SEED)

# creator.create registers classes on the shared deap.creator module; guard
# the calls so re-running this cell does not redefine them.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X_scaled.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Selection keeps only offspring (no elitism), so the best individual of an
# earlier generation can be lost. The hall of fame remembers the best one
# evaluated over the whole run.
hall_of_fame = tools.HallOfFame(1)

pop = toolbox.population(n=20)
for gen in range(20):
    offspring = algorithms.varAnd(pop, toolbox, cxpb=0.5, mutpb=0.2)

    # varAnd invalidates the fitness of individuals it modified; untouched
    # clones keep a valid fitness. Each evaluation trains a forest with a
    # fixed random_state, so only the invalid ones need scoring.
    unevaluated = [ind for ind in offspring if not ind.fitness.valid]
    for ind, fit in zip(unevaluated, map(toolbox.evaluate, unevaluated)):
        ind.fitness.values = fit

    hall_of_fame.update(offspring)
    pop = toolbox.select(offspring, k=len(pop))

    # Per-generation trace (best of the selected population) for the figures.
    best = tools.selBest(pop, 1)[0]
    best_fitness_history.append(best.fitness.values[0])
    feature_count_history.append(sum(best))

# Downstream cells read `best`: hand them the best individual of the whole
# run rather than whatever survived the last generation.
best = hall_of_fame[0]

# --- GENERATE REVIEWER FIGURES ---
# Figure: Convergence and Reduction
# Left panel: best fitness per generation. Right panel: size of that
# individual's feature subset per generation.
fig, (ax_fitness, ax_features) = plt.subplots(1, 2, figsize=(12, 5))

ax_fitness.plot(best_fitness_history, 'b-o')
ax_fitness.set_title('GA Fitness Evolution')
ax_fitness.set_xlabel('Generation')
ax_fitness.set_ylabel('Fitness Score')

ax_features.plot(feature_count_history, 'r-s')
ax_features.set_title('Feature Subset Reduction')
ax_features.set_xlabel('Generation')
ax_features.set_ylabel('Number of Features')

fig.tight_layout()
plt.show()

# Figure: Professional Confusion Matrix
# Keep only the columns whose bit is set in the chosen individual. Without
# the `bit == 1` filter every column index is kept and the final model
# ignores the GA's feature selection.
best_indices = [i for i, bit in enumerate(best) if bit == 1]
final_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
final_model.fit(X_train[:, best_indices], y_train)
y_pred = final_model.predict(X_test[:, best_indices])

# Display names in label order: 0 = Benign, 1 = Iodine, 2 = DNS2TCP, 3 = Dnscat2.
class_names = ['Benign', 'Iodine', 'DNS2TCP', 'Dnscat2']

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
# confusion_matrix puts true labels on rows and predictions on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Final Model Performance Matrix'); plt.show()

print("\n--- Final Metrics for Journal Section IV ---")
print(classification_report(y_test, y_pred, target_names=class_names))

KeyboardInterrupt: 

In [None]:
import time
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve

# --- 1. Statistical Significance (K-Fold) ---
# 5-fold weighted F1 over the full scaled dataset, restricted to the selected
# feature columns.
final_clf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
cv_scores = cross_val_score(final_clf, X_scaled[:, best_indices], y, cv=5, scoring='f1_weighted')
print(f"K-Fold F1-Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# cross_val_score fits clones of the estimator, so `final_clf` itself is still
# unfitted at this point. Fit it on the training split before any
# predict / predict_proba call, which would otherwise raise NotFittedError.
final_clf.fit(X_train[:, best_indices], y_train)

# --- 2. Inference Speed Test ---
# Time one-row predictions over distinct test rows (up to 1000) instead of
# predicting the same first row repeatedly.
n_latency_samples = min(1000, X_test.shape[0])
start_time = time.perf_counter()  # monotonic clock intended for interval timing
for row in range(n_latency_samples):
    final_clf.predict(X_test[row:row + 1, best_indices])
end_time = time.perf_counter()
print(f"Inference Latency per {n_latency_samples} packets: {(end_time - start_time):.4f} seconds")

# --- 3. PR Curve (New Figure for Clarity) ---
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve, average_precision_score

# One-vs-rest view of the test labels: one indicator column per class,
# matched against the classifier's per-class probability columns.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3])
y_score = final_clf.predict_proba(X_test[:, best_indices])

colors = ['purple', 'orange', 'green', 'blue']
classes = ['Benign', 'Iodine', 'DNS2TCP', 'Dnscat2']

pr_fig, pr_ax = plt.subplots(figsize=(7, 5))

# One precision-recall curve per class, labelled with its average precision.
for i, (class_name, line_color) in enumerate(zip(classes, colors)):
    class_truth = y_test_bin[:, i]
    class_score = y_score[:, i]
    precision, recall, _ = precision_recall_curve(class_truth, class_score)
    avg_precision = average_precision_score(class_truth, class_score)
    pr_ax.plot(recall, precision, color=line_color,
               label=f'{class_name} (AP = {avg_precision:.2f})')

pr_ax.set_title('Precision-Recall Curve (Multi-class Analysis)')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.legend(loc='best')
pr_ax.grid(alpha=0.3)
plt.show()