In [None]:
pip install pandas numpy scikit-learn xgboost

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier



In [None]:
# Raw capture CSV -> cleaned CSV written by this cell (one entry per day file).
RAW_TO_PROCESSED = {
    "Monday-WorkingHours.pcap_ISCX.csv": "processed_monday.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv": "processed_tuesday.csv",
    "Wednesday-workingHours.pcap_ISCX.csv": "processed_wednesday.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv": "processed_thursday_morning.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv": "processed_thursday_afternoon.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv": "processed_friday_morning.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv": "processed_friday_portscan.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv": "processed_friday_ddos.csv",
}


def preprocess_day(raw_path, out_path):
    """Clean one raw capture CSV and write it back with a binary label.

    Steps, in order:
      1. strip surrounding whitespace from the column names,
      2. turn +/-inf into NaN and drop every row containing a NaN,
      3. map 'Label' to 0 for 'BENIGN' and 1 for anything else,
      4. write the result to ``out_path`` without the index.

    Parameters
    ----------
    raw_path : str
        Path of the raw CSV to read.
    out_path : str
        Path of the cleaned CSV to write.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to ``out_path``.
    """
    raw = pd.read_csv(raw_path)
    raw.columns = raw.columns.str.strip()

    # Non-in-place cleaning: each step returns a new frame.
    cleaned = raw.replace([np.inf, -np.inf], np.nan).dropna()

    # Vectorised equivalent of apply(lambda x: 0 if x == 'BENIGN' else 1).
    cleaned = cleaned.assign(Label=cleaned['Label'].ne('BENIGN').astype(int))

    cleaned.to_csv(out_path, index=False)
    return cleaned


for raw_path, out_path in RAW_TO_PROCESSED.items():
    # `df` ends up bound to the last cleaned frame, as in the original per-file cells.
    df = preprocess_day(raw_path, out_path)


In [None]:
import pandas as pd

# Cleaned per-day CSVs to sanity-check.
files = [
    "processed_monday.csv",
    "processed_tuesday.csv",
    "processed_wednesday.csv",
    "processed_thursday_morning.csv",
    "processed_thursday_afternoon.csv",
    "processed_friday_morning.csv",
    "processed_friday_portscan.csv",
    "processed_friday_ddos.csv"
]

# Print the 0/1 label counts of each file; only the Label column is needed,
# so only that column is read from disk.
for f in files:
    labels = pd.read_csv(f, usecols=['Label'])['Label']
    print(f, "→", labels.value_counts().to_dict())


In [None]:
import pandas as pd

# Cleaned per-day CSVs that make up the full dataset.
files = [
    "processed_monday.csv",
    "processed_tuesday.csv",
    "processed_wednesday.csv",
    "processed_thursday_morning.csv",
    "processed_thursday_afternoon.csv",
    "processed_friday_morning.csv",
    "processed_friday_portscan.csv",
    "processed_friday_ddos.csv"
]

# Load each file, then stack the frames row-wise with a fresh 0..n-1 index.
dfs = []
for csv_path in files:
    dfs.append(pd.read_csv(csv_path))
data = pd.concat(dfs, ignore_index=True)

print("FINAL LABEL DISTRIBUTION:")
print(data['Label'].value_counts())


In [None]:
# Both classes must be present, and nothing else, before any model is trained.
found_labels = set(data['Label'].unique())
assert found_labels == {0, 1}, f"Expected labels {{0, 1}}, found {found_labels}"
print("✅ Dataset verified: both classes present")


In [None]:
from sklearn.model_selection import train_test_split

# Target is the binary Label column; every other column is a feature.
y = data['Label']
X = data.drop(columns=['Label'])

# 80/20 split, seeded, stratified on the label so both parts keep the class ratio.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train distribution:\n", y_train.value_counts())
print("Test distribution:\n", y_test.value_counts())


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn mean/variance from the training split only, then apply to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.ensemble import IsolationForest
import numpy as np

# Unsupervised forest fitted on the scaled training features (labels are not used).
iso = IsolationForest(
    n_estimators=100, contamination=0.1, random_state=42, n_jobs=-1
).fit(X_train_scaled)

# Per-row anomaly score for each split.
train_anomaly = iso.decision_function(X_train_scaled)
test_anomaly = iso.decision_function(X_test_scaled)

# Append the score as one extra trailing column of the feature matrix.
X_train_if = np.hstack([X_train_scaled, train_anomaly[:, None]])
X_test_if = np.hstack([X_test_scaled, test_anomaly[:, None]])


In [None]:
from xgboost import XGBClassifier
from collections import Counter

# Weight the positive class by the negative/positive ratio of the training labels.
counter = Counter(y_train)
scale_pos_weight = counter[0] / counter[1]

xgb_params = {
    "n_estimators": 200,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'binary:logistic',
    "eval_metric": 'logloss',
    "scale_pos_weight": scale_pos_weight,
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)

# Train on scaled features plus the Isolation Forest score column.
xgb.fit(X_train_if, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict the held-out split and compute the three summaries before printing them.
y_pred = xgb.predict(X_test_if)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
class_names = ['BENIGN', 'ATTACK']

# Annotated heatmap of the confusion matrix: rows are actual, columns are predicted.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("Actual Label")
ax.set_title("Confusion Matrix Heatmap")
plt.show()


In [None]:
# Rank features by variance (largest first) and keep the first 15 names.
feature_variances = data.drop(columns=['Label']).var()
ranked_variances = feature_variances.sort_values(ascending=False)
top_features = ranked_variances.index[:15]

# Pairwise correlation among those 15 features.
corr = data[top_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap (Top 15 Features)")
plt.show()


In [None]:
# Bar chart of how many rows carry each label in the combined dataset.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=data['Label'], ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['BENIGN', 'ATTACK'])
ax.set_title("Class Distribution in CICIDS2017 Dataset")
ax.set_ylabel("Number of Samples")
plt.show()


In [None]:
# Histogram of the Isolation Forest scores computed on the training split.
fig, ax = plt.subplots()
ax.hist(train_anomaly, bins=50)
ax.set_xlabel("Anomaly Score")
ax.set_ylabel("Frequency")
ax.set_title("Isolation Forest Anomaly Score Distribution")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Probability of the positive class (column 1) for every test row.
y_prob = xgb.predict_proba(X_test_if)[:, 1]

# Only the two rate arrays are needed; the thresholds array is discarded.
fpr, tpr = roc_curve(y_test, y_prob)[:2]
auc = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
ax.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# Off-diagonal cells of the confusion matrix:
# [0, 1] = actual 0 predicted 1, [1, 0] = actual 1 predicted 0.
cm = confusion_matrix(y_test, y_pred)
fp, fn = cm[0, 1], cm[1, 0]

error_counts = {"False Positives": fp, "False Negatives": fn}

fig, ax = plt.subplots()
ax.bar(list(error_counts.keys()), list(error_counts.values()))
ax.set_ylabel("Count")
ax.set_title("False Positives vs False Negatives")
plt.show()

In [None]:
# Class predictions of the fitted XGBoost model for the test matrix
# (scaled features plus anomaly-score column); same call as in the evaluation cell.
y_pred = xgb.predict(X_test_if)


In [None]:
# Positive-class probability for each test row.
y_prob = xgb.predict_proba(X_test_if)[:, 1]

# Label a row 1 only when its probability reaches the raised cut-off.
threshold = 0.7
y_pred_custom = np.where(y_prob >= threshold, 1, 0)


In [None]:
from sklearn.metrics import confusion_matrix

# Error counts for the custom-threshold predictions.
cm = confusion_matrix(y_test, y_pred_custom)
fp, fn = cm[0, 1], cm[1, 0]

print("FP:", fp)
print("FN:", fn)


In [None]:
from sklearn.metrics import confusion_matrix

# Cut-offs tried so far: 0.7 -> 0.75 -> 0.8
threshold = 0.8

y_prob = xgb.predict_proba(X_test_if)[:, 1]
y_pred_new = np.where(y_prob >= threshold, 1, 0)

# Error counts at this cut-off.
cm = confusion_matrix(y_test, y_pred_new)
FP, FN = cm[0, 1], cm[1, 0]

print("FP:", FP)
print("FN:", FN)


In [None]:
#zero day attack

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Separate benign and attack samples
benign_data = data[data['Label'] == 0]
attack_data = data[data['Label'] == 1]

X_benign = benign_data.drop('Label', axis=1)
X_attack = attack_data.drop('Label', axis=1)

# Hold out 20% of the benign rows for evaluation. The forest's decision threshold
# is set from `contamination` on the rows it is fitted on, so scoring those same
# rows would report a false-positive rate of about `contamination` by construction.
X_benign_fit, X_benign_eval = train_test_split(
    X_benign, test_size=0.2, random_state=42
)

# Scale using only the benign traffic the detector is fitted on
scaler_zd = StandardScaler()
X_benign_fit_scaled = scaler_zd.fit_transform(X_benign_fit)
X_benign_scaled = scaler_zd.transform(X_benign_eval)   # unseen benign rows
X_attack_scaled = scaler_zd.transform(X_attack)

# Train Isolation Forest ONLY on BENIGN traffic (attacks are never seen in training)
iso_zero_day = IsolationForest(
    n_estimators=100,
    contamination=0.02,
    random_state=42,
    n_jobs=-1
)

iso_zero_day.fit(X_benign_fit_scaled)

# Predict anomalies: the detector returns -1 for anomaly and 1 for normal
benign_pred = iso_zero_day.predict(X_benign_scaled)
attack_pred = iso_zero_day.predict(X_attack_scaled)

print("Benign detected as anomaly (FP):", (benign_pred == -1).sum())
print("Attacks detected as anomaly (TP):", (attack_pred == -1).sum())


In [None]:
# Reference labels for the zero-day test, in the same order the predictions
# are concatenated: all benign rows (0) first, then all attack rows (1).
n_benign_rows = len(benign_pred)
n_attack_rows = len(attack_pred)
y_true_zero = [0] * n_benign_rows + [1] * n_attack_rows


In [None]:
# Map detector output to class labels: 1 (normal) -> 0, anything else -> 1.
# Benign predictions come first, then attack predictions.
y_pred_zero = [
    int(p != 1)
    for p in list(benign_pred) + list(attack_pred)
]


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy is kept in `zero_day_accuracy` for the final summary table.
zero_day_accuracy = accuracy_score(y_true_zero, y_pred_zero)
zero_day_report = classification_report(y_true_zero, y_pred_zero)

print("Zero-Day Detection Accuracy:", zero_day_accuracy)

print(zero_day_report)


In [None]:
# Partition the combined frame by label, then drop the label to leave features only.
benign_mask = data['Label'].eq(0)
attack_mask = data['Label'].eq(1)

benign_data = data[benign_mask]
attack_data = data[attack_mask]

X_benign = benign_data.drop(columns=['Label'])
X_attack = attack_data.drop(columns=['Label'])


In [None]:
from sklearn.preprocessing import StandardScaler

# Scaling statistics come from benign rows only; attack rows reuse them.
# NOTE: this rebinds the name `scaler`.
scaler = StandardScaler().fit(X_benign)
X_benign_scaled = scaler.transform(X_benign)
X_attack_scaled = scaler.transform(X_attack)


In [None]:
from sklearn.ensemble import IsolationForest

iso_zero_day_params = {
    "n_estimators": 100,
    "contamination": 0.02,
    "random_state": 42,
    "n_jobs": -1,
}
iso_zero_day = IsolationForest(**iso_zero_day_params)

# Fit on the scaled benign rows only.
iso_zero_day.fit(X_benign_scaled)


In [None]:
# Predict on benign and attack data
# (the list comprehensions in the following cell treat 1 as "normal" and any
# other value as "anomaly").
benign_pred = iso_zero_day.predict(X_benign_scaled)
attack_pred = iso_zero_day.predict(X_attack_scaled)


In [None]:
# Reference labels: benign rows (0) first, then attack rows (1).
n_benign_rows = len(benign_pred)
n_attack_rows = len(attack_pred)
y_true = [0] * n_benign_rows + [1] * n_attack_rows

# Predicted labels in the same order: detector output 1 -> 0, anything else -> 1.
# NOTE: this rebinds `y_pred`, which earlier held the XGBoost test predictions.
y_pred = [
    int(p != 1)
    for p in list(benign_pred) + list(attack_pred)
]


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute each summary first, then print them in order.
zd_accuracy = accuracy_score(y_true, y_pred)
zd_confusion = confusion_matrix(y_true, y_pred)
zd_report = classification_report(y_true, y_pred)

print("Zero-Day Detection Accuracy:", zd_accuracy)

print("\nConfusion Matrix:\n", zd_confusion)

print("\nClassification Report:\n", zd_report)


In [None]:
#concept drift

from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Files the model is fitted on (early traffic)
train_files = ["processed_monday.csv", "processed_tuesday.csv"]

# Files the fitted model is scored on (later traffic)
test_files = ["processed_friday_portscan.csv", "processed_friday_ddos.csv"]

# Read each group of files and stack the rows.
train_df = pd.concat(list(map(pd.read_csv, train_files)), axis=0)
test_df = pd.concat(list(map(pd.read_csv, test_files)), axis=0)

# Features / target for both groups.
y_train_drift = train_df['Label']
X_train_drift = train_df.drop(columns=['Label'])

y_test_drift = test_df['Label']
X_test_drift = test_df.drop(columns=['Label'])

# Scaling statistics come from the early traffic only.
scaler_drift = StandardScaler().fit(X_train_drift)
X_train_drift_scaled = scaler_drift.transform(X_train_drift)
X_test_drift_scaled = scaler_drift.transform(X_test_drift)

# Model fitted on the early traffic
drift_params = {
    "n_estimators": 150,
    "max_depth": 6,
    "learning_rate": 0.1,
    "eval_metric": 'logloss',
    "random_state": 42,
}
xgb_drift = XGBClassifier(**drift_params)
xgb_drift.fit(X_train_drift_scaled, y_train_drift)

# Score on the later traffic
y_pred_drift = xgb_drift.predict(X_test_drift_scaled)
drift_accuracy = accuracy_score(y_test_drift, y_pred_drift)

print("Accuracy under concept drift:", drift_accuracy)

print(classification_report(y_test_drift, y_pred_drift))


In [None]:
# Retrain using new (Friday) data
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Keep 20% of the Friday rows out of the retraining set so the post-retraining
# score is measured on rows the model was not fitted on. Stratify on the label
# so both parts keep the Friday class ratio.
adapt_df, holdout_df = train_test_split(
    test_df, test_size=0.2, random_state=42, stratify=test_df['Label']
)

combined_df = pd.concat([train_df, adapt_df], axis=0)

X_new = combined_df.drop('Label', axis=1)
y_new = combined_df['Label']

scaler_new = StandardScaler()
X_new_scaled = scaler_new.fit_transform(X_new)

# Refit the same estimator object on early + Friday-adaptation rows.
xgb_drift.fit(X_new_scaled, y_new)

# Test again. The holdout must be scaled with `scaler_new`, the scaler the
# retrained model was fitted with, not with the earlier `scaler_drift`.
X_holdout_scaled = scaler_new.transform(holdout_df.drop('Label', axis=1))
y_holdout = holdout_df['Label']
y_pred_retrained = xgb_drift.predict(X_holdout_scaled)

print("Accuracy after retraining:",
      accuracy_score(y_holdout, y_pred_retrained))


In [None]:
import pandas as pd

# Time-based chunks (ordered)
# NOTE: nothing reads this list before a later cell rebinds `chunks` to a
# three-file list; the incremental-retraining loop iterates over that later value.
chunks = [
    "processed_monday.csv",
    "processed_tuesday.csv",
    "processed_wednesday.csv",
    "processed_thursday_morning.csv",
    "processed_friday_ddos.csv"
]


In [None]:
import pandas as pd

monday = pd.read_csv("processed_monday.csv")
tuesday = pd.read_csv("processed_tuesday.csv")

# Optional: take only a small part of Tuesday (up to 5000 rows per class).
# DataFrame.sample raises if n exceeds the number of available rows, so the
# request is capped at the class size.
tuesday_attack_all = tuesday[tuesday['Label'] == 1]
tuesday_benign_all = tuesday[tuesday['Label'] == 0]

tuesday_attack = tuesday_attack_all.sample(
    n=min(5000, len(tuesday_attack_all)), random_state=42
)
tuesday_benign = tuesday_benign_all.sample(
    n=min(5000, len(tuesday_benign_all)), random_state=42
)

# Seed training set: all of Monday plus the two Tuesday samples.
# NOTE: this rebinds `train_df`, which the concept-drift cells also use.
train_df = pd.concat([monday, tuesday_attack, tuesday_benign], axis=0)

print("Seed dataset label distribution:")
print(train_df['Label'].value_counts())


In [None]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# NOTE: X_train, y_train, scaler and X_train_scaled are rebound here; from this
# point on they refer to the seed dataset, not to the main train/test split.
X_train = train_df.drop('Label', axis=1)
y_train = train_df['Label']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initial model for the incremental-retraining experiment.
model = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    base_score=0.5,
    random_state=42
)

model.fit(X_train_scaled, y_train)
print("✅ Initial model trained successfully")


In [None]:
# Files fed one at a time to the incremental-retraining loop, in list order.
chunks = [
    "processed_wednesday.csv",
    "processed_thursday_morning.csv",
    "processed_friday_ddos.csv"
]


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# One (file, accuracy_before, accuracy_after) tuple per incoming chunk.
results = []

for file in chunks:
    print(f"\n📥 New incoming data: {file}")

    new_df = pd.read_csv(file)

    # Keep 20% of the chunk out of training so "before" and "after" are both
    # measured on the same rows, and those rows are never fitted on.
    adapt_df, eval_df = train_test_split(
        new_df, test_size=0.2, random_state=42, stratify=new_df['Label']
    )

    X_new = eval_df.drop('Label', axis=1)
    y_new = eval_df['Label']

    # 🔹 Performance BEFORE retraining (concept drift effect),
    # using the scaler as currently fitted.
    y_pred_before = model.predict(scaler.transform(X_new))
    acc_before = accuracy_score(y_new, y_pred_before)

    # 🔹 Incremental update (append the adaptation part of the new data)
    train_df = pd.concat([train_df, adapt_df], axis=0)

    X_train = train_df.drop('Label', axis=1)
    y_train = train_df['Label']

    X_train_scaled = scaler.fit_transform(X_train)
    model.fit(X_train_scaled, y_train)

    # 🔹 Performance AFTER retraining. The scaler was just refitted, so the
    # evaluation rows must be transformed again; reusing the pre-refit scaling
    # would feed the model features on a different scale than it was trained on.
    X_new_scaled = scaler.transform(X_new)
    y_pred_after = model.predict(X_new_scaled)
    acc_after = accuracy_score(y_new, y_pred_after)

    results.append((file, acc_before, acc_after))


In [None]:
# Report the before/after accuracy recorded for every chunk, to 3 decimals.
print("\n📊 Incremental Retraining Results")
for f, before, after in results:
    print(f"{f}")
    print(f"  Accuracy before retraining : {before:.3f}")
    print(f"  Accuracy after retraining  : {after:.3f}")


In [None]:
# y_new and y_pred_before hold whatever the final iteration of the retraining
# loop left in them, so this value refers to the last chunk only.
acc_before = accuracy_score(y_new, y_pred_before)
print("Accuracy before retraining (concept drift):", acc_before)


In [None]:
# Same as the cell above: y_pred_after comes from the final loop iteration,
# so this value refers to the last chunk only.
acc_after = accuracy_score(y_new, y_pred_after)
print("Accuracy after incremental retraining:", acc_after)


In [None]:
from sklearn.metrics import accuracy_score

# `y_pred` was rebound to the zero-day label list in an earlier cell; recompute
# the XGBoost test-set predictions so it matches `y_test` again.
y_pred = xgb.predict(X_test_if)
print("Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
# Accuracy of the main XGBoost model on the test split, stored for the summary table.
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Baseline Accuracy:", baseline_accuracy)


In [None]:
# One (scenario, accuracy) row per experiment. acc_before / acc_after hold the
# values most recently assigned to those names.
summary_rows = [
    ("Normal Training (No Drift)", baseline_accuracy),   # main XGBoost model
    ("Zero-Day Attack Detection", zero_day_accuracy),
    ("Concept Drift (Before Retraining)", acc_before),
    ("Concept Drift (After Retraining)", acc_after),
]
summary = pd.DataFrame(summary_rows, columns=["Scenario", "Accuracy"])

print(summary)
