<a href="https://colab.research.google.com/github/Ivyratermgwangqa/Lerato_Mgwangqa/blob/main/Group_2_Anomaly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the core libraries not pre-installed on Colab
!pip install xgboost
!pip install shap
!pip install imbalanced-learn  # Crucial for SMOTE
!pip install kaggle           # If you plan to download datasets directly from Kaggle



In [2]:
# Core Data Handling & Computation
import numpy as np
import pandas as pd
from scipy import stats

# Data Preprocessing & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, RocCurveDisplay)
from sklearn.pipeline import Pipeline

# Handling Class Imbalance (CRITICAL FOR THIS PROJECT)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as imblearn_make_pipeline

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Explainable AI (XAI)
import shap

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# System & Utilities
import time
import warnings
warnings.filterwarnings('ignore') # Suppress warnings for a cleaner output

print("All libraries imported successfully!")

All libraries imported successfully!


In [3]:
import tensorflow as tf
# Get the GPU device name.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

SystemError: GPU device not found

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dhoogla/cicdarknet2020")

print("Path to dataset files:", path)

# Task
Load the "CIC-Darknet2020" dataset downloaded from Kaggle into a pandas DataFrame and display the first 5 rows, data types, and descriptive statistics. The dataset is located at the path printed in the output of the cell with id "1W85HkQyswrH".

## Identify data files

### Subtask:
List the files in the downloaded directory to understand the dataset structure and identify the relevant data file(s).


**Reasoning**:
List the files in the downloaded directory to understand the dataset structure and identify the relevant data file(s).



In [None]:
import os

# List the files in the downloaded directory
files_in_directory = os.listdir(path)

# Print the list of files and directories
print("Files in the dataset directory:")
for file in files_in_directory:
    print(file)

## Load data

### Subtask:
Load the main data file into a pandas DataFrame.


**Reasoning**:
Load the parquet file into a pandas DataFrame.



In [None]:
import os

# Construct the full path to the parquet file
file_path = os.path.join(path, 'cicdarknet2020.parquet')

# Load the data into a pandas DataFrame
df = pd.read_parquet(file_path)

## Inspect data

### Subtask:
Display the first few rows and check the data types and basic statistics to get an initial understanding of the data.


**Reasoning**:
Display the first few rows, data types, and basic statistics of the DataFrame to understand its structure and content.



In [None]:
display(df.head())
display(df.info())
display(df.describe())

## Summary:

### Data Analysis Key Findings

*   The dataset is stored in a parquet file named `cicdarknet2020.parquet`.
*   The loaded DataFrame contains 103,121 entries and 79 columns.
*   The dataset includes numerical columns with various integer and float data types, and likely object type columns for the labels.
*   There are no missing values in any of the columns.
*   The dataset contains two label columns: 'Label' and 'Label.1'.

### Insights or Next Steps

*   Further analysis should investigate the meaning and relationship between the two label columns.
*   The presence of various numerical features suggests the need for scaling or normalization before using some machine learning algorithms.


In [None]:
# Simple test to verify XGBoost and SHAP work
print("Testing XGBoost...")
X, y = np.random.rand(100, 5), np.random.randint(0, 2, 100) # Dummy data
xgb_model = xgb.XGBClassifier(tree_method='gpu_hist') # Use GPU for training!
xgb_model.fit(X, y)
print("XGBoost model trained successfully!")

print("\nTesting SHAP...")
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X)
print(f"SHAP values calculated. Shape: {np.array(shap_values).shape}")
print("✅ Environment setup is complete and functional!")

In [None]:
# Assuming the label is in a column called 'Label'
# Check the distribution of the target variable ('Label')
print("Target Variable Distribution:")
label_counts = df['Label'].value_counts()
print(label_counts)

# Visualize the class distribution
plt.figure(figsize=(10, 5))
plt.bar(label_counts.index.astype(str), label_counts.values)
plt.title('Class Distribution in Sample Data')
plt.xlabel('Class Label')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# Check the ratio of benign to malicious traffic
malicious_ratio = (df['Label'] != 'Benign').sum() / len(df)  # Adjust 'Benign' as needed
print(f"\nRatio of malicious flows: {malicious_ratio:.4f}")

In [None]:
# Check for missing values in the entire DataFrame
print("Missing values per column:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0]) # Only show columns with missing values

# Visualize missing data (if any)
if missing_values.sum() > 0:
    plt.figure(figsize=(12, 6))
    sns.heatmap(df.isnull(), cbar=False, yticklabels=False)
    plt.title('Heatmap of Missing Data')
    plt.show()
else:
    print("No missing values found in the sampled data!")

In [None]:
# Example: Drop columns with all missing values
df_clean = df.dropna(axis=1, how='all')

# Example: Drop high-cardinality or identifier columns (adjust list as needed)
columns_to_drop = ['Src IP', 'Dst IP', 'Src Port', 'Dst Port', 'Timestamp']
df_clean = df_clean.drop(columns=[col for col in columns_to_drop if col in df_clean.columns])

print(f"Original shape: {df.shape}")
print(f"New shape after dropping columns: {df_clean.shape}")

In [None]:
# Fill numerical columns with median (more robust to outliers)
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
df_clean[numerical_cols] = df_clean[numerical_cols].fillna(df_clean[numerical_cols].median())

# Fill categorical columns with mode
categorical_cols = df_clean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

# Verify no missing values remain
print("Missing values after imputation:")
print(df_clean.isnull().sum().sum())

In [None]:
# Check for infinite values
print("Number of infinite values per column:")
# Select only numerical columns before applying isinf
numerical_cols = df_clean.select_dtypes(include=np.number).columns
infinite_values = df_clean[numerical_cols].apply(lambda x: np.isinf(x).sum())
print(infinite_values[infinite_values > 0])

# Replace infinite values with a large number (e.g., the maximum value in the column or a predefined large number)
# Here, we'll replace with the maximum value of each column to avoid arbitrary large numbers
for col in infinite_values[infinite_values > 0].index:
    max_val = df_clean[col][np.isfinite(df_clean[col])].max()
    df_clean[col] = df_clean[col].replace([np.inf, -np.inf], max_val)

# Verify no infinite values remain
print("\nNumber of infinite values after handling:")
print(df_clean[numerical_cols].apply(lambda x: np.isinf(x).sum()).sum())

In [None]:
# Perform Sanity Checks: Remove rows with illogical values

# Check for negative values in relevant numerical columns
print("Number of rows with negative values before cleaning:")
negative_duration = (df_clean['Flow Duration'] < 0).sum()
negative_fwd_packets = (df_clean['Total Fwd Packet'] < 0).sum()
negative_bwd_packets = (df_clean['Total Bwd packets'] < 0).sum()

print(f"  Flow Duration: {negative_duration}")
print(f"  Total Fwd Packet: {negative_fwd_packets}")
print(f"  Total Bwd packets: {negative_bwd_packets}")

# Remove rows with negative values in these columns
initial_rows = df_clean.shape[0]
df_clean = df_clean[df_clean['Flow Duration'] >= 0]
df_clean = df_clean[df_clean['Total Fwd Packet'] >= 0]
df_clean = df_clean[df_clean['Total Bwd packets'] >= 0]
rows_removed = initial_rows - df_clean.shape[0]

print(f"\nNumber of rows removed during sanity checks: {rows_removed}")
print(f"New DataFrame shape after sanity checks: {df_clean.shape}")

In [None]:
# Identify categorical columns
categorical_cols = df_clean.select_dtypes(include=['object', 'category']).columns
print("Categorical columns:")
print(categorical_cols)

# Display value counts for categorical columns to understand unique values
for col in categorical_cols:
    print(f"\nValue counts for '{col}':")
    print(df_clean[col].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'Label' column
df_clean['Label_Encoded'] = label_encoder.fit_transform(df_clean['Label'])

# Display the mapping of original labels to encoded numbers
print("Label Encoding Mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{label}: {i}")

# Quick dictionary mapping for reference
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("\nLabel Mapping Dictionary:", label_mapping)

# Verify the encoding by checking the value counts of the new encoded column
print("\nValue counts of the encoded target variable:")
print(df_clean['Label_Encoded'].value_counts())


In [None]:
from sklearn.preprocessing import RobustScaler

# Identify numerical columns, excluding the encoded target and original label
numerical_cols = df_clean.select_dtypes(include=np.number).columns.tolist()
# Remove the encoded target column from the list of numerical columns to scale
if 'Label_Encoded' in numerical_cols:
    numerical_cols.remove('Label_Encoded')

# Initialize RobustScaler
scaler = RobustScaler()

# Apply RobustScaler to the numerical columns
df_clean[numerical_cols] = scaler.fit_transform(df_clean[numerical_cols])

print("Numerical features have been scaled using RobustScaler.")
display(df_clean.head())

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
# We use the encoded 'Label_Encoded' column as the target
X = df_clean.drop(['Label', 'Label.1', 'Label_Encoded'], axis=1) # Drop original labels and the encoded one to avoid including it in features
y = df_clean['Label_Encoded']

# Split data into training and testing sets
# Using stratify=y to maintain the proportion of classes in both train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"Original dataset shape: {X.shape}, {y.shape}")
print(f"Training set shape before SMOTE: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print(f"\nTraining set shape after SMOTE: {X_train_res.shape}, {y_train_res.shape}")

# Check the distribution of the target variable after SMOTE
print("\nClass distribution in training set after SMOTE:")
print(pd.Series(y_train_res).value_counts())

In [None]:
import xgboost as xgb
import shap
import matplotlib.pyplot as plt

# Initialize and train an XGBoost model
# Use the resampled training data
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_), random_state=42, tree_method='gpu_hist')
xgb_model.fit(X_train_res, y_train_res)

print("XGBoost model trained successfully on resampled data.")

# Explain the model's predictions using SHAP
# Use the original training data for SHAP explanation (or a representative subset)
# Using X_train (before SMOTE) is generally recommended for SHAP explanations
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train)

print("SHAP values calculated.")

# Visualize the feature importance
# Use the original feature names from X_train
shap.summary_plot(shap_values, X_train, plot_type="bar", feature_names=X_train.columns)
# plt.title("SHAP Feature Importance") # Remove this line
# plt.show() # Remove this line

# You can also get a more detailed summary plot
# shap.summary_plot(shap_values, X_train)
# plt.title("SHAP Summary Plot")
# plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted') # Use weighted average for multi-class
recall = recall_score(y_test, y_pred, average='weighted')     # Use weighted average for multi-class
f1 = f1_score(y_test, y_pred, average='weighted')             # Use weighted average for multi-class
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

# For multi-class ROC AUC, we can calculate it per class or use macro/weighted average
# Let's calculate the weighted average ROC AUC
# Need prediction probabilities for roc_auc_score
y_pred_proba = xgb_model.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted') # Use 'ovr' for one-vs-rest strategy

print("Model Evaluation on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC AUC (Weighted): {roc_auc:.4f}")

print("\nConfusion Matrix:")
display(conf_matrix)

print("\nClassification Report:")
print(class_report)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

# Initialize and train a Random Forest model
# Use the resampled training data
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_res, y_train_res)

print("Random Forest model trained successfully on resampled data.")

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

# For multi-class ROC AUC, need prediction probabilities
y_pred_proba_rf = rf_model.predict_proba(X_test)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf, multi_class='ovr', average='weighted')

conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_)


print("\nRandom Forest Model Evaluation on Test Set:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-score: {f1_rf:.4f}")
print(f"ROC AUC (Weighted): {roc_auc_rf:.4f}")

print("\nConfusion Matrix (Random Forest):")
display(conf_matrix_rf)

print("\nClassification Report (Random Forest):")
print(class_report_rf)

In [None]:
# ========================================
# 7. Compare Results
# ========================================

# Collect the metrics for each model
xgb_results = [accuracy, precision, recall, f1, roc_auc]
rf_results = [accuracy_rf, precision_rf, recall_rf, f1_rf, roc_auc_rf]

metrics = pd.DataFrame([rf_results, xgb_results],
                       columns=["Accuracy","Precision","Recall","F1","ROC-AUC"],
                       index=["Random Forest", "XGBoost"])
print(metrics)

sns.heatmap(metrics, annot=True, cmap="Blues", fmt=".3f")
plt.title("Model Performance Comparison")
plt.show()

In [None]:
import shap

# Sample only 500 rows to make SHAP faster
X_sample = X_train.sample(500, random_state=42)

# Explain the Random Forest model's predictions
explainer_rf = shap.TreeExplainer(rf_model)
shap_values_rf = explainer_rf(X_sample)   # ✅ new API

print("SHAP values calculated for Random Forest model.")

# Feature importance (bar plot)
shap.summary_plot(shap_values_rf, X_sample, plot_type="bar", feature_names=X_sample.columns)

# Detailed summary plot (dot plot)
shap.summary_plot(shap_values_rf, X_sample)


In [None]:
# ========================================
# 8. Prototype Demo - Single Prediction
# ========================================

import shap
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Ensure consistent X_test for both models
try:
    if 'X_test' not in globals() or 'y_test' not in globals():
        print("Re-splitting data to ensure consistent X_test...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
except NameError as e:
    print(f"Error: {e}. Ensure X and y are defined from Step 4.")
    raise

# Ensure SHAP JavaScript is initialized (optional for matplotlib backend)
shap.initjs()

# Create a label encoder for decoding predictions (multi-class: 'Non-Tor', 'NonVPN', 'Tor', 'VPN')
label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(['Non-Tor', 'NonVPN', 'Tor', 'VPN'])  # Matches dataset labels

# Pick one random row from the test set
sample = X_test.iloc[[0]]  # Shape (1, 77)
print("Sample Flow Metadata:\n", sample)

# Predict with both models
try:
    rf_pred = rf_model.predict(sample)
    xgb_pred = xgb_model.predict(sample)
except NameError as e:
    print(f"Error: {e}. Ensure rf_model and xgb_model are defined from Steps 5-6.")
    raise

# Decode predictions to original class names
rf_pred_label = label_encoder.inverse_transform(rf_pred)[0]
xgb_pred_label = label_encoder.inverse_transform(xgb_pred)[0]

print("\nRandom Forest Prediction:", rf_pred_label)
print("XGBoost Prediction:", xgb_pred_label)


In [None]:
# ========================================
# 9. Export Results
# ========================================

import joblib
import shap
import matplotlib.pyplot as plt

# Save models
joblib.dump(rf_model, "random_forest_model.pkl")
joblib.dump(xgb_model, "xgboost_model.pkl")

# Save metrics table
metrics.to_csv("model_comparison_results.csv", index=True)

# Sample only 500 rows from X_test to make SHAP faster for the summary plot
X_test_sample = X_test.sample(500, random_state=42)

# Calculate SHAP values for XGBoost on the test sample
# Use the explainer created earlier for the XGBoost model
xgb_explainer = shap.TreeExplainer(xgb_model)
shap_values_xgb_test_sample = xgb_explainer(X_test_sample)

# Save SHAP summary plots
plt.figure()
# Use shap_values_rf from the previous cell
shap.summary_plot(shap_values_rf, X_sample, feature_names=X_sample.columns, show=False)
plt.savefig("rf_shap_summary.png", bbox_inches='tight')

plt.figure()
# Use the newly calculated SHAP values for XGBoost on the test sample
shap.summary_plot(shap_values_xgb_test_sample, X_test_sample, feature_names=X_test_sample.columns, show=False)
plt.savefig("xgb_shap_summary.png", bbox_inches='tight')