<a href="https://colab.research.google.com/github/Munazza-Farees/NITW-SIP2025-Project/blob/main/AKN_FGD_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [171]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pywt
from sklearn.metrics import classification_report, roc_curve, f1_score, silhouette_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from scipy.spatial.distance import euclidean, cdist
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from scipy.stats import pearsonr
from collections import Counter
from sklearn.neighbors import NearestNeighbors
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

import kagglehub

# Download latest version
# path = kagglehub.dataset_download("aikenkazin/ddos-sdn-dataset")

# print("Path to dataset files:", path)

In [172]:
# Load dataset
# URL = '/content/drive/MyDrive/Colab Notebooks/Client1 (copy).csv'
# URL = '/content/drive/MyDrive/Colab Notebooks/Client2 (copy).csv'
# URL = '/content/drive/MyDrive/Colab Notebooks/Combined datasets.csv'
URL = '/content/drive/MyDrive/Colab Notebooks/Final_data.csv'
data = pd.read_csv(URL)



In [173]:
# Impute missing values
data.fillna(data.select_dtypes(include=np.number).median(), inplace=True)
data['Protocol'].fillna(data['Protocol'].mode()[0], inplace=True)
data['Flags'].fillna(data['Flags'].mode()[0], inplace=True)

# Convert types and encode categorical features
data = data.astype({'Time': 'float', 'Label': int})
data = pd.get_dummies(data, columns=['Protocol', 'Flags'], prefix=['Protocol', 'Flags'], dummy_na=False)
data = data.drop(columns=['Source', 'Destination'], errors='ignore')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Protocol'].fillna(data['Protocol'].mode()[0], inplace=True)


KeyError: 'Flags'

In [None]:
# Sequence Alignment with Smith-Waterman Algorithm
def smith_waterman(seq1, seq2, match_score=2, mismatch_score=-2, gap_penalty=-1):
    n, m = len(seq1), len(seq2)
    H = np.zeros((n+1, m+1))
    for i in range(1, n+1):
        for j in range(1, m+1):
            match = H[i-1, j-1] + (match_score if seq1[i-1] == seq2[j-1] else mismatch_score)
            delete = H[i-1, j] + gap_penalty
            insert = H[i, j-1] + gap_penalty
            H[i, j] = max(match, delete, insert, 0)
    return np.max(H)

# Construct benchmark sequences for RTO values
rto_values = [1, 2]
benchmark_sequences = {rto: np.array([1 if i % rto < 0.1 else 0 for i in np.arange(0, 60, 0.1)]) for rto in rto_values}

In [None]:
# Estimate DU duration using UDP packet counts
cw_duration = 60
data['CW_ID'] = ((data['Time'] - data['Time'].min()) // cw_duration).astype(int)
udp_data = data[data['Protocol_UDP'] == 1]
cw_udp_counts = udp_data.groupby('CW_ID')['Length'].count().reindex(range(data['CW_ID'].max() + 1), fill_value=0)
cw_udp_seq = cw_udp_counts.values

max_score = -np.inf
time_du = 1
for rto, seq in benchmark_sequences.items():
    score = smith_waterman(cw_udp_seq[:min(len(cw_udp_seq), len(seq))], seq[:min(len(cw_udp_seq), len(seq))])
    if score > max_score:
        max_score = score
        time_du = rto

# Create DUs based on estimated TimeDU
data['DU_ID'] = ((data['Time'] - data['Time'].min()) // time_du).astype(int)
print("Estimated DU Duration:", time_du, "seconds")
print("Packets per DU:\n", data.groupby('DU_ID').size().describe())

In [None]:
# Feature Aggregation
data['TCP_Packets'] = data['Protocol_TCP'].astype(int)
data['Total_Packets'] = 1
data['Burstiness'] = data.groupby('DU_ID')['Packet_Rate'].transform(lambda x: x.max() / (x.mean() + 1e-10))

features_to_aggregate = [
    'Length', 'Inter_Arrival_Time', 'Connection_Duration', 'Packet_Rate',
    'Flow_Bytes_Per_Second', 'Flow_Packets_Per_Second', 'Forward_Packets',
    'Backward_Packets', 'Ratio_Fwd_Bwd', 'Entropy', 'Packet_Size_Variance',
    'Burstiness', 'TCP_Packets', 'Total_Packets'
]

# Ensure features exist in dataset
features_to_aggregate = [f for f in features_to_aggregate if f in data.columns]
agg_funcs = {col: ['mean', 'std', 'max', 'min'] for col in features_to_aggregate}

# Add explicit std-based features for:
std_features = ['Length', 'Inter_Arrival_Time', 'Burstiness', 'Forward_Packets']
for feature in std_features:
    if feature in data.columns:
        agg_funcs[feature] = agg_funcs.get(feature, []) + ['std']

agg_funcs['Label'] = lambda x: 1 if (x == 1).mean() >= 0.05 else 0
agg_data = data.groupby('DU_ID').agg(agg_funcs)
agg_data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in agg_data.columns]
agg_data.reset_index(inplace=True)

In [None]:
# Compute wavelet packet entropy per DU
def compute_du_entropy(group):
    signal = group['Length'].values
    if len(signal) < 2:
        return 0
    try:
        wp = pywt.WaveletPacket(data=signal, wavelet='db1', mode='symmetric', maxlevel=3)
        energies = [np.sum(np.square(node.data)) for node in wp.get_level(3)]
        total_energy = np.sum(energies) + 1e-10
        probs = np.array(energies) / total_energy
        return -np.sum(probs * np.log2(probs + 1e-10))
    except:
        return 0

agg_data['Entropy_DU'] = data.groupby('DU_ID').apply(compute_du_entropy).reindex(agg_data.index).fillna(0).values

In [None]:
# Feature Scaling
agg_data_features = agg_data.drop(columns=['DU_ID', 'Label_<lambda>'])
agg_data_features = agg_data_features.fillna(0)

for col in agg_data_features.columns:
    # Check if the column is a pandas Series before accessing dtype
    col_data = agg_data_features[col]
    if isinstance(col_data, pd.Series):
        if col_data.dtype == bool:
            agg_data_features[col] = col_data.astype(int)
    else:
        continue
        # print(f"Warning: agg_data_features[{col}] is not a Series. Type: {type(col_data)}")


for col in agg_data_features.columns:
    p1, p99 = agg_data_features[col].quantile([0.01, 0.99])
    col_data = agg_data_features[col]
    if isinstance(col_data, pd.Series):
        agg_data_features[col] = agg_data_features[col].clip(p1, p99)
    else:
        continue
        # print(f"Warning: agg_data_features[{col}] is not a Series. Type: {type(col_data)}")

scaler = RobustScaler()
X_scaled = scaler.fit_transform(agg_data_features)
X_scaled_data = pd.DataFrame(X_scaled, columns=agg_data_features.columns)
X_scaled_data['DU_ID'] = agg_data['DU_ID'].values
X_scaled_data['Label'] = agg_data['Label_<lambda>'].values
X_scaled_data['Entropy_DU'] = agg_data['Entropy_DU'].values
X_scaled_data = X_scaled_data.fillna(0)
# print(X_scaled_data['TCP_Packets_mean'].value_counts())

In [None]:
# Compute Pearson correlation per DU
pearson_coeffs = []
for du_id in X_scaled_data['DU_ID'].unique():
    du_data = data[data['DU_ID'] == du_id]
    if len(du_data) > 1 and du_data['Total_Packets'].sum() > 0:
        corr, _ = pearsonr(du_data['TCP_Packets'], du_data['Total_Packets'])
        pearson_coeffs.append(corr if not np.isnan(corr) else 0)
    else:
        pearson_coeffs.append(0)
pearson_map = dict(zip(X_scaled_data['DU_ID'].unique(), pearson_coeffs))
X_scaled_data['Pearson_Corr'] = X_scaled_data['DU_ID'].map(pearson_map).fillna(0)

In [None]:
# AKN: Calculate Neurons' Number (Algorithm 1)
def calculate_optimal_neurons(X, theta=0.5):
    if len(X) < 2:
        return 1, X
    C = [X[0]]
    second = X[1]
    for i in range(2, len(X)):
        if euclidean(X[i], C[0]) > euclidean(second, C[0]):
            second = X[i]
    C.append(second)

    num_cl = 2

    while True:
        # Step 9–13: For each sample, compute distance to nearest center
        D = np.min(cdist(X, np.array(C)), axis=1)
        # Step 14: Find max of D
        D_max = np.max(D)
        id_D = np.argmax(D)
        # Step 15: Check if D_max is large enough to add a new center

        dist_C1C2 = euclidean(C[1], C[0])
        if D_max > theta * dist_C1C2:
            C.append(X[id_D])
            num_cl += 1
        else:
            break
    return num_cl, np.array(C)

In [None]:
# AKN: Calculate Initial Weights (Algorithm 2)
def calculate_initial_weights(X_raw, num_cl):
    R = np.array(X_raw)
    n, m = R.shape

    # Step 1–5: Normalize R into N
    N = np.zeros_like(R)
    for j in range(m):
        col = R[:, j]
        col_min = col.min()
        col_max = col.max()
        if col_max - col_min == 0:
            N[:, j] = 0
        else:
            N[:, j] = (col - col_min) / (col_max - col_min)

    # Step 6: Calculate Center Vector (mean of N)
    C = np.mean(N, axis=0)

    # Step 7: Calculate d_max = max distance from any point to center
    distances = cdist(N, [C])
    d_max = np.max(distances)

    # Step 8–12: Initialize weights
    np.random.seed(42)  # for reproducibility
    X_init = np.zeros((num_cl, m))

    for j in range(num_cl):
        # For each weight, add or subtract a small random delta from C
        random_shifts = (np.random.rand(m) - 0.5) * 2 * d_max  # in range [-d_max, +d_max]
        X_init[j] = C + random_shifts

    return X_init

In [None]:
def kohonen_clustering(X, weights, learning_radius=0.3, initial_eta=0.5, num_epochs=100, sigma=1.0):
    n, m = X.shape
    num_cl = weights.shape[0]
    w = weights.copy()

    for t in range(1, num_epochs + 1):
        for i in range(n):
            x = X[i]
            distances = np.linalg.norm(w - x, axis=1)
            winner_idx = np.argmin(distances)
            winner = w[winner_idx]

            neighbor_mask = np.linalg.norm(X - winner, axis=1) < learning_radius
            neighbors = X[neighbor_mask]

            if len(neighbors) == 0:
                continue

            S = np.linalg.norm(neighbors - winner, axis=1)
            T = np.exp(-np.square(S) / (2 * sigma ** 2))
            eta_t = initial_eta * np.exp(-t / num_epochs)

            for k, neighbor in enumerate(neighbors):
                neuron_idx = np.random.randint(0, num_cl)
                influence = T[k]
                delta = eta_t * influence * (neighbor - w[neuron_idx])
                w[neuron_idx] += delta

    cluster_indices = np.argmin(cdist(X, w), axis=1)
    return cluster_indices, w

In [None]:
# Prepare data for clustering
feature_cols = [col for col in X_scaled_data.columns if col not in ['DU_ID', 'Label']]
X_clustering = X_scaled_data[feature_cols].values

# Step 1: Calculate optimal neurons
num_neurons, initial_centers = calculate_optimal_neurons(X_clustering, theta=0.3)

# Step 2: Initialize weights
initial_weights = calculate_initial_weights(X_clustering, num_neurons)
initial_weights = initial_centers.copy()

# Step 3: Apply AKN clustering
cluster_labels, final_weights = kohonen_clustering(
    X_clustering, initial_weights, num_epochs=100,
    learning_radius=0.1, initial_eta=0.5
)

# Save results
X_scaled_data['Cluster'] = cluster_labels

# Evaluate clustering
if len(set(cluster_labels)) > 1:
    print("✅ Clustering Silhouette Score:", silhouette_score(X_clustering, cluster_labels))
else:
    print("⚠️ Only one cluster formed — check theta or neuron spread.")


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

X_tsne = TSNE(n_components=2, random_state=42).fit_transform(X_clustering)

plt.figure(figsize=(8, 6))
plt.title("t-SNE of DU Features")
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=X_scaled_data['Label'], cmap='coolwarm', alpha=0.7)
plt.xlabel("TSNE-1")
plt.ylabel("TSNE-2")
plt.colorbar(label="Label (0 = Normal, 1 = Attack)")
plt.show()

In [None]:
import seaborn as sns
import pandas as pd

# Cross tab: Cluster vs True Label
cluster_purity = pd.crosstab(X_scaled_data['Cluster'], X_scaled_data['Label'])
print(cluster_purity)

# Optional: Normalize row-wise to see %
cluster_purity_norm = cluster_purity.div(cluster_purity.sum(axis=1), axis=0)
sns.heatmap(cluster_purity_norm, annot=True, cmap='coolwarm')
plt.title("Cluster Purity by Label")
plt.xlabel("True Label")
plt.ylabel("Cluster")
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kmeans = KMeans(n_clusters=num_neurons, random_state=42).fit(X_clustering)
X_scaled_data['Cluster_KMeans'] = kmeans.labels_

score = silhouette_score(X_clustering, kmeans.labels_)
print(f"KMeans Silhouette Score: {score:.4f}")

In [None]:
# Compute MAD on original data
mean_C = (X_scaled_data['TCP_Packets_mean'] / (X_scaled_data['TCP_Packets_mean'] + 1e-10)).mean()
std_C = (X_scaled_data['TCP_Packets_mean'] / (X_scaled_data['TCP_Packets_mean'] + 1e-10)).std() or 1e-10

mean_H = X_scaled_data['Entropy_DU'].mean()
std_H = X_scaled_data['Entropy_DU'].std() or 1e-10

mean_P = X_scaled_data['Pearson_Corr'].mean()
std_P = X_scaled_data['Pearson_Corr'].std() or 1e-10

mean_B = X_scaled_data['Burstiness_mean'].mean()
std_B = X_scaled_data['Burstiness_mean'].std()


mad_scores = []
for _, row in X_scaled_data.iterrows():
    # C = (row['TCP_Packets_mean'] / (row['TCP_Packets_mean'] + 1e-10) - mean_C) / std_C
    C = (row['Burstiness_mean'] - mean_B) / std_B
    H = (row['Entropy_DU'] - mean_H) / std_H
    P = (row['Pearson_Corr'] - mean_P) / std_P
    mad = np.sqrt(0.4 * C**2 + 0.3 * H**2 + 0.3 * P**2)
    # mad = (np.abs(C) + np.abs(H) + np.abs(P)) / 3
    mad_scores.append(mad if not np.isnan(mad) else 0)
X_scaled_data['MAD'] = mad_scores
X_scaled_data['MAD'] = scaler.fit_transform(X_scaled_data[['MAD']]).flatten()
X_scaled_data['MAD'] = X_scaled_data['MAD'].fillna(0)

In [None]:
# Balance data for threshold optimization
# X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
# y_array = X_scaled_data['Label'].values
# try:
#     smote = SMOTE(random_state=42, k_neighbors=min(5, max(1, len(y_array[y_array == 1]) - 1)))
#     X_resampled, y_resampled = smote.fit_resample(X_array, y_array)
# except ValueError:
#     X_resampled, y_resampled = X_array, y_array
# X_scaled_data_resampled = pd.DataFrame(X_resampled, columns=X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).columns)
# X_scaled_data_resampled['Label'] = y_resampled
# # Ensure MAD values align with resampled data length
# mad_values = X_scaled_data['MAD'].values
# if len(X_resampled) > len(mad_values):
#     # Pad with mean MAD if resampled data is longer
#     mad_extended = np.pad(mad_values, (0, len(X_resampled) - len(mad_values)), mode='constant', constant_values=mad_values.mean())
# else:
#     mad_extended = mad_values[:len(X_resampled)]
# X_scaled_data_resampled['MAD'] = scaler.transform(mad_extended.reshape(-1, 1)).flatten()

In [None]:
# Optimize Thresholds
fpr, tpr, thresholds = roc_curve(X_scaled_data['Label'], X_scaled_data['MAD'])
optimal_idx = np.argmax(tpr - fpr)
optimal_z = (thresholds[optimal_idx] - X_scaled_data['MAD'].mean()) / (X_scaled_data['MAD'].std() + 1e-10)
# LMAD = X_scaled_data_resampled['MAD'].mean() + optimal_z * X_scaled_data_resampled['MAD'].std()
LMAD = np.percentile(X_scaled_data['MAD'], 90)
X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)

best_ladur = 0.55
best_f1 = 0
for ladur in [0.5, 0.55, 0.6]:
    cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
    abnormal_clusters = cluster_votes[cluster_votes >= ladur].index
    X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)
    f1 = f1_score(X_scaled_data['Label'], X_scaled_data['Predicted_Label'])
    if f1 > best_f1:
        best_f1 = f1
        best_ladur = ladur

In [None]:
# Final Detection
LMAD = X_scaled_data['MAD'].mean() + optimal_z * X_scaled_data['MAD'].std()

X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)

cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
cluster_to_label = X_scaled_data.groupby('Cluster')['Label'].agg(lambda x: x.mode()[0])
X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].map(cluster_to_label)

abnormal_clusters = cluster_votes[cluster_votes >= best_ladur].index

X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)

X_scaled_data['Predicted_Label'] = X_scaled_data['Predicted_Label'].fillna(0)

# print("Confusion matrix:\n")
# print(confusion_matrix(X_scaled_data['Label'], X_scaled_data['Predicted_Label']))

print("Final Detection Evaluation:\n")
print(classification_report(X_scaled_data['Label'], X_scaled_data['Predicted_Label']))

In [None]:
# Cross-validate Random Forest
# rf = RandomForestClassifier(random_state=42)
# scores = cross_val_score(rf, X_array, X_scaled_data['Label'], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='f1_weighted')
# print("Random Forest Cross-Validation F1 Scores:", scores, "Mean:", scores.mean())

rf = RandomForestClassifier(random_state=42)
X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
y_array = X_scaled_data['Label'].values
rf.fit(X_array, y_array)
y_pred_rf = rf.predict(X_array)
print("Random Forest Classification Report:\n")
print(classification_report(y_array, y_pred_rf))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import SMOTE


# # Feature Selection using Random Forest
# rf = RandomForestClassifier(random_state=42)
# X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
# y_array = X_scaled_data['Label'].values
# rf.fit(X_array, y_array)
# feature_importances = pd.Series(rf.feature_importances_, index=X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).columns)
# top_features = feature_importances.nlargest(20).index
# X_array = X_scaled_data[top_features].values
# print("Top 20 Features:\n", top_features)

# # Split data for proper evaluation
# X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, test_size=0.2, stratify=y_array, random_state=42)

# # Balance training data
# try:
#     smote = SMOTE(random_state=42, k_neighbors=min(5, max(1, len(y_train[y_train == 1]) - 1)))
#     X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# except ValueError:
#     X_train_resampled, y_train_resampled = X_train, y_train

# # Optimize Thresholds for AKN-FGD
# X_scaled_data_resampled = pd.DataFrame(X_train_resampled, columns=top_features)
# X_scaled_data_resampled['Label'] = y_train_resampled
# mad_values = X_scaled_data['MAD'].values
# if len(X_train_resampled) > len(mad_values):
#     mad_extended = np.pad(mad_values, (0, len(X_train_resampled) - len(mad_values)), mode='constant', constant_values=mad_values.mean())
# else:
#     mad_extended = mad_values[:len(X_train_resampled)]
# X_scaled_data_resampled['MAD'] = scaler.transform(mad_extended.reshape(-1, 1)).flatten()
# # Extend or truncate Cluster values to match resampled data length
# cluster_values = X_scaled_data['Cluster'].values
# if len(X_train_resampled) > len(cluster_values):
#     cluster_extended = np.pad(cluster_values, (0, len(X_train_resampled) - len(cluster_values)), mode='constant', constant_values=np.median(cluster_values))
# else:
#     cluster_extended = cluster_values[:len(X_train_resampled)]
# X_scaled_data_resampled['Cluster'] = cluster_extended

# fpr, tpr, thresholds = roc_curve(X_scaled_data_resampled['Label'], X_scaled_data_resampled['MAD'])
# optimal_idx = np.argmax(tpr - fpr)
# optimal_z = (thresholds[optimal_idx] - X_scaled_data_resampled['MAD'].mean()) / (X_scaled_data_resampled['MAD'].std() + 1e-10)
# LMAD = X_scaled_data_resampled['MAD'].mean() + optimal_z * X_scaled_data_resampled['MAD'].std()
# X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)

# best_ladur = 0.55
# best_f1 = 0
# for ladur in [0.3, 0.4, 0.5, 0.55, 0.6, 0.7]:
#     cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
#     abnormal_clusters = cluster_votes[cluster_votes >= ladur].index
#     X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)
#     f1 = f1_score(X_scaled_data['Label'], X_scaled_data['Predicted_Label'])
#     if f1 > best_f1:
#         best_f1 = f1
#         best_ladur = ladur

# # Final Detection for AKN-FGD
# LMAD = X_scaled_data['MAD'].mean() + optimal_z * X_scaled_data['MAD'].std()
# X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)
# cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
# abnormal_clusters = cluster_votes[cluster_votes >= best_ladur].index
# X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)
# X_scaled_data['Predicted_Label'] = X_scaled_data['Predicted_Label'].fillna(0)
# print("Final Detection Evaluation (AKN-FGD):\n")
# print(classification_report(X_scaled_data['Label'], X_scaled_data['Predicted_Label']))

# # Random Forest Evaluation with Cross-Validation
# rf = RandomForestClassifier(max_depth=10, min_samples_split=5, random_state=42)
# scores = cross_val_score(rf, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='f1_weighted')
# print("Random Forest Cross-Validation F1 Scores:", scores, "Mean:", scores.mean())
# rf.fit(X_train, y_train)
# y_pred_rf = rf.predict(X_test)
# print("Random Forest Test Set Classification Report:\n")
# print(classification_report(y_test, y_pred_rf))

In [None]:
# # Lightweight Neural Network
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping
# from sklearn.model_selection import train_test_split

# model = Sequential([
#     Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
#     Dropout(0.2),
#     Dense(16, activation='relu'),
#     Dropout(0.2),
#     Dense(1, activation='sigmoid')
# ])
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# history = model.fit(X_train_resampled, y_train_resampled, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stopping], verbose=0)
# y_pred_nn = (model.predict(X_test) > 0.5).astype(int)
# print("Neural Network Test Set Classification Report:\n")
# print(classification_report(y_test, y_pred_nn))

In [None]:
# Balance data for threshold optimization
# X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
# y_array = X_scaled_data['Label'].values
# try:
#     smote = SMOTE(random_state=42, k_neighbors=min(5, len(y_array[y_array == 1]) - 1))
#     X_resampled, y_resampled = smote.fit_resample(X_array, y_array)
# except ValueError:
#     X_resampled, y_resampled = X_array, y_array
# X_scaled_data_resampled = pd.DataFrame(X_resampled, columns=X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).columns)
# X_scaled_data_resampled['Label'] = y_resampled
# X_scaled_data_resampled['MAD'] = scaler.transform(X_scaled_data[['MAD']].iloc[:len(X_resampled)].values).flatten()