<a href="https://colab.research.google.com/github/Munazza-Farees/NITW-SIP2025-Project/blob/main/AKN_FGD_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pywt
from sklearn.metrics import classification_report, roc_curve, f1_score, silhouette_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from scipy.stats import pearsonr
from collections import Counter
from sklearn.neighbors import NearestNeighbors

In [80]:
# Load dataset
URL = '/content/drive/MyDrive/Colab Notebooks/Client1 (copy).csv'
# URL = '/content/drive/MyDrive/Colab Notebooks/Client2 (copy).csv'
data = pd.read_csv(URL)



In [81]:
# Impute missing values
data.fillna(data.select_dtypes(include=np.number).median(), inplace=True)
data['Protocol'].fillna(data['Protocol'].mode()[0], inplace=True)
data['Flags'].fillna(data['Flags'].mode()[0], inplace=True)

# Convert types and encode categorical features
data = data.astype({'Time': 'float', 'Label': int})
data = pd.get_dummies(data, columns=['Protocol', 'Flags'], prefix=['Protocol', 'Flags'], dummy_na=False)
data = data.drop(columns=['Source', 'Destination'], errors='ignore')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Protocol'].fillna(data['Protocol'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Flags'].fillna(data['Flags'].mode()[0], inplace=True)


In [82]:
# Sequence Alignment with Smith-Waterman Algorithm
def smith_waterman(seq1, seq2, match_score=2, mismatch_score=-2, gap_penalty=-1):
    n, m = len(seq1), len(seq2)
    H = np.zeros((n+1, m+1))
    for i in range(1, n+1):
        for j in range(1, m+1):
            match = H[i-1, j-1] + (match_score if seq1[i-1] == seq2[j-1] else mismatch_score)
            delete = H[i-1, j] + gap_penalty
            insert = H[i, j-1] + gap_penalty
            H[i, j] = max(match, delete, insert, 0)
    return np.max(H)

# Construct benchmark sequences for RTO values
rto_values = [1, 2]
benchmark_sequences = {rto: np.array([1 if i % rto < 0.1 else 0 for i in np.arange(0, 60, 0.1)]) for rto in rto_values}

In [83]:
# Estimate DU duration using UDP packet counts
cw_duration = 60
data['CW_ID'] = ((data['Time'] - data['Time'].min()) // cw_duration).astype(int)
udp_data = data[data['Protocol_UDP'] == 1]
cw_udp_counts = udp_data.groupby('CW_ID')['Length'].count().reindex(range(data['CW_ID'].max() + 1), fill_value=0)
cw_udp_seq = cw_udp_counts.values

max_score = -np.inf
time_du = 1
for rto, seq in benchmark_sequences.items():
    score = smith_waterman(cw_udp_seq[:min(len(cw_udp_seq), len(seq))], seq[:min(len(cw_udp_seq), len(seq))])
    if score > max_score:
        max_score = score
        time_du = rto

# Create DUs based on estimated TimeDU
data['DU_ID'] = ((data['Time'] - data['Time'].min()) // time_du).astype(int)
print("Estimated DU Duration:", time_du, "seconds")
print("Packets per DU:\n", data.groupby('DU_ID').size().describe())

Estimated DU Duration: 2 seconds
Packets per DU:
 count    733.000000
mean      47.553888
std      131.489508
min        2.000000
25%        8.000000
50%       14.000000
75%       24.000000
max      733.000000
dtype: float64


In [84]:
# Feature Aggregation
data['TCP_Packets'] = data['Protocol_TCP'].astype(int)
data['Total_Packets'] = 1
data['Burstiness'] = data.groupby('DU_ID')['Packet_Rate'].transform(lambda x: x.max() / (x.mean() + 1e-10))

features_to_aggregate = [
    'Length', 'Inter_Arrival_Time', 'Connection_Duration', 'Packet_Rate',
    'Flow_Bytes_Per_Second', 'Flow_Packets_Per_Second', 'Forward_Packets',
    'Backward_Packets', 'Ratio_Fwd_Bwd', 'Entropy', 'Packet_Size_Variance',
    'Burstiness', 'TCP_Packets', 'Total_Packets'
]

# Ensure features exist in dataset
features_to_aggregate = [f for f in features_to_aggregate if f in data.columns]
agg_funcs = {col: ['mean', 'std', 'max', 'min'] for col in features_to_aggregate}
agg_funcs['Label'] = lambda x: 1 if (x == 1).mean() >= 0.05 else 0
agg_data = data.groupby('DU_ID').agg(agg_funcs)
agg_data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in agg_data.columns]
agg_data.reset_index(inplace=True)

In [85]:
# Compute wavelet packet entropy per DU
def compute_du_entropy(group):
    signal = group['Length'].values
    if len(signal) < 2:
        return 0
    try:
        wp = pywt.WaveletPacket(data=signal, wavelet='db1', mode='symmetric', maxlevel=3)
        energies = [np.sum(np.square(node.data)) for node in wp.get_level(3)]
        total_energy = np.sum(energies) + 1e-10
        probs = np.array(energies) / total_energy
        return -np.sum(probs * np.log2(probs + 1e-10))
    except:
        return 0

agg_data['Entropy_DU'] = data.groupby('DU_ID').apply(compute_du_entropy).reindex(agg_data.index).fillna(0).values

  agg_data['Entropy_DU'] = data.groupby('DU_ID').apply(compute_du_entropy).reindex(agg_data.index).fillna(0).values


In [86]:
# Feature Scaling
agg_data_features = agg_data.drop(columns=['DU_ID', 'Label_<lambda>'])
agg_data_features = agg_data_features.fillna(0)

for col in agg_data_features.columns:
    if agg_data_features[col].dtype == bool:
        agg_data_features[col] = agg_data_features[col].astype(int)

for col in agg_data_features.columns:
    p1, p99 = agg_data_features[col].quantile([0.01, 0.99])
    agg_data_features[col] = agg_data_features[col].clip(lower=p1, upper=p99)

scaler = RobustScaler()
X_scaled = scaler.fit_transform(agg_data_features)
X_scaled_data = pd.DataFrame(X_scaled, columns=agg_data_features.columns)
X_scaled_data['DU_ID'] = agg_data['DU_ID'].values
X_scaled_data['Label'] = agg_data['Label_<lambda>'].values
X_scaled_data['Entropy_DU'] = agg_data['Entropy_DU'].values
X_scaled_data = X_scaled_data.fillna(0)

In [87]:
# Compute Pearson correlation per DU
pearson_coeffs = []
for du_id in X_scaled_data['DU_ID'].unique():
    du_data = data[data['DU_ID'] == du_id]
    if len(du_data) > 1 and du_data['Total_Packets'].sum() > 0:
        corr, _ = pearsonr(du_data['TCP_Packets'], du_data['Total_Packets'])
        pearson_coeffs.append(corr if not np.isnan(corr) else 0)
    else:
        pearson_coeffs.append(0)
pearson_map = dict(zip(X_scaled_data['DU_ID'].unique(), pearson_coeffs))
X_scaled_data['Pearson_Corr'] = X_scaled_data['DU_ID'].map(pearson_map).fillna(0)

  corr, _ = pearsonr(du_data['TCP_Packets'], du_data['Total_Packets'])


In [88]:
# AKN: Calculate Neurons' Number (Algorithm 1)
def calculate_optimal_neurons(X, threshold_ratio=0.5):
    if len(X) < 2:
        return 1, X
    centers = [X[0]]
    max_dist = 0
    second = X[1]
    for row in X[1:]:
        dist = euclidean(row, centers[0])
        if dist > max_dist:
            max_dist = dist
            second = row
    centers.append(second)
    while True:
        min_dists = [min([euclidean(row, center) for center in centers]) for row in X]
        D_max = max(min_dists)
        if D_max > threshold_ratio * euclidean(centers[0], centers[1]):
            new_center = X[np.argmax(min_dists)]
            centers.append(new_center)
        else:
            break
    return len(centers), np.array(centers)

In [89]:
# AKN: Calculate Initial Weights (Algorithm 2)
def calculate_initial_weights(R, numcl):
    R_min = R.min(axis=0)
    R_max = R.max(axis=0)
    N = (R - R_min) / (R_max - R_min + 1e-10)
    C = N.mean(axis=0)
    dmax = np.max([euclidean(n, C) for n in N])
    X = np.zeros((numcl, R.shape[1]))
    for j in range(numcl):
        X[j, :] = C + np.random.uniform(-dmax, dmax, R.shape[1])
    return X

In [90]:
# AKN: Clustering (Algorithm 3)
def kohonen_clustering(X, initial_centers, num_epochs=100, initial_lr=0.5, initial_radius=0.5):
    weights = initial_centers.copy()
    num_neurons = len(weights)
    sigma = initial_radius * num_neurons
    sigma_decay = sigma / num_epochs
    lr = initial_lr
    for epoch in range(num_epochs):
        lr = initial_lr * np.exp(-epoch / num_epochs)
        sigma = max(0.1, sigma - sigma_decay * epoch)
        for x in X:
            distances = euclidean_distances([x], weights).flatten()
            winner_idx = np.argmin(distances)
            for j in range(num_neurons):
                dist_to_winner = np.abs(j - winner_idx)
                influence = np.exp(-dist_to_winner**2 / (2 * sigma**2 + 1e-10))
                weights[j] += lr * influence * (x - weights[j])
    assignments = [np.argmin(euclidean_distances([x], weights)) for x in X]
    return assignments, weights

In [92]:
# Apply AKN on original data
X_clustering = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr']).values
num_neurons, initial_centers = calculate_optimal_neurons(X_clustering, threshold_ratio=0.5)
initial_weights = calculate_initial_weights(X_clustering, num_neurons)
cluster_labels, final_weights = kohonen_clustering(X_clustering, initial_weights, num_epochs=100)
X_scaled_data['Cluster'] = cluster_labels
print("Clustering Silhouette Score:", silhouette_score(X_clustering, cluster_labels) if len(set(cluster_labels)) > 1 else "Single cluster detected")

Clustering Silhouette Score: 0.9875121874592766


In [93]:
# Compute MAD on original data
mean_C = (X_scaled_data['TCP_Packets_mean'] / (X_scaled_data['TCP_Packets_mean'] + 1e-10)).mean()
std_C = (X_scaled_data['TCP_Packets_mean'] / (X_scaled_data['TCP_Packets_mean'] + 1e-10)).std() or 1e-10
mean_H = X_scaled_data['Entropy_DU'].mean()
std_H = X_scaled_data['Entropy_DU'].std() or 1e-10
mean_P = X_scaled_data['Pearson_Corr'].mean()
std_P = X_scaled_data['Pearson_Corr'].std() or 1e-10

mad_scores = []
for _, row in X_scaled_data.iterrows():
    C = (row['TCP_Packets_mean'] / (row['TCP_Packets_mean'] + 1e-10) - mean_C) / std_C
    H = (row['Entropy_DU'] - mean_H) / std_H
    P = (row['Pearson_Corr'] - mean_P) / std_P
    mad = np.sqrt(0.4 * C**2 + 0.3 * H**2 + 0.3 * P**2)
    mad_scores.append(mad if not np.isnan(mad) else 0)
X_scaled_data['MAD'] = mad_scores
X_scaled_data['MAD'] = scaler.fit_transform(X_scaled_data[['MAD']]).flatten()
X_scaled_data['MAD'] = X_scaled_data['MAD'].fillna(0)

In [74]:
# Balance data for threshold optimization
# X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
# y_array = X_scaled_data['Label'].values
# try:
#     smote = SMOTE(random_state=42, k_neighbors=min(5, len(y_array[y_array == 1]) - 1))
#     X_resampled, y_resampled = smote.fit_resample(X_array, y_array)
# except ValueError:
#     X_resampled, y_resampled = X_array, y_array
# X_scaled_data_resampled = pd.DataFrame(X_resampled, columns=X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).columns)
# X_scaled_data_resampled['Label'] = y_resampled
# X_scaled_data_resampled['MAD'] = scaler.transform(X_scaled_data[['MAD']].iloc[:len(X_resampled)].values).flatten()

In [75]:
# Balance data for threshold optimization
X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
y_array = X_scaled_data['Label'].values
try:
    smote = SMOTE(random_state=42, k_neighbors=min(5, max(1, len(y_array[y_array == 1]) - 1)))
    X_resampled, y_resampled = smote.fit_resample(X_array, y_array)
except ValueError:
    X_resampled, y_resampled = X_array, y_array
X_scaled_data_resampled = pd.DataFrame(X_resampled, columns=X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).columns)
X_scaled_data_resampled['Label'] = y_resampled
# Ensure MAD values align with resampled data length
mad_values = X_scaled_data['MAD'].values
if len(X_resampled) > len(mad_values):
    # Pad with mean MAD if resampled data is longer
    mad_extended = np.pad(mad_values, (0, len(X_resampled) - len(mad_values)), mode='constant', constant_values=mad_values.mean())
else:
    mad_extended = mad_values[:len(X_resampled)]
X_scaled_data_resampled['MAD'] = scaler.transform(mad_extended.reshape(-1, 1)).flatten()



In [76]:
# Optimize Thresholds
fpr, tpr, thresholds = roc_curve(X_scaled_data_resampled['Label'], X_scaled_data_resampled['MAD'])
optimal_idx = np.argmax(tpr - fpr)
optimal_z = (thresholds[optimal_idx] - X_scaled_data_resampled['MAD'].mean()) / (X_scaled_data_resampled['MAD'].std() + 1e-10)
LMAD = X_scaled_data_resampled['MAD'].mean() + optimal_z * X_scaled_data_resampled['MAD'].std()
X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)

best_ladur = 0.55
best_f1 = 0
for ladur in [0.5, 0.55, 0.6]:
    cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
    abnormal_clusters = cluster_votes[cluster_votes >= ladur].index
    X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)
    f1 = f1_score(X_scaled_data['Label'], X_scaled_data['Predicted_Label'])
    if f1 > best_f1:
        best_f1 = f1
        best_ladur = ladur

In [77]:
# Final Detection
LMAD = X_scaled_data['MAD'].mean() + optimal_z * X_scaled_data['MAD'].std()
X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)
cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
abnormal_clusters = cluster_votes[cluster_votes >= best_ladur].index
X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)
X_scaled_data['Predicted_Label'] = X_scaled_data['Predicted_Label'].fillna(0)
print("Final Detection Evaluation:\n")
print(classification_report(X_scaled_data['Label'], X_scaled_data['Predicted_Label']))

Final Detection Evaluation:

              precision    recall  f1-score   support

           0       0.05      1.00      0.10        38
           1       0.00      0.00      0.00       695

    accuracy                           0.05       733
   macro avg       0.03      0.50      0.05       733
weighted avg       0.00      0.05      0.01       733



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [78]:
# Cross-validate Random Forest
# rf = RandomForestClassifier(random_state=42)
# scores = cross_val_score(rf, X_array, X_scaled_data['Label'], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='f1_weighted')
# print("Random Forest Cross-Validation F1 Scores:", scores, "Mean:", scores.mean())

rf = RandomForestClassifier(random_state=42)
X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
y_array = X_scaled_data['Label'].values
rf.fit(X_array, y_array)
y_pred_rf = rf.predict(X_array)
print("Random Forest Classification Report:\n")
print(classification_report(y_array, y_pred_rf))

Random Forest Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00       695

    accuracy                           1.00       733
   macro avg       1.00      1.00      1.00       733
weighted avg       1.00      1.00      1.00       733



In [96]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import SMOTE


# # Feature Selection using Random Forest
# rf = RandomForestClassifier(random_state=42)
# X_array = X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).values
# y_array = X_scaled_data['Label'].values
# rf.fit(X_array, y_array)
# feature_importances = pd.Series(rf.feature_importances_, index=X_scaled_data.drop(columns=['DU_ID', 'Label', 'Entropy_DU', 'Pearson_Corr', 'Cluster', 'MAD']).columns)
# top_features = feature_importances.nlargest(20).index
# X_array = X_scaled_data[top_features].values
# print("Top 20 Features:\n", top_features)

# # Split data for proper evaluation
# X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, test_size=0.2, stratify=y_array, random_state=42)

# # Balance training data
# try:
#     smote = SMOTE(random_state=42, k_neighbors=min(5, max(1, len(y_train[y_train == 1]) - 1)))
#     X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# except ValueError:
#     X_train_resampled, y_train_resampled = X_train, y_train

# # Optimize Thresholds for AKN-FGD
# X_scaled_data_resampled = pd.DataFrame(X_train_resampled, columns=top_features)
# X_scaled_data_resampled['Label'] = y_train_resampled
# mad_values = X_scaled_data['MAD'].values
# if len(X_train_resampled) > len(mad_values):
#     mad_extended = np.pad(mad_values, (0, len(X_train_resampled) - len(mad_values)), mode='constant', constant_values=mad_values.mean())
# else:
#     mad_extended = mad_values[:len(X_train_resampled)]
# X_scaled_data_resampled['MAD'] = scaler.transform(mad_extended.reshape(-1, 1)).flatten()
# # Extend or truncate Cluster values to match resampled data length
# cluster_values = X_scaled_data['Cluster'].values
# if len(X_train_resampled) > len(cluster_values):
#     cluster_extended = np.pad(cluster_values, (0, len(X_train_resampled) - len(cluster_values)), mode='constant', constant_values=np.median(cluster_values))
# else:
#     cluster_extended = cluster_values[:len(X_train_resampled)]
# X_scaled_data_resampled['Cluster'] = cluster_extended

# fpr, tpr, thresholds = roc_curve(X_scaled_data_resampled['Label'], X_scaled_data_resampled['MAD'])
# optimal_idx = np.argmax(tpr - fpr)
# optimal_z = (thresholds[optimal_idx] - X_scaled_data_resampled['MAD'].mean()) / (X_scaled_data_resampled['MAD'].std() + 1e-10)
# LMAD = X_scaled_data_resampled['MAD'].mean() + optimal_z * X_scaled_data_resampled['MAD'].std()
# X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)

# best_ladur = 0.55
# best_f1 = 0
# for ladur in [0.3, 0.4, 0.5, 0.55, 0.6, 0.7]:
#     cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
#     abnormal_clusters = cluster_votes[cluster_votes >= ladur].index
#     X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)
#     f1 = f1_score(X_scaled_data['Label'], X_scaled_data['Predicted_Label'])
#     if f1 > best_f1:
#         best_f1 = f1
#         best_ladur = ladur

# # Final Detection for AKN-FGD
# LMAD = X_scaled_data['MAD'].mean() + optimal_z * X_scaled_data['MAD'].std()
# X_scaled_data['Abnormal_DU'] = (X_scaled_data['MAD'] > LMAD).astype(int)
# cluster_votes = X_scaled_data.groupby('Cluster')['Abnormal_DU'].mean()
# abnormal_clusters = cluster_votes[cluster_votes >= best_ladur].index
# X_scaled_data['Predicted_Label'] = X_scaled_data['Cluster'].apply(lambda c: 1 if c in abnormal_clusters else 0)
# X_scaled_data['Predicted_Label'] = X_scaled_data['Predicted_Label'].fillna(0)
# print("Final Detection Evaluation (AKN-FGD):\n")
# print(classification_report(X_scaled_data['Label'], X_scaled_data['Predicted_Label']))

# # Random Forest Evaluation with Cross-Validation
# rf = RandomForestClassifier(max_depth=10, min_samples_split=5, random_state=42)
# scores = cross_val_score(rf, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='f1_weighted')
# print("Random Forest Cross-Validation F1 Scores:", scores, "Mean:", scores.mean())
# rf.fit(X_train, y_train)
# y_pred_rf = rf.predict(X_test)
# print("Random Forest Test Set Classification Report:\n")
# print(classification_report(y_test, y_pred_rf))

Top 20 Features:
 Index(['Connection_Duration_mean', 'Length_max', 'Backward_Packets_mean',
       'Length_mean', 'Packet_Rate_mean', 'Flow_Packets_Per_Second_mean',
       'Flow_Bytes_Per_Second_mean', 'Inter_Arrival_Time_std',
       'Ratio_Fwd_Bwd_max', 'Ratio_Fwd_Bwd_mean', 'Forward_Packets_max',
       'Connection_Duration_min', 'Flow_Bytes_Per_Second_max',
       'Backward_Packets_min', 'Inter_Arrival_Time_mean', 'Packet_Rate_max',
       'Flow_Packets_Per_Second_max', 'Flow_Packets_Per_Second_std',
       'Ratio_Fwd_Bwd_std', 'Packet_Rate_std'],
      dtype='object')
Final Detection Evaluation (AKN-FGD):

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.03      0.00      0.00       695

    accuracy                           0.00       733
   macro avg       0.01      0.00      0.00       733
weighted avg       0.02      0.00      0.00       733





Random Forest Cross-Validation F1 Scores: [0.99183237 1.         1.         1.         1.        ] Mean: 0.998366473535585
Random Forest Test Set Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00       139

    accuracy                           1.00       147
   macro avg       1.00      1.00      1.00       147
weighted avg       1.00      1.00      1.00       147



In [None]:
# # Lightweight Neural Network
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping
# from sklearn.model_selection import train_test_split

# model = Sequential([
#     Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
#     Dropout(0.2),
#     Dense(16, activation='relu'),
#     Dropout(0.2),
#     Dense(1, activation='sigmoid')
# ])
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# history = model.fit(X_train_resampled, y_train_resampled, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stopping], verbose=0)
# y_pred_nn = (model.predict(X_test) > 0.5).astype(int)
# print("Neural Network Test Set Classification Report:\n")
# print(classification_report(y_test, y_pred_nn))