In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from scipy.stats import mode
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
file_path = "C://Users//JP//Desktop//csc//Year 2//sem 2//Machine Learning//iot//Scripts//RT_IOT2022.csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns if present
df = df.drop(columns=['Unnamed: 0']) if 'Unnamed: 0' in df.columns else df

# Encode categorical variables
categorical_columns = ['proto', 'service', 'Attack_type']  # Update if needed
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Extract Features and Labels
X = df.drop(columns=['Attack_type']).values  # Features
y = df['Attack_type'].values  # Target

# Undersample the majority class
undersample = RandomUnderSampler(sampling_strategy={max(set(y), key=list(y).count): 7000}, random_state=42)
X_under, y_under = undersample.fit_resample(X, y)

# Oversample minority classes
oversample = RandomOverSampler(sampling_strategy={cls: 5000 for cls in np.unique(y) if list(y).count(cls) < 5000}, random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X_under, y_under)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Define Autoencoder
input_dim = X_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(32, activation='relu')(encoded)
encoded = Dense(16, activation='relu')(encoded)

decoded = Dense(32, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(decoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train Autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=256, shuffle=True)

# Extract Encoded Features
encoder = Model(input_layer, encoded)
X_encoded = encoder.predict(X_scaled)

In [None]:
# Train K-Means on Encoded Features
optimal_k = 5  # Replace with best K found from Elbow Method
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=42, n_init=50)
kmeans.fit(X_encoded)

df['Cluster'] = kmeans.labels_


In [None]:
# Function to Map Clusters to True Labels
def assign_cluster_labels(true_labels, cluster_labels):
    labels = np.zeros_like(cluster_labels)
    for cluster in np.unique(cluster_labels):
        mask = cluster_labels == cluster
        true_label = mode(true_labels[mask])[0][0]  # Extract most frequent class
        labels[mask] = true_label
    return labels

# Evaluate Performance
sil_score = silhouette_score(X_encoded, kmeans.labels_)
ari = adjusted_rand_score(y_resampled, kmeans.labels_)
nmi = normalized_mutual_info_score(y_resampled, kmeans.labels_)

true_labels = y_resampled
predicted_labels = assign_cluster_labels(true_labels, df['Cluster'])
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Silhouette Score: {sil_score:.4f}")
print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Silhouette Score (K-Means Student Model): 0.3605
Accuracy: 0.2277
Precision: 0.1578
Recall: 0.2277
F1 Score: 0.1191


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Visualization of Clusters using PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_encoded)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['Cluster'], palette='viridis')
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("K-Means Clustering After Knowledge Distillation")
plt.legend(title="Cluster")
plt.show()
