# Threat Clustering with Enhanced Dataset
This notebook performs unsupervised clustering of synthetic threat data based on signal strength, proximity, and geographic features.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Read the enhanced threat dataset from the repository's data directory.
df = pd.read_csv('../data/enhanced_threat_data.csv')

# Columns fed to the clustering step: two threat scores plus the coordinates.
feature_columns = [
    'proximity_score',
    'intel_signal_strength',
    'latitude',
    'longitude',
]
features = df[feature_columns]

# Standardize every feature (zero mean, unit variance) so that no single
# column dominates the distance computations purely because of its scale.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

## Determine Optimal Number of Clusters (Elbow Method)

In [None]:
# Fit KMeans once per candidate k and record the resulting inertia
# (within-cluster sum of squared distances) so the "elbow" -- the point where
# adding clusters stops paying off -- can be read from the curve.
inertia = []
K_range = range(1, 10)  # candidate cluster counts: 1 through 9
for k in K_range:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4 (and emits a FutureWarning on 1.2/1.3), so leaving it unset
    # makes the inertia curve depend on the installed library version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against k.
plt.figure(figsize=(8, 4))
plt.plot(K_range, inertia, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.tight_layout()
plt.show()

## Apply KMeans Clustering

In [None]:
# We'll assume k=4 for demonstration.
# n_init is pinned explicitly because scikit-learn's default changed from 10
# to 'auto' in 1.4; fixing it (together with random_state) keeps the cluster
# assignments reproducible across library versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Store the labels as strings so plotting libraries treat the cluster id as a
# categorical value rather than a continuous number.
df['cluster'] = kmeans.fit_predict(scaled_features).astype(str)

## Visualize Clusters on Geographic Map

In [None]:
# Scatter the records by coordinate: colour encodes the assigned cluster and
# marker shape encodes the threat type.
plt.figure(figsize=(8, 6))
geo_ax = sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='cluster',
    style='threat_type',
)
geo_ax.set_title('Threat Clusters by Geolocation')
geo_ax.set_xlabel('Longitude')
geo_ax.set_ylabel('Latitude')
plt.tight_layout()
plt.show()

## PCA Plot of Clusters in Reduced Dimensions

In [None]:
# Project the standardized features onto their first two principal components
# so the clusters can be inspected in a 2-D plot.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# Each row of the transposed result is one component; keep both on the frame.
df['PC1'], df['PC2'] = pca_result.T

# Same encoding as the geographic plot: colour = cluster, marker = threat type.
plt.figure(figsize=(8, 6))
pca_ax = sns.scatterplot(
    data=df,
    x='PC1',
    y='PC2',
    hue='cluster',
    style='threat_type',
)
pca_ax.set_title('PCA Projection of Threat Clusters')
plt.tight_layout()
plt.show()