# K-Means 

In [1]:
import sys
import os

# Make the parent of the working directory importable (presumably the project
# root that holds the `preprocessing` package imported further down - confirm).
# `__file__` is not defined inside a notebook kernel; the previous expression
# used the string literal "__file__", which resolved to this same
# parent-of-cwd path, so the location is unchanged.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
PROJECT_ROOT = os.path.dirname(os.getcwd())
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Start (or reuse) the Spark session for this notebook, then restrict Spark's
# own logging to ERROR so cell output stays readable.
spark = (
    SparkSession.builder
    .appName("KMeansClustering")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

## Načítanie predspracovaných dát

In [None]:
from preprocessing.preprocessing import preprocess_data

# preprocess_data() comes from the project's preprocessing module; its result is
# unpacked into a training and a test DataFrame.
df_train_model, df_test_model = preprocess_data()

# Column names and types of the training split.
print("\nDataset schema:")
df_train_model.printSchema()

# First five rows of the `features` column next to the severity column, untruncated.
print("\nSample records:")
preview = df_train_model.select(["features", "Accident_Severity"])
preview.show(n=5, truncate=False)


Dataset schema:
root
 |-- Location_Easting_OSGR: integer (nullable = true)
 |-- Location_Northing_OSGR: integer (nullable = true)
 |-- Police_Force: integer (nullable = true)
 |-- Accident_Severity: integer (nullable = true)
 |-- Number_of_Vehicles: integer (nullable = true)
 |-- Number_of_Casualties: integer (nullable = true)
 |-- Local_Authority_(District): integer (nullable = true)
 |-- 1st_Road_Number: integer (nullable = true)
 |-- Speed_limit: integer (nullable = true)
 |-- Junction_Detail: integer (nullable = true)
 |-- Junction_Control: integer (nullable = true)
 |-- 2nd_Road_Class: integer (nullable = true)
 |-- 2nd_Road_Number: integer (nullable = true)
 |-- Urban_or_Rural_Area: integer (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident: integer (nullable = true)
 |-- Vehicle_Reference: integer (nullable = true)
 |-- Casualty_Reference: integer (nullable = true)
 |-- Casualty_Severity: integer (nullable = true)
 |-- Casualty_Type: integer (nullable = true)
 |

## Hľadanie optimálneho počtu cluster-ov
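Na nájdenie optimálneho počtu zhlukov použijeme metódu lakťa (Elbow method): pre rôzne hodnoty k vypočítame súčet štvorcov chýb v rámci zhlukov (WSSSE – Within Set Sum of Squared Errors) a hľadáme hodnotu k, za ktorou sa pokles WSSSE výrazne spomalí.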

In [None]:
def compute_cost(df, k_values, features_col="features"):
    """Fit one K-Means model per candidate k and collect the training costs.

    Args:
        df: DataFrame handed to ``KMeans.fit`` for every candidate k.
        k_values: Iterable of cluster counts to try; processed in order.
        features_col: Name of the column passed to KMeans as ``featuresCol``.

    Returns:
        list: ``model.summary.trainingCost`` (reported as WSSSE) for each k,
        in the same order as ``k_values``.

    Side effects:
        Prints one ``k=..., WSSSE=...`` line per fitted model.
    """
    def _fit_and_report(num_clusters):
        # The seed is fixed at 42 for every candidate k.
        estimator = KMeans(k=num_clusters, seed=42, featuresCol=features_col)
        training_cost = estimator.fit(df).summary.trainingCost
        print(f"k={num_clusters}, WSSSE={training_cost:.4f}")
        return training_cost

    return [_fit_and_report(num_clusters) for num_clusters in k_values]

# Candidate cluster counts: k = 2 .. 10
k_values = list(range(2, 11))

# Fit one model per candidate k and record its WSSSE
print("Computing cost for different values of k...")
costs = compute_cost(df_train_model, k_values)

# Elbow curve: cost against k, with one tick per tested k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, costs, 'bo-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('WSSSE Cost')
ax.set_title('Elbow Method For Optimal k')
ax.grid(True)
ax.set_xticks(k_values)
plt.show()

## Trénovanie K-means modelu s optimálnym počtom cluster-ov

In [None]:
# Number of clusters for the final model. This is a manual choice: adjust it
# after inspecting the elbow curve.
optimal_k = 5

# Fit the final K-means model with the chosen k and seed 42.
kmeans = KMeans(featuresCol="features", k=optimal_k, seed=42)
model = kmeans.fit(df_train_model)

# Show only the leading five coordinates of every centroid to keep the output short.
centers = model.clusterCenters()
print(f"Cluster Centers for k={optimal_k}:")
for cluster_id, centroid in enumerate(centers):
    print(f"Cluster {cluster_id}: {centroid[:5]}...")

# Silhouette score of the cluster assignments on the training data,
# computed by ClusteringEvaluator with its default settings.
predictions = model.transform(df_train_model)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"\nSilhouette with squared euclidean distance = {silhouette:.4f}")

## Analýza cluster-ov

In [None]:
# Run the fitted model over the training rows; the "prediction" column used
# below holds the cluster id assigned to each row.
clustered_data = model.transform(df_train_model)

# Row counts for every (cluster, severity) combination.
severity_by_cluster = (
    clustered_data
    .groupBy("prediction", "Accident_Severity")
    .count()
    .orderBy("prediction", "Accident_Severity")
)
print("Distribution of Accident Severity by Cluster:")
severity_by_cluster.show()

# Per-cluster mean severity and number of rows.
print("Cluster Summary Statistics:")
aggregations = {
    "Accident_Severity": "mean",
    "*": "count",
}
cluster_stats = (
    clustered_data
    .groupBy("prediction")
    .agg(aggregations)
    .orderBy("prediction")
    # The dict-style agg() yields auto-generated column names; rename them
    # to something readable before display.
    .withColumnRenamed("avg(Accident_Severity)", "avg_severity")
    .withColumnRenamed("count(1)", "cluster_size")
)
cluster_stats.show()

## Vizualizácia cluster-ov

In [None]:
from pyspark.ml.feature import PCA

# Project the feature vectors onto two principal components so they can be drawn.
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(clustered_data)
result = pca_model.transform(clustered_data)

# Bring only the columns needed for plotting into a pandas frame.
plot_columns = ["pca_features", "prediction", "Accident_Severity"]
pandas_df = result.select(plot_columns).toPandas()

# Split the two-component PCA vector into two plain float columns.
pandas_df["pca1"] = pandas_df["pca_features"].map(lambda vec: float(vec[0]))
pandas_df["pca2"] = pandas_df["pca_features"].map(lambda vec: float(vec[1]))

# Figure 1: one scatter series per cluster id, so each gets its own legend entry.
fig, ax = plt.subplots(figsize=(10, 8))
for cluster in range(optimal_k):
    in_cluster = pandas_df.loc[pandas_df["prediction"] == cluster]
    ax.scatter(in_cluster["pca1"], in_cluster["pca2"], label=f"Cluster {cluster}", alpha=0.5)

ax.set_title(f"K-Means Clustering with k={optimal_k}")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.legend()
ax.grid(True)
plt.show()

# Figure 2: the same projection, coloured by accident severity instead of cluster.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    pandas_df["pca1"],
    pandas_df["pca2"],
    c=pandas_df["Accident_Severity"],
    cmap="viridis",
    alpha=0.5,
)
fig.colorbar(scatter, ax=ax, label="Accident Severity")
ax.set_title("Clusters by Accident Severity")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.grid(True)
plt.show()

## Uloženie modelu a výsledkov

In [None]:
# Create a directory for models if it doesn't exist
import os
models_dir = os.path.join("..", "models")
os.makedirs(models_dir, exist_ok=True)

# Save the K-means model.
# overwrite() makes this cell re-runnable: a plain model.save(path) raises once
# the target path exists, so a second "Run All" would fail at this point.
# NOTE(review): os.makedirs acts on the local filesystem while Spark resolves
# the save path against its own default filesystem - these match in local mode;
# confirm before running against a cluster.
model_path = os.path.join(models_dir, "kmeans_model")
model.write().overwrite().save(model_path)
print(f"Model saved to {model_path}")

# Save a summary of the clusters
cluster_summary = clustered_data.groupBy("prediction").count().orderBy("prediction")
print("\nCluster sizes:")
cluster_summary.show()

# Export a sample (fraction 0.01) of the clustered data for inspection.
# The seed is fixed so repeated runs over the same input draw the same rows
# instead of a different random subset each time.
sample_path = os.path.join(models_dir, "kmeans_sample_results.csv")
sample_pdf = (
    clustered_data
    .select("prediction", "Accident_Severity")
    .sample(withReplacement=False, fraction=0.01, seed=42)
    .toPandas()
)
sample_pdf.to_csv(sample_path, index=False)
print(f"Sample results saved to {sample_path}")