# K-Means Clustering with Silhouette Score
This notebook demonstrates how to apply K-Means clustering to a dataset and evaluate the results using the Silhouette Score.
We will use a dataset from Kaggle and analyze the clusters.

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

## Step 1: Load the Dataset

In [62]:
df = pd.read_csv("driver-data.csv")

df.head()

Unnamed: 0,id,mean_dist_day,mean_over_speed_perc
0,3423311935,71.24,28
1,3423313212,52.53,25
2,3423313724,64.54,27
3,3423311373,55.69,22
4,3423310999,54.58,25


## Step 2: Preprocess the Data

In [63]:
df

Unnamed: 0,id,mean_dist_day,mean_over_speed_perc
0,3423311935,71.24,28
1,3423313212,52.53,25
2,3423313724,64.54,27
3,3423311373,55.69,22
4,3423310999,54.58,25
...,...,...,...
3995,3423310685,160.04,10
3996,3423312600,176.17,5
3997,3423312921,170.91,12
3998,3423313630,176.14,5


In [None]:
x = df.iloc[:, [1, 2]]

In [None]:
# df_scaled = df.drop('id')

KeyError: "['id'] not found in axis"

## Step 3: Apply K-Means Clustering

In [None]:
# from sklearn.cluster import KMeans

# wcss = []
# for i in range(1,20):
#     kmeans = KMeans(n_clusters=2, init="k-means++", random_state=42)
#     kmeans.fit(df)
#     wcss.append(kmeans.inertia_)
#     plt.plot(i, wcss)
#     plt.title("Elbow Method")
#     plt.xlabel("Number of Clusters")
#     plt.ylabel("WCSS")
    
#     plt.show()

In [None]:
K = 5
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(df)

## Step 4: Visualize the Clusters

In [None]:
plt.figure(figsize=(8,6))
plt.scatter(df[:, 0], df_scaled[:, 1], c=df['Cluster'], cmap='viridis')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("K-Means Clustering")
plt.show()

InvalidIndexError: (slice(None, None, None), 0)

<Figure size 800x600 with 0 Axes>

## Step 5: Evaluate Using Silhouette Score

In [None]:
# for i in range(1, 50):
    # kmeans = KMeans(n_clusters=i, random_state=42, n_init=10)
    # df['Cluster'] = kmeans.fit_predict(df_scaled)
sil_score = silhouette_score(df_scaled, df['Cluster'])
print(f"Silhouette Score: {sil_score:.3f}")

Silhouette Score: 0.734


## Step 6: Analysis

In [None]:
print("Analysis:")
print("- Clusters were formed based on feature similarity.")
print("- Higher Silhouette Score means better clustering.")
print("- Try different K values to see the best segmentation.")

Analysis:
- Clusters were formed based on feature similarity.
- Higher Silhouette Score means better clustering.
- Try different K values to see the best segmentation.
