#  K-Means Clustering - Machine Learning

## Step 1: Import Libraries

In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

## Step 2: Load and Prepare Data

In [2]:
# Load dataset
data = pd.read_csv("cleaned_final_data.csv")

In [4]:
data.columns

Index(['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards',
       'second yellow cards', 'red cards', 'goals conceded', 'clean sheets',
       ...
       'position_Defender Centre-Back', 'position_Defender Left-Back',
       'position_Defender Right-Back', 'position_Goalkeeper',
       'position_midfield', 'position_midfield-AttackingMidfield',
       'position_midfield-CentralMidfield',
       'position_midfield-DefensiveMidfield', 'position_midfield-LeftMidfield',
       'position_midfield-RightMidfield'],
      dtype='object', length=21742)

In [None]:
print(data_clustering.dtypes)

In [8]:
# Remove label column (we don't need target labels in clustering)
data_clustering = data.drop(columns=["market_value_category"], errors='ignore')

# Standardize the features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_clustering)

ValueError: could not convert string to float: '0.0286077560.028607756'

## Step 3: Finding Optimal k (Elbow Method)

In [9]:
# Find the optimal k using the Elbow Method
wcss = []  # Within-Cluster Sum of Squares
k_range = range(1, 15)

for k in k_range:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)

# Plot the Elbow Method
plt.figure(figsize=(8, 4))
plt.plot(k_range, wcss, marker='o', linestyle='--', color='b')
plt.xlabel("Number of Clusters (k)")
plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
plt.title("Elbow Method to Find Optimal k")
plt.grid(True)
plt.show()

NameError: name 'data_scaled' is not defined

## Step 4: Evaluating Clustering Quality (Silhouette Score)
- **Silhouette Score** measures **how well points fit inside clusters**.

In [None]:
# Find best k using Silhouette Score
silhouette_scores = []

for k in range(2, 15):  # Silhouette Score requires at least 2 clusters
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    labels = kmeans.fit_predict(data_scaled)
    score = silhouette_score(data_scaled, labels)
    silhouette_scores.append(score)

# Plot Silhouette Scores
plt.figure(figsize=(8, 4))
plt.plot(range(2, 15), silhouette_scores, marker='o', linestyle='--', color='g')
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score for Different k Values")
plt.grid(True)
plt.show()

# Best k
best_k = np.argmax(silhouette_scores) + 2
print(f" Best k based on Silhouette Score: {best_k}")


## Step 5: Train the K-Means Model
- We **train K-Means** with the best k value found.

In [None]:
# Train K-Means with the best k
kmeans = KMeans(n_clusters=best_k, init='k-means++', max_iter=300, n_init=10, random_state=42)
data_clustering["Cluster"] = kmeans.fit_predict(data_scaled)

print(data_clustering["Cluster"].value_counts())

## Step 6: Visualizing Clussters
- We **train K-Means** with the best k value found.

In [None]:
# Visualize clusters using first two features
plt.figure(figsize=(8, 6))
sns.scatterplot(x=data_scaled[:, 0], y=data_scaled[:, 1], hue=data_clustering["Cluster"], palette="viridis")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title(f"K-Means Clustering (k={best_k})")
plt.legend(title="Cluster")
plt.show()

## Step 7: Cluster Analysis & Insights
- This helps **interpret what each cluster represents**.

In [None]:
# Analyzing average feature values per cluster
cluster_means = data_clustering.groupby("Cluster").mean()
print("\nCluster Centers (Feature Means):")
print(cluster_means)

## Step 8: Save Clustered Data
- We save the dataset with **assigned cluster labels** for further analysis.


In [None]:
# Save the clustered dataset
data_clustering.to_csv("clustered_player_data.csv", index=False)

## Step 9: Compare Clusters with Original Player Attributes

### 1. Add Cluster Labels to Original Data

In [None]:
# Merge cluster labels with the original dataset
data_clustering["Cluster"] = kmeans.labels_

# Display sample data with clusters
data_clustering.head()

### 2. Cluster Distribution by Player Position

In [None]:
# Count the number of players in each cluster per position
plt.figure(figsize=(10,6))
sns.countplot(data=data_clustering, x="position", hue="Cluster", palette="viridis")
plt.xticks(rotation=45)
plt.xlabel("Player Position")
plt.ylabel("Count")
plt.title("Cluster Distribution by Player Position")
plt.legend(title="Cluster")
plt.show()

### 3. Analyzing Market Value Across Clusters

In [None]:
# Compare market value across clusters
plt.figure(figsize=(10,6))
sns.boxplot(data=data_clustering, x="Cluster", y="market_value", palette="coolwarm")
plt.xlabel("Cluster")
plt.ylabel("Market Value (€)")
plt.title("Market Value Distribution Across Clusters")
plt.show()

### 4. Compare Clusters Based on Key Attributes

In [None]:
# Compute the average attribute values per cluster
cluster_means = data_clustering.groupby("Cluster").mean()
print("\nAverage Player Attributes per Cluster:")
print(cluster_means)

### 5. Visualizing Clusters Using Key Attributes

In [None]:
# Scatter plot of Age vs. Market Value, colored by cluster
plt.figure(figsize=(10,6))
sns.scatterplot(data=data_clustering, x="age", y="market_value", hue="Cluster", palette="viridis")
plt.xlabel("Age")
plt.ylabel("Market Value (€)")
plt.title("Player Clusters Based on Age & Market Value")
plt.legend(title="Cluster")
plt.show()

## Step 10: Labeling Clusters with Meaningful Names


### Define Cluster Labels Based on Insights

In [None]:
# Define meaningful cluster names based on insights
cluster_labels = {
    0: "Elite Players",
    1: "Young Talents",
    2: "Experienced Veterans",
    3: "Promising Midfielders",
    4: "Defensive Leaders",
    5: "Attack Prospects",
    6: "Squad Players",
    7: "Developing Talents"
}

# Apply labels to the dataset
data_clustering["Cluster_Label"] = data_clustering["Cluster"].map(cluster_labels)

# Show sample labeled clusters
data_clustering[["Cluster", "Cluster_Label"]].drop_duplicates()


### Visualizing Clusters with Names

In [None]:
# Count of players in each labeled cluster
plt.figure(figsize=(10,6))
sns.countplot(data=data_clustering, x="Cluster_Label", palette="viridis")
plt.xticks(rotation=45)
plt.xlabel("Cluster Label")
plt.ylabel("Player Count")
plt.title("Final Player Clusters with Labels")
plt.show()

### Save the Final Dataset with Labels

In [None]:
# Save the dataset with labeled clusters
data_clustering.to_csv("final_clustered_players.csv", index=False)
print("✅ Final dataset saved as 'final_clustered_players.csv'.")

###