In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Pokemon.csv')
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [3]:
# Select stats for clustering (excluding "Type 2", name, generation, total, legendary for clustering)
cluster_features = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

df_cluster = df[['Type 1', 'Name'] + cluster_features] 


In [4]:
# Store optimal clusters and silhouette scores
optimal_clusters = {}
silhouette_scores = {}


In [5]:
# Iterate over each Type 1 to find the best cluster count
for poke_type in df_cluster['Type 1'].unique():
    type_df = df_cluster[df_cluster['Type 1'] == poke_type]
    X = type_df[cluster_features]

    # Define a pipeline with MinMaxScaler and KMeans
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('cluster', KMeans(random_state=0))
    ])
    
    best_score = -1
    best_k = 2
    silhouette_scores[poke_type] = []

    # Find silhouette scores for clusters ranging from 2 to 14
    for k in range(2, min(15, len(X))):
        pipeline.set_params(cluster__n_clusters=k)
        pipeline.fit(X)
        labels = pipeline.named_steps['cluster'].labels_
        
        score = silhouette_score(X, labels)
        silhouette_scores[poke_type].append((k, score))
        
        # Track the best silhouette score and cluster count
        if score > best_score:
            best_score = score
            best_k = k
    
    optimal_clusters[poke_type] = (best_k, best_score)

In [6]:
# Display results in specified format
for poke_type, clusters in silhouette_scores.items():
    print(f"{poke_type}\n-----------")
    for k, score in clusters:
        print(f"{k} clusters: {score}")
    
    best_k, best_score = optimal_clusters[poke_type]
    print(f"best number of clusters: {best_k}")
    print(f"best score: {best_score}\n")
    

for poke_type, clusters in silhouette_scores.items():
    print(f"{poke_type}\n-----------")
    best_k, best_score = optimal_clusters[poke_type]
    
    # Re-run clustering with the optimal number of clusters for each type
    type_df = df_cluster[df_cluster['Type 1'] == poke_type].copy()
    X = type_df[cluster_features]
    pipeline.set_params(cluster__n_clusters=best_k)
    pipeline.fit(X)
    type_df['Cluster'] = pipeline.named_steps['cluster'].labels_

    # Display clusters with mean values
    for cluster_id in range(best_k):
        cluster_data = type_df[type_df['Cluster'] == cluster_id]
        print(f"Cluster {cluster_id}")
        # print(cluster_data[["Name"] + cluster_features].to_string(index=False))
        print(cluster_data[["Name"] + cluster_features])
        
        # Calculate and display mean statistics for each cluster
        mean_stats = cluster_data[cluster_features].mean()
        print("Mean HP:", mean_stats["HP"])
        print("Mean Attack:", mean_stats["Attack"])
        print("Mean Defense:", mean_stats["Defense"])
        print("Mean Sp. Atk:", mean_stats["Sp. Atk"])
        print("Mean Sp. Def:", mean_stats["Sp. Def"])
        print("Mean Speed:", mean_stats["Speed"], "\n")

Grass
-----------
2 clusters: 0.32466886500672937
3 clusters: 0.19578316342059898
4 clusters: 0.12714510705172744
5 clusters: 0.1344223618625614
6 clusters: 0.12584496521527
7 clusters: 0.13139952771002608
8 clusters: 0.1393127919470965
9 clusters: 0.13576097474075602
10 clusters: 0.1468445415231868
11 clusters: 0.1455368059913252
12 clusters: 0.14621654440242363
13 clusters: 0.1478212474796923
14 clusters: 0.14228469887017597
best number of clusters: 2
best score: 0.32466886500672937

Fire
-----------
2 clusters: 0.37244048426523624
3 clusters: 0.3530180167707559
4 clusters: 0.19583479899589368
5 clusters: 0.20180081774598946
6 clusters: 0.219771371151066
7 clusters: 0.1752488727688432
8 clusters: 0.1534433377918612
9 clusters: 0.158518931795672
10 clusters: 0.18724952586159632
11 clusters: 0.20783365962399164
12 clusters: 0.20324596411404908
13 clusters: 0.20415054615013886
14 clusters: 0.20419222128616207
best number of clusters: 2
best score: 0.37244048426523624

Water
-----------
