# 4. Clustering the Entire Dataset
Using DBSCAN to identify natural clusters of board games.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Set display options and styling
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('seaborn-v0_8-whitegrid')

# Create the output directories used by this notebook if they don't exist.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
import os
for output_dir in ('../plots', '../frames'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load processed data: the binary feature column names and the games table.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# the .npy file must come from a trusted source.
all_binary_cols = np.load('../frames/all_binary_cols.npy', allow_pickle=True)
filtered_df = pd.read_csv('../frames/filtered_games.csv')

print(f"Loaded {len(filtered_df)} games with {len(all_binary_cols)} binary features")

In [None]:
# Load PCA results: the fitted model first, then the projected coordinates
import joblib

pca = joblib.load('../frames/pca_model.pkl')
n_components = pca.n_components_
pca_df = pd.read_csv('../frames/pca_results.csv')

# Extract the PC1..PC<n_components> columns as a plain NumPy array
pc_columns = [f'PC{i+1}' for i in range(n_components)]
pca_result = pca_df[pc_columns].to_numpy()

print(f"Loaded PCA results with {n_components} components")

# Use a k-distance graph to find an appropriate eps

In [None]:
# Find appropriate epsilon with k-distance graph
print("Generating k-distance graph...")
neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(pca_result)
# NOTE: the training data is passed back in as the query, so each point is
# returned as its own nearest neighbour (distance 0); the last column is
# therefore the distance to the 19th *other* point.
distances, indices = neighbors_fit.kneighbors(pca_result)

# Keep the distance to the farthest requested neighbour. Taking the last
# column (rather than a hard-coded index) keeps this in sync with n_neighbors.
# Sorted ascending so the "elbow" is visible in the plot.
distances = np.sort(distances[:, -1])

# Plot k-distance graph
plt.figure(figsize=(10, 6))
plt.plot(distances)
plt.axhline(y=0.8, color='r', linestyle='--', label='Potential epsilon')
plt.title('K-Distance Graph')
plt.xlabel('Data Points (sorted by distance)')
plt.ylabel('Distance to 20th Nearest Neighbor')
plt.grid(True)
plt.legend()
# Save BEFORE show(): with the notebook inline backend (and interactive GUI
# backends) show() flushes/closes the current figure, so a later savefig()
# would write an empty image.
plt.savefig('../plots/k_distance_graph.png')
plt.show()
plt.close()

# Grid Search for eps and min_samples Selection

In [None]:
# Sweep a small grid of DBSCAN hyperparameters and report, for every
# (eps, min_samples) pair, the number of clusters, the share of noise points
# and, when it is defined, the silhouette score of the clustered points
print("\nTrying different DBSCAN parameters:")
for eps in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5]:
    for min_samples in [10, 15, 20, 25]:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(pca_result)

        # DBSCAN labels noise as -1; that label is not counted as a cluster
        unique_labels = np.unique(labels)
        n_noise = int((labels == -1).sum())
        n_clusters = unique_labels.size - int(-1 in unique_labels)

        print(f"eps={eps}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise points ({n_noise/len(labels)*100:.1f}%)")

        # The silhouette score is only computed when there is more than one cluster
        if n_clusters <= 1:
            continue

        # Drop noise points from both the data and the labels before scoring
        mask = labels != -1
        silhouette_avg = silhouette_score(pca_result[mask], labels[mask])
        print(f"Silhouette score (excluding noise): {silhouette_avg:.4f}")

# Run DBSCAN With Chosen Hyperparams

In [None]:
# Chosen hyperparameters; eps matches the "Potential epsilon" line drawn on
# the k-distance graph.
eps = 0.8
min_samples = 15
print(f"\nApplying DBSCAN with eps={eps}, min_samples={min_samples}")

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
cluster_labels = dbscan.fit_predict(pca_result)

# Add cluster labels to both dataframes.
# NOTE(review): assumes pca_df and filtered_df rows are in the same order as
# pca_result — TODO confirm against the notebook that wrote these files.
pca_df['DBSCAN_Cluster'] = cluster_labels
filtered_df['DBSCAN_Cluster'] = cluster_labels

# Analyze clusters (label -1 is DBSCAN's noise label, not a cluster)
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
n_noise = int(np.sum(cluster_labels == -1))
print(f"DBSCAN found {n_clusters} clusters and {n_noise} noise points ({n_noise/len(cluster_labels)*100:.1f}%)")

# Calculate silhouette score if more than one cluster
if n_clusters > 1:
    # We need to filter both the data points and labels to exclude noise points
    mask = cluster_labels != -1
    silhouette_avg = silhouette_score(pca_result[mask], cluster_labels[mask])
    print(f"Silhouette score (excluding noise): {silhouette_avg:.4f}")

# Visualize clusters in 2D PCA space
plt.figure(figsize=(12, 10))
scatter = plt.scatter(pca_df['PC1'], pca_df['PC2'],
                      c=pca_df['DBSCAN_Cluster'], cmap='viridis',
                      alpha=0.6, s=30)
plt.colorbar(scatter, label='Cluster')
plt.title('Board Game Clusters in PCA Space (PC1 vs PC2)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(True)
# Save BEFORE show(): with the notebook inline backend (and interactive GUI
# backends) show() flushes/closes the current figure, so a later savefig()
# would write an empty image.
plt.savefig('../plots/dbscan_clusters.png')
plt.show()
plt.close()

# Cluster Analysis

In [None]:
# Rating distribution by cluster (one box per DBSCAN label)
if n_clusters > 0:
    plt.figure(figsize=(12, 8))
    sns.boxplot(x='DBSCAN_Cluster', y='AvgRating', data=filtered_df)
    plt.title('Rating Distribution by Cluster')
    plt.xlabel('Cluster')
    plt.ylabel('Average Rating')
    # Save BEFORE show(): with the notebook inline backend (and interactive
    # GUI backends) show() flushes/closes the current figure, so a later
    # savefig() would write an empty image.
    plt.savefig('../plots/rating_by_cluster.png')
    plt.show()
    plt.close()

In [None]:
# Summarise each DBSCAN cluster: rating/popularity statistics, the binary
# features whose prevalence differs most from the rest of the data, and the
# three highest-rated games as examples
if n_clusters > 0:
    print("\nCluster Statistics:")
    cluster_stats = filtered_df.groupby('DBSCAN_Cluster').agg({
        'AvgRating': ['mean', 'std', 'count'],
        'NumUserRatings': ['mean', 'median']
    })
    print(cluster_stats)

    # For each cluster, find defining features
    print("\nDefining features for each cluster:")
    # Label -1 is the noise label and is not reported as a cluster
    real_cluster_ids = [label for label in sorted(set(cluster_labels)) if label != -1]
    for i in real_cluster_ids:
        in_cluster = filtered_df['DBSCAN_Cluster'] == i
        cluster_games = filtered_df[in_cluster]
        # Everything outside the cluster, noise points included
        other_games = filtered_df[~in_cluster]

        # (feature, difference, share inside cluster, share outside cluster)
        feature_rows = []
        for col in all_binary_cols:
            inside_share = cluster_games[col].mean()
            outside_share = other_games[col].mean()
            feature_rows.append((col, inside_share - outside_share, inside_share, outside_share))

        # Largest absolute gap first
        distinctive_features = sorted(feature_rows, key=lambda row: abs(row[1]), reverse=True)

        print(f"\nCluster {i} ({len(cluster_games)} games, avg rating: {cluster_games['AvgRating'].mean():.2f}):")
        print("Distinctive features:")
        for feat, diff, c_mean, o_mean in distinctive_features[:10]:
            print(f"  {feat}: {diff:.4f} ({c_mean*100:.1f}% vs {o_mean*100:.1f}%)")

        # Show example games: the three best-rated members of the cluster
        top_games = cluster_games.sort_values('AvgRating', ascending=False).head(3)
        print("Example games:")
        for _, game in top_games.iterrows():
            print(f"  {game['Name']} ({game['YearPublished']}) - Rating: {game['AvgRating']:.2f}")