# 5. Category-Specific Clustering
Analyzing clusters within different board game categories.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Set display options and styling
pd.set_option('display.max_columns', None)  # never truncate columns when printing frames
pd.set_option('display.width', 1000)
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('seaborn-v0_8-whitegrid')

# Create the output directories for plots and data frames if they don't exist.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
import os
for output_dir in ('../plots', '../frames'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load processed data: the filtered game table and the array of binary
# feature column names used to index it further down.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load .npy files from a trusted source.
filtered_df = pd.read_csv(
    '../frames/filtered_games.csv',
)
all_binary_cols = np.load(
    '../frames/all_binary_cols.npy',
    allow_pickle=True,
)

print(
    f"Loaded {len(filtered_df)} games "
    f"with {len(all_binary_cols)} binary features"
)

# Cluster Each Category and Collect Results

In [None]:
# Define major categories.
# Each entry is an indicator column of filtered_df; a game is treated as
# belonging to the category when that column equals 1.
major_categories = [
    'Cat:Thematic',
    'Cat:Strategy',
    'Cat:War',
    'Cat:Family',
    'Cat:CGS',
    'Cat:Abstract',
    'Cat:Party',
    'Cat:Childrens',
]

# Initialize lists to store data for plots
category_clusters = []  # one dict per retained cluster: Category, Cluster, AvgRating, Size
noise_data = []         # one dict per processed category: noise vs. clustered share

# Collect data from each category
for category in major_categories:
    # Skip (instead of crashing with KeyError) when the indicator column
    # is absent from the loaded data.
    if category not in filtered_df.columns:
        print(f"Skipping {category}: column not found in data")
        continue

    # Get category name without "Cat:"
    cat_name = category.replace("Cat:", "").replace(" Binary", "")

    # Filter for this category (copy so the label column added below does
    # not touch filtered_df)
    category_games = filtered_df[filtered_df[category] == 1].copy()

    # Skip if too few games
    if len(category_games) < 100:
        continue

    # PCA: fit once with all components, then keep the leading components
    # that together explain at least 80% of the variance. Slicing the full
    # projection avoids a second fit on the same matrix; a refit with an
    # integer n_components may select the randomized solver under
    # svd_solver='auto', which is not reproducible without a random_state.
    pca_cat = PCA()
    full_projection = pca_cat.fit_transform(category_games[all_binary_cols])
    explained_variance = np.cumsum(pca_cat.explained_variance_ratio_)
    n_components_cat = int(np.argmax(explained_variance >= 0.8)) + 1
    print(f"Using {n_components_cat} components (80% variance) for {category}")

    pca_result_cat = np.ascontiguousarray(full_projection[:, :n_components_cat])

    # Choose epsilon from the k-distance curve: take the last column of the
    # 10-neighbour distances (the fitted points are re-queried, so each
    # point is normally its own first neighbour), sort it, and read the
    # value one tenth of the way in. This is a fixed-quantile heuristic,
    # not an elbow detection; eps is floored at 0.1.
    neighbors = NearestNeighbors(n_neighbors=10)
    neighbors_fit = neighbors.fit(pca_result_cat)
    distances, _ = neighbors_fit.kneighbors(pca_result_cat)
    sorted_distances = np.sort(distances[:, 9])
    elbow_index = max(1, len(sorted_distances) // 10)
    eps_cat = max(0.1, sorted_distances[elbow_index])
    print(f"Using epsilon = {eps_cat:.2f}")

    # Apply DBSCAN; label -1 marks noise points
    dbscan_cat = DBSCAN(eps=eps_cat, min_samples=10)
    clusters_cat = dbscan_cat.fit_predict(pca_result_cat)

    # Add cluster labels
    category_games['Category_Cluster'] = clusters_cat

    # Analyze clusters
    cluster_ids = np.unique(clusters_cat)  # sorted unique labels, noise included
    n_clusters = int(np.count_nonzero(cluster_ids != -1))
    n_noise = int(np.count_nonzero(clusters_cat == -1))
    total_games = len(category_games)
    noise_pct = (n_noise / total_games) * 100

    print(f"Found {n_clusters} clusters and {n_noise} noise points ({noise_pct:.1f}%)")

    # Collect data for noise percentage plot
    noise_data.append({
        'Category': cat_name,
        'Total Games': total_games,
        'Noise Percentage': noise_pct,
        'Clustered Percentage': 100 - noise_pct,
    })

    # Collect data for cluster rating comparison
    for cluster_id in cluster_ids:
        if cluster_id == -1:  # Skip noise points
            continue

        cluster_games = category_games[category_games['Category_Cluster'] == cluster_id]
        cluster_size = len(cluster_games)

        # Skip very small clusters
        if cluster_size < 10:
            continue

        avg_rating = cluster_games['AvgRating'].mean()
        category_clusters.append({
            'Category': cat_name,
            'Cluster': f"{cat_name} {cluster_id}",
            'AvgRating': avg_rating,
            'Size': cluster_size,
        })

        # Show the three highest-rated games of the cluster
        top_games = cluster_games.sort_values('AvgRating', ascending=False).head(3)
        print(f"\nCluster {cluster_id} ({cluster_size} games, avg rating: {avg_rating:.2f}):")
        print("Top games:")
        for _, game in top_games.iterrows():
            print(f"  {game['Name']} ({game['YearPublished']}) - Rating: {game['AvgRating']:.2f}")

# Visualize Results

In [None]:
# Create cluster rating comparison plot and cluster size vs. rating scatter
# plot. Both are built from category_clusters and skipped when it is empty.
if category_clusters:
    cluster_df = pd.DataFrame(category_clusters)

    plt.figure(figsize=(16, 8))
    ax = sns.barplot(x='Category', y='AvgRating', hue='Cluster', data=cluster_df)
    plt.title('Average Rating by Category and Cluster')
    plt.xticks(rotation=45)
    plt.ylim(5, 8)  # Adjust based on your data
    plt.grid(axis='y', alpha=0.3)
    plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    # Save BEFORE show(): show() may release the current figure, in which
    # case a later savefig() writes an empty image.
    plt.savefig('../plots/category_cluster_ratings.png')
    plt.show()
    plt.close()

    # Create cluster size vs rating scatter plot
    plt.figure(figsize=(16, 8))
    # One tab10 colour per category (wraps around after 10 categories)
    categories = cluster_df['Category'].unique()
    color_map = {cat: plt.cm.tab10(i % 10) for i, cat in enumerate(categories)}

    # Plot points, one scatter call per category so each gets a legend entry
    for cat in categories:
        cat_data = cluster_df[cluster_df['Category'] == cat]
        plt.scatter(
            x=cat_data['Size'],
            y=cat_data['AvgRating'],
            color=color_map[cat],
            s=100,
            alpha=0.7,
            label=cat,
        )

    # Add text labels for each point; the cluster id is the last
    # whitespace-separated token of the 'Cluster' label.
    for _, row in cluster_df.iterrows():
        plt.annotate(
            f"Cluster {row['Cluster'].split()[-1]}",
            (row['Size'], row['AvgRating']),
            xytext=(5, 0),
            textcoords='offset points',
            fontsize=8,
        )

    plt.title('Cluster Size vs. Average Rating by Category')
    plt.xlabel('Number of Games in Cluster')
    plt.ylabel('Average Rating')
    plt.grid(True, alpha=0.3)
    plt.legend(title='Category')
    plt.tight_layout()
    # Save before show() for the same reason as above.
    plt.savefig('../plots/cluster_size_vs_rating.png')
    plt.show()
    plt.close()

# Create noise percentage comparison plot: one bar per processed category
# showing the share of its games that DBSCAN labelled as noise.
if noise_data:
    noise_df = pd.DataFrame(noise_data)

    plt.figure(figsize=(14, 6))
    bars = plt.bar(noise_df['Category'], noise_df['Noise Percentage'], color='lightgray')
    plt.title('Noise Percentage in Category Clustering')
    plt.xlabel('Category')
    plt.ylabel('Percentage of Games Classified as Noise')
    plt.xticks(rotation=45)
    plt.ylim(0, 100)
    plt.grid(axis='y', alpha=0.3)

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2.,
            height + 1,  # small vertical offset so the text clears the bar
            f'{height:.1f}%',
            ha='center',
        )

    plt.tight_layout()
    # Save BEFORE show(): show() may release the current figure, in which
    # case a later savefig() writes an empty image.
    plt.savefig('../plots/category_noise_percentage.png')
    plt.show()
    plt.close()