# 4. Low Ratings Clustering Analysis
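This notebook clusters low-rated board games (average rating < 6.0), profiles each cluster, and fits decision trees to identify the features that distinguish each cluster.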

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Import clustering utilities
from cluster_utils import agglomerative_grid_search

# Set display options and styling
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('seaborn-v0_8-whitegrid')

# Make sure the output directories exist. `exist_ok=True` keeps the call
# idempotent and avoids the race between a separate existence check and
# the directory creation.
import os
for output_dir in ('../plots', '../frames'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the processed game table and the saved feature-column name arrays
def _load_column_names(npy_path):
    """Return the array stored in the .npy file at `npy_path`."""
    # NOTE(review): allow_pickle=True unpickles the file contents, so these
    # files should only ever come from a trusted preprocessing step.
    return np.load(npy_path, allow_pickle=True)


filtered_df = pd.read_csv('../frames/filtered_games.csv')
all_binary_cols = _load_column_names('../frames/all_binary_cols.npy')
mechanics_cols = _load_column_names('../frames/mechanics_cols.npy')
themes_cols = _load_column_names('../frames/themes_cols.npy')
category_cols = _load_column_names('../frames/category_cols.npy')
subcategories_cols = _load_column_names('../frames/subcategories_cols.npy')

print(f"Loaded {len(filtered_df)} games with {len(all_binary_cols)} binary features")

# Filter for Low-Rated Games

In [None]:
# Games whose average rating is strictly below this value count as low-rated
LOW_RATING_THRESHOLD = 6.0

# Keep an independent copy so columns added later do not touch filtered_df
low_rated_games = filtered_df.loc[
    filtered_df['AvgRating'] < LOW_RATING_THRESHOLD
].copy()

# Summary of the subset relative to the full dataset
print(f"Total games: {len(filtered_df)}")
print(f"Low-rated games (< {LOW_RATING_THRESHOLD}): {len(low_rated_games)}")
print(f"Percentage of original dataset: {len(low_rated_games) / len(filtered_df) * 100:.1f}%")
print(f"Mean: {low_rated_games['AvgRating'].mean():.3f}")

# Histogram (with KDE overlay) of the ratings inside the low-rated subset
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(low_rated_games['AvgRating'], bins=20, kde=True, ax=ax)
ax.set_title(f'Rating Distribution of Low-Rated Games (< {LOW_RATING_THRESHOLD})')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Count')
ax.grid(axis='y', alpha=0.3)
fig.savefig('../plots/low_rating_distribution.png')
plt.show()

# Cluster Low-Rated Games

In [None]:
# Perform clustering on low-rated games.
# The cells below read `labels`, `pca_result` and the 'LowRatingCluster'
# column, so a failure here is reported and then re-raised: continuing would
# only produce a NameError later, or silently reuse stale results from an
# earlier run in the same kernel.
print("\nClustering low-rated games...")
try:
    # Only the search itself is guarded; the reporting below runs on success.
    best_params, pca_result, labels, scores, pca, n_components = agglomerative_grid_search(
        low_rated_games, all_binary_cols, variance_threshold=0.8, max_components=50
    )
except Exception as e:
    print(f"Error in clustering: {e}")
    raise
else:
    # Add cluster labels to low_rated_games
    low_rated_games['LowRatingCluster'] = labels

    print(f"\nClustering completed with {len(set(labels))} clusters")
    print(f"Best parameters: n_clusters={best_params[0]}, linkage={best_params[1]}, metric={best_params[2]}")
    print(f"Silhouette score: {scores['silhouette']:.4f}")
    print(f"Calinski-Harabasz score: {scores['ch_score']:.1f}")
    print(f"Davies-Bouldin score: {scores['db_score']:.4f}")

# Visualize Clustering Results

In [None]:
# Scatter the first two PCA components, colored by cluster label.
# Skipped when there are no labels at all.
n_clusters = len(set(labels))
if n_clusters > 0:
    fig, ax = plt.subplots(figsize=(12, 10))
    scatter = ax.scatter(
        pca_result[:, 0],
        pca_result[:, 1],
        c=labels,
        cmap='viridis',
        alpha=0.6,
        s=30,
    )
    fig.colorbar(scatter, ax=ax, label='Cluster')
    ax.set_title('Low-Rated Board Game Clusters in PCA Space')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.grid(True)
    fig.savefig('../plots/clusters_2d_low_rated.png')
    plt.show()
    # Release the figure once it has been saved and displayed
    plt.close(fig)

# Analyze Cluster Profiles

In [None]:
# Per-cluster size and rating summary, one row per cluster, ordered from the
# highest to the lowest mean rating. Named aggregation yields the flat column
# names directly.
cluster_stats = (
    low_rated_games
    .groupby('LowRatingCluster')
    .agg(
        Count=('BGGId', 'count'),
        AvgRating_Mean=('AvgRating', 'mean'),
        AvgRating_Median=('AvgRating', 'median'),
        AvgRating_Min=('AvgRating', 'min'),
        AvgRating_Max=('AvgRating', 'max'),
        AvgRating_Std=('AvgRating', 'std'),
    )
    .reset_index()
    .sort_values('AvgRating_Mean', ascending=False)
)

print("\nCluster statistics (sorted by average rating):")
print(cluster_stats)

# Rank feature columns by how often they are set
def get_top_features(df, feature_cols, n=5):
    """Return the `n` most prevalent features of `df` as (name, percent) pairs.

    Prevalence is the column mean multiplied by 100. Entries of
    `feature_cols` that are not columns of `df` are ignored. A column name
    containing ':' is reported by the text after its first ':'; other names
    are reported unchanged. Results are ordered from most to least prevalent,
    with ties kept in `feature_cols` order.
    """
    prevalence_by_name = {}
    for column in feature_cols:
        if column not in df.columns:
            continue
        # Strip the "prefix:" part of the column name, if there is one
        _, separator, remainder = column.partition(':')
        label = remainder if separator else column
        prevalence_by_name[label] = df[column].mean() * 100

    # Stable descending sort, then keep the first n entries
    ranked = list(prevalence_by_name.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Print cluster profiles.
# Clusters are visited in ascending ID order so the report is deterministic
# and lines up with the per-cluster decision-tree output further down.
# Each entry: (section heading, feature columns, number of features to list).
feature_sections = [
    ("Top Mechanics", mechanics_cols, 5),
    ("Top Themes", themes_cols, 5),
    ("Top Categories", category_cols, 3),
    ("Top Subcategories", subcategories_cols, 3),
]

print("\nCluster Profiles:")
for cluster_id in sorted(low_rated_games['LowRatingCluster'].unique()):
    cluster_df = low_rated_games[low_rated_games['LowRatingCluster'] == cluster_id]

    # Summary row for this cluster from the precomputed statistics table
    stats = cluster_stats[cluster_stats['LowRatingCluster'] == cluster_id].iloc[0]

    print(f"\n==== Cluster {cluster_id} ====")
    print(f"Games: {len(cluster_df)} ({len(cluster_df) / len(low_rated_games) * 100:.1f}% of low-rated games)")
    print(f"Average Rating: {stats['AvgRating_Mean']:.2f} (min: {stats['AvgRating_Min']:.2f}, max: {stats['AvgRating_Max']:.2f})")

    # Most prevalent features per feature group
    for heading, section_cols, top_n in feature_sections:
        print(f"\n{heading}:")
        for feature, prevalence in get_top_features(cluster_df, section_cols, n=top_n):
            print(f"  {feature}: {prevalence:.1f}%")

    # Example games: the five best-rated members of the cluster
    print("\nExample Games:")
    best_rated = cluster_df.sort_values('AvgRating', ascending=False).head(5)
    for name, year, rating in zip(
        best_rated['Name'], best_rated['YearPublished'], best_rated['AvgRating']
    ):
        print(f"  {name} ({year}): {rating:.2f}")

# Decision Tree Analysis of Low-Rated Clusters

In [None]:
# For each cluster, build a shallow one-vs-rest decision tree that separates
# it from all other clusters, print the features the tree relies on, and save
# a plot of the tree.

# The feature matrix is identical for every cluster, so build it once.
X = low_rated_games[all_binary_cols]
# Plain list of names: all_binary_cols is loaded as a NumPy array, and a list
# is the most widely accepted input for plot_tree's feature_names.
feature_names = list(all_binary_cols)

for cluster_id in sorted(low_rated_games['LowRatingCluster'].unique()):
    # Create binary target: 1 for this cluster, 0 for other clusters
    y = (low_rated_games['LowRatingCluster'] == cluster_id).astype(int)

    # Train a decision tree (fixed random_state for reproducible splits)
    dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10, random_state=42)
    dt.fit(X, y)

    # Get feature importances, most important first
    importances = pd.DataFrame({
        'Feature': feature_names,
        'Importance': dt.feature_importances_
    }).sort_values('Importance', ascending=False)

    # Print top features (only those the tree actually used)
    print(f"\nTop features for identifying Cluster {cluster_id}:")
    used = importances[importances['Importance'] > 0].head(10)
    for feature, importance in zip(used['Feature'], used['Importance']):
        print(f"  {feature}: {importance:.4f}")

    # Visualize the decision tree
    plt.figure(figsize=(20, 10))
    plot_tree(
        dt,
        feature_names=feature_names,
        class_names=['Other Clusters', f'Cluster {cluster_id}'],
        filled=True,
        rounded=True,
        fontsize=10
    )
    plt.title(f'Decision Tree for Low-Rated Cluster {cluster_id}')
    plt.tight_layout()
    plt.savefig(f'../plots/low_rated_cluster_{cluster_id}_decision_tree.png')
    plt.show()
    # Close the figure so large figures do not accumulate across iterations
    plt.close()

# Save Results

In [None]:
# Write the full low-rated table, including the cluster column
low_rated_games.to_csv('../frames/low_rated_games_with_clusters.csv', index=False)

# Two-column game-to-cluster table for joining back onto the main dataset
low_rating_clusters = low_rated_games[['BGGId', 'LowRatingCluster']].copy()
low_rating_clusters.to_csv('../frames/low_rating_clustering_assignments.csv', index=False)

# Per-cluster summary statistics
cluster_stats.to_csv('../frames/low_rating_cluster_statistics.csv', index=False)

print(f"Saved clustering results for {len(low_rated_games)} low-rated games")