# 7. Rating-Stratified Clustering
This section clusters the games separately within each rating bracket and then compares the resulting clusters and feature prevalence across brackets.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Set display options and styling
pd.set_option('display.max_columns', None)  # print every column of a frame
pd.set_option('display.width', 1000)
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('seaborn-v0_8-whitegrid')

# Create the output directories for plots and intermediate frames.
# exist_ok=True makes the call a no-op when the directory already exists and
# removes the check-then-create race of a separate os.path.exists() test.
import os
for output_dir in ('../plots', '../frames'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load processed data: the filtered game table and the array of binary
# feature column names, both read from the ../frames directory.
games_csv_path = '../frames/filtered_games.csv'
binary_cols_path = '../frames/all_binary_cols.npy'

filtered_df = pd.read_csv(games_csv_path)
# NOTE(review): allow_pickle=True will unpickle arbitrary objects. That is only
# safe if this .npy file is produced by this project itself — confirm its origin.
all_binary_cols = np.load(binary_cols_path, allow_pickle=True)

n_games, n_binary_features = len(filtered_df), len(all_binary_cols)
print(f"Loaded {n_games} games with {n_binary_features} binary features")

In [None]:
# Load Random Forest results (a table with at least a 'Feature' column,
# which the clustering cell below reads).
feature_importance_path = '../frames/feature_importance.csv'
feature_importance = pd.read_csv(feature_importance_path)

print("Loaded Random Forest feature importance results")

# Cluster Each Rating Bin and Collect Results

In [None]:
# Cluster the games of every rating bracket independently (PCA -> DBSCAN),
# print a per-bin report, and collect the data used by the plots further down.

# Initialize lists to store data for visualizations
rating_bin_clusters = []  # One summary row per analysed bin (cluster count, games, noise %)
feature_dist = []  # Long-format rows: rating bin, feature, prevalence in percent
top_features_by_bin = {}  # rating bin -> Series of its 10 most prevalent binary features

# The features compared across bins do not depend on the bin, so look them up once.
# NOTE(review): head(15) is the "top 15" only if feature_importance.csv is
# already sorted by descending importance — confirm against the step that wrote it.
important_features = feature_importance.head(15)['Feature'].tolist()

# plot_tree is given a plain list: some scikit-learn releases reject a NumPy
# array for feature_names.
binary_feature_names = list(all_binary_cols)

# Use the Rating_Bracket we created earlier
for rating_bin in filtered_df['Rating_Bracket'].unique():
    bin_games = filtered_df[filtered_df['Rating_Bracket'] == rating_bin].copy()

    if len(bin_games) < 100:  # Skip bins with too few games
        print(f"Skipping {rating_bin} rated games: only {len(bin_games)} games")
        continue

    print(f"\nAnalyzing {rating_bin} rated games ({len(bin_games)} games)")

    # Apply PCA with variance-based component selection: fit once with all
    # components to read the cumulative explained variance, then refit keeping
    # the smallest number of components that reaches 80%.
    pca_bin = PCA()
    pca_bin.fit(bin_games[all_binary_cols])
    explained_variance = np.cumsum(pca_bin.explained_variance_ratio_)
    n_components_bin = np.argmax(explained_variance >= 0.8) + 1
    print(f"Using {n_components_bin} components (80% variance) for {rating_bin} rating bin")

    pca_bin = PCA(n_components=n_components_bin)
    pca_result_bin = pca_bin.fit_transform(bin_games[all_binary_cols])

    # k-distance curve: for every game, the distance to the farthest of its 10
    # returned neighbours (column 9), sorted ascending.
    neighbors = NearestNeighbors(n_neighbors=10)
    neighbors_fit = neighbors.fit(pca_result_bin)
    distances, _ = neighbors_fit.kneighbors(pca_result_bin)
    sorted_distances = np.sort(distances[:, 9])

    # Epsilon is read one tenth of the way up the sorted curve. This is a fixed
    # heuristic, not a computed elbow, and the result is floored at 0.1.
    elbow_index = max(1, len(sorted_distances) // 10)
    eps_bin = max(0.1, sorted_distances[elbow_index])
    print(f"Using epsilon = {eps_bin:.2f}")

    # Apply DBSCAN
    min_samples_bin = 10
    dbscan_bin = DBSCAN(eps=eps_bin, min_samples=min_samples_bin)
    clusters_bin = dbscan_bin.fit_predict(pca_result_bin)

    # Add clusters to dataframe
    bin_games['Rating_Bin_Cluster'] = clusters_bin

    # Analyze clusters (DBSCAN labels noise points with -1)
    is_noise = clusters_bin == -1
    n_noise = int(is_noise.sum())
    n_clusters = len(set(clusters_bin)) - (1 if n_noise else 0)
    noise_pct = (n_noise / len(clusters_bin)) * 100
    print(f"Found {n_clusters} clusters and {n_noise} noise points ({noise_pct:.1f}%)")

    # Store data for cluster count visualization
    rating_bin_clusters.append({
        'Rating Bin': rating_bin,
        'Number of Clusters': n_clusters,
        'Games': len(bin_games),
        'Noise Percentage': noise_pct
    })

    # Get top features for this rating bin: mean of each binary column
    feature_presence = bin_games[all_binary_cols].mean()
    top_features = feature_presence.nlargest(10)
    top_features_by_bin[rating_bin] = top_features

    # Calculate feature prevalence for top features from feature_importance
    for feature in important_features:
        if feature in bin_games.columns:
            prevalence = bin_games[feature].mean() * 100  # Convert to percentage
            feature_dist.append({
                'Rating Bin': rating_bin,
                'Feature': feature,
                'Prevalence': prevalence
            })

    # Everything below describes clusters, so a bin with none is finished here.
    if n_clusters == 0:
        continue

    # Use Decision Tree to understand cluster characteristics
    if n_clusters >= 2:  # Need at least 2 clusters for classification
        # Prepare data for decision tree (excluding noise points)
        clustered_games = bin_games[bin_games['Rating_Bin_Cluster'] != -1]
        X_tree = clustered_games[all_binary_cols]
        y_tree = clustered_games['Rating_Bin_Cluster']

        # Train a decision tree
        tree = DecisionTreeClassifier(max_depth=4)  # Limit depth for interpretability
        tree.fit(X_tree, y_tree)

        # Feature importance
        tree_importance = pd.DataFrame({
            'Feature': all_binary_cols,
            'Importance': tree.feature_importances_
        }).sort_values('Importance', ascending=False)

        print("\nTop features for distinguishing clusters:")
        print(tree_importance.head(10))

        # Visualize the tree
        plt.figure(figsize=(20, 10))
        plot_tree(tree, feature_names=binary_feature_names,
                  class_names=[str(i) for i in tree.classes_],
                  filled=True, rounded=True, fontsize=8)
        plt.title(f'Decision Tree for {rating_bin} Rating Bin Clusters')
        # Save before show(): once show() has displayed the figure, a following
        # savefig() can write an empty image.
        plt.savefig(f'../plots/decision_tree_{rating_bin}.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close()

    # Show cluster statistics (the noise group -1 is included in this table)
    cluster_stats = bin_games.groupby('Rating_Bin_Cluster').agg({
        'AvgRating': ['mean', 'count'],
    })
    print("Cluster statistics:")
    print(cluster_stats)

    # For each real cluster (noise excluded), show example games
    for cluster_id in np.unique(clusters_bin[~is_noise]):
        cluster_bin_games = bin_games[bin_games['Rating_Bin_Cluster'] == cluster_id]

        print(f"\nCluster {cluster_id} ({len(cluster_bin_games)} games, avg rating: {cluster_bin_games['AvgRating'].mean():.2f}):")

        # Show the three highest-rated games of the cluster
        top_games = cluster_bin_games.sort_values('AvgRating', ascending=False).head(3)
        print("Top games:")
        for _, game in top_games.iterrows():
            print(f"  {game['Name']} ({game['YearPublished']}) - Rating: {game['AvgRating']:.2f}")

# Visualize Results

In [None]:
# 1. Top Features by Rating Bin visualization
# Heatmap with one row per feature that is in the top 10 of at least one bin
# and one column per analysed rating bin.
if top_features_by_bin:
    # Order the columns from low to high rating, like the other rating-bin
    # plots; fall back to plain sorting when an unexpected label is present.
    rating_order = ['<5', '5-6', '6-7', '7-8', '8+']
    bin_labels = list(top_features_by_bin)
    if all(label in rating_order for label in bin_labels):
        bin_labels.sort(key=rating_order.index)
    else:
        bin_labels.sort()

    # Get all unique top features across rating bins
    all_top_features = set()
    for features in top_features_by_bin.values():
        all_top_features.update(features.index)

    # Features by rating bins. A cell is 0 when the feature is not among that
    # bin's top 10; it does not mean the feature never occurs in the bin.
    heatmap_df = (
        pd.DataFrame(top_features_by_bin)
        .reindex(index=sorted(all_top_features), columns=bin_labels)
        .fillna(0.0)
    )

    # Plot heatmap (figure height grows with the number of features)
    plt.figure(figsize=(12, len(all_top_features) * 0.4))
    sns.heatmap(heatmap_df, annot=True, fmt='.2f', cmap='viridis')
    plt.title('Top Features by Rating Bin')
    plt.xlabel('Rating Bin')
    plt.ylabel('Features')
    plt.tight_layout()
    # Save before show(): once show() has displayed the figure, a following
    # savefig() can write an empty image.
    plt.savefig('../plots/rating_bin_top_features.png')
    plt.show()
    plt.close()
else:
    print("Insufficient data to create feature importance plot")

In [None]:
# 2. Cluster Count by Rating Bin visualization
# Two figures: cluster count (bars) with game count (line) per rating bin,
# and the share of games DBSCAN labelled as noise per rating bin.
if rating_bin_clusters:
    # Sort by rating bin
    sorted_bins = ['<5', '5-6', '6-7', '7-8', '8+']
    cluster_count_df = pd.DataFrame(rating_bin_clusters)

    # Reorder rows according to sorted_bins if possible
    if all(bin_name in sorted_bins for bin_name in cluster_count_df['Rating Bin']):
        cluster_count_df['Sort_Order'] = cluster_count_df['Rating Bin'].map(sorted_bins.index)
        cluster_count_df = cluster_count_df.sort_values('Sort_Order').drop('Sort_Order', axis=1)

    plt.figure(figsize=(14, 6))
    ax1 = plt.gca()
    ax2 = ax1.twinx()

    # Plot cluster counts
    bars = ax1.bar(cluster_count_df['Rating Bin'], cluster_count_df['Number of Clusters'], color='teal', alpha=0.7)
    ax1.set_ylabel('Number of Distinct Clusters', color='teal')
    ax1.tick_params(axis='y', labelcolor='teal')

    # Plot game count as line
    ax2.plot(cluster_count_df['Rating Bin'], cluster_count_df['Games'], 'ro-', linewidth=2)
    ax2.set_ylabel('Number of Games', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    ax1.set_title('Number of Distinct Clusters by Rating Bin')
    # Rotate the labels on ax1 explicitly: after twinx() pyplot's current axes
    # is the twin, whose x-axis is hidden, so plt.xticks() had no visible effect.
    ax1.tick_params(axis='x', labelrotation=45)
    ax2.grid(True, axis='y', alpha=0.3)

    # Add value labels for cluster counts
    for bar in bars:
        height = bar.get_height()
        ax1.text(
            bar.get_x() + bar.get_width()/2.,
            height + 0.1,
            f'{int(height)}',
            ha='center',
            color='teal'
        )

    plt.tight_layout()
    # Save before show(): once show() has displayed the figure, a following
    # savefig() can write an empty image.
    plt.savefig('../plots/rating_bin_cluster_counts.png')
    plt.show()
    plt.close()

    # Also create noise percentage visualization
    plt.figure(figsize=(14, 6))
    bars = plt.bar(cluster_count_df['Rating Bin'], cluster_count_df['Noise Percentage'], color='lightgray')
    plt.title('Noise Percentage in Rating Bin Clustering')
    plt.xlabel('Rating Bin')
    plt.ylabel('Percentage of Games Classified as Noise')
    plt.xticks(rotation=45)
    plt.ylim(0, 100)
    plt.grid(axis='y', alpha=0.3)

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2.,
            height + 1,
            f'{height:.1f}%',
            ha='center'
        )

    plt.tight_layout()
    plt.savefig('../plots/rating_bin_noise_percentage.png')  # save first, then display
    plt.show()
    plt.close()
else:
    print("Insufficient data to create cluster count plot")

In [None]:
# 3. Feature Distribution Across Rating Bins visualization
# Heatmap of how common (in percent) each important feature is in every bin.
if feature_dist:
    feature_dist_df = pd.DataFrame(feature_dist)

    # Create a heatmap of feature prevalence by rating bin
    pivot_df = feature_dist_df.pivot(index='Feature', columns='Rating Bin', values='Prevalence')

    # Defined here so this cell does not depend on the cluster-count cell
    # having run its plotting branch first.
    sorted_bins = ['<5', '5-6', '6-7', '7-8', '8+']

    # Sort columns if possible
    if all(bin_name in sorted_bins for bin_name in pivot_df.columns):
        pivot_df = pivot_df[sorted(pivot_df.columns, key=sorted_bins.index)]

    plt.figure(figsize=(12, 10))
    sns.heatmap(pivot_df, annot=True, fmt='.1f', cmap='viridis')
    plt.title('Important Feature Prevalence (%) by Rating Bin')
    plt.tight_layout()
    # Save before show(): once show() has displayed the figure, a following
    # savefig() can write an empty image.
    plt.savefig('../plots/rating_bin_feature_distribution.png')
    plt.show()
    plt.close()
else:
    print("Insufficient data to create feature distribution plot")

In [None]:
# 4. Additional visualization: components needed by rating bin
# Bars: PCA components needed to reach 80% cumulative explained variance in
# each rating bin; line: number of games in that bin.
if rating_bin_clusters:
    # Extract PCA components information by rating bin. The PCA is fitted again
    # here, with the same 100-game cutoff and 80% threshold as the clustering cell.
    components_by_rating = []
    for rating_bin in filtered_df['Rating_Bracket'].unique():
        # Read-only subset, so no defensive copy is needed
        bin_games = filtered_df[filtered_df['Rating_Bracket'] == rating_bin]

        if len(bin_games) < 100:  # Skip bins with too few games
            continue

        # Calculate components for 80% variance
        pca_temp = PCA()
        pca_temp.fit(bin_games[all_binary_cols])
        explained_var = np.cumsum(pca_temp.explained_variance_ratio_)
        n_comp = np.argmax(explained_var >= 0.8) + 1

        components_by_rating.append({
            'Rating Bin': rating_bin,
            'Components': n_comp,
            'Games': len(bin_games)
        })

    # Create dataframe
    comp_df = pd.DataFrame(components_by_rating)

    # Defined here so this cell does not depend on the cluster-count cell
    # having run its plotting branch first.
    sorted_bins = ['<5', '5-6', '6-7', '7-8', '8+']

    # Sort by rating bin if possible
    if all(bin_name in sorted_bins for bin_name in comp_df['Rating Bin']):
        comp_df['Sort_Order'] = comp_df['Rating Bin'].map(sorted_bins.index)
        comp_df = comp_df.sort_values('Sort_Order').drop('Sort_Order', axis=1)

    # Create plot
    plt.figure(figsize=(14, 6))
    ax1 = plt.gca()
    ax2 = ax1.twinx()

    # Plot components
    bars = ax1.bar(comp_df['Rating Bin'], comp_df['Components'], color='steelblue', alpha=0.7)
    ax1.set_ylabel('PCA Components for 80% Variance', color='steelblue')
    ax1.tick_params(axis='y', labelcolor='steelblue')

    # Plot game count as line
    ax2.plot(comp_df['Rating Bin'], comp_df['Games'], 'ro-', linewidth=2)
    ax2.set_ylabel('Number of Games', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    ax1.set_title('PCA Dimensionality by Rating Bin')
    # Rotate the labels on ax1 explicitly: after twinx() pyplot's current axes
    # is the twin, whose x-axis is hidden, so plt.xticks() had no visible effect.
    ax1.tick_params(axis='x', labelrotation=45)
    ax2.grid(True, axis='y', alpha=0.3)

    # Add value labels for components
    for bar in bars:
        height = bar.get_height()
        ax1.text(
            bar.get_x() + bar.get_width()/2.,
            height + 1,
            f'{int(height)}',
            ha='center',
            color='steelblue'
        )

    plt.tight_layout()
    # Save before show(): once show() has displayed the figure, a following
    # savefig() can write an empty image.
    plt.savefig('../plots/rating_bin_component_complexity.png')
    plt.show()
    plt.close()