# 6. Temporal Clustering Analysis
Analyzing how board game clusters evolved over time.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Set display options and styling
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('seaborn-v0_8-whitegrid')

# Make sure the output directories for plots and data frames exist.
# exist_ok=True creates each directory only when it is missing, without a
# separate existence check that could race with another process; it also
# raises if the path exists but is not a directory, instead of silently
# carrying on.
import os
for output_dir in ('../plots', '../frames'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load processed data: the filtered game table (CSV) and the array that is
# later used to select the binary feature columns of that table.
filtered_df = pd.read_csv('../frames/filtered_games.csv')
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, and
# unpickling can execute arbitrary code. Only load this file if it was
# produced by a trusted step of this project.
all_binary_cols = np.load('../frames/all_binary_cols.npy', allow_pickle=True)

# Report how many rows and how many binary feature names were loaded
print(f"Loaded {len(filtered_df)} games with {len(all_binary_cols)} binary features")

# Analyze AvgRating Trend Over the Decades

In [None]:
# Keep only games whose YearPublished lies in the 1900-2025 window
# (both ends inclusive); work on a copy so adding a column below does not
# touch filtered_df.
realistic_years_df = filtered_df[
    (filtered_df['YearPublished'] >= 1900) &
    (filtered_df['YearPublished'] <= 2025)
].copy()

# Floor each year to the start of its decade (e.g. 1987 -> 1980)
realistic_years_df['Decade'] = (realistic_years_df['YearPublished'] // 10) * 10

# Per-decade summary: rating mean/std/count and user-rating mean/count
decade_stats = realistic_years_df.groupby('Decade').agg({
    'AvgRating': ['mean', 'std', 'count'],
    'NumUserRatings': ['mean', 'count']
})
print("\nRating trends by decade (realistic years only):")
print(decade_stats)

# Plot rating trend over time
plt.figure(figsize=(12, 6))
decade_means = realistic_years_df.groupby('Decade')['AvgRating'].mean()
decade_means.plot(kind='line', marker='o')
plt.title('Average Board Game Rating by Decade')
plt.xlabel('Decade')
plt.ylabel('Average Rating')
plt.grid(True)
# Save BEFORE show(): once show() has displayed the figure (notebook inline
# or an interactive window), the current figure is gone and a later
# savefig() would write an empty image.
plt.savefig('../plots/rating_by_decade.png')
plt.show()
plt.close()

# Cluster Each Decade and Collect Results

In [None]:
# Accumulators filled by the loop below: one record per retained cluster and
# one noise summary per clustered decade
decade_clusters = []
noise_data = []

# Run a separate PCA + DBSCAN pass over the games of each decade
print("\nClustering games by decade...")
for decade in sorted(realistic_years_df['Decade'].unique()):
    in_decade = realistic_years_df['Decade'] == decade
    decade_games = realistic_years_df[in_decade].copy()

    # Decades with fewer than 100 games are reported and left out
    if len(decade_games) < 100:
        print(f"Skipping {decade}s: only {len(decade_games)} games")
        continue

    print(f"\nAnalyzing {decade}s games ({len(decade_games)} games)")

    # Binary feature matrix shared by both PCA fits
    features = decade_games[all_binary_cols]

    # A full PCA is fitted first, only to count how many leading components
    # are needed for the cumulative explained variance to reach 80%
    explained_variance = PCA().fit(features).explained_variance_ratio_.cumsum()
    n_components_decade = np.argmax(explained_variance >= 0.8) + 1
    print(f"Using {n_components_decade} components (80% variance) for {decade}s")

    # Second PCA, restricted to that many components, gives the reduced data
    pca_decade = PCA(n_components=n_components_decade)
    pca_result_decade = pca_decade.fit_transform(features)

    # k-distance curve: column 9 holds the distance to the 10th returned
    # neighbour of every point; sort those distances ascending
    knn = NearestNeighbors(n_neighbors=10).fit(pca_result_decade)
    k_distances = np.sort(knn.kneighbors(pca_result_decade)[0][:, 9])

    # Simple heuristic for eps: take the k-distance one tenth of the way
    # through the sorted curve, and never go below 0.1
    elbow_index = max(1, len(k_distances) // 10)
    eps_decade = max(0.1, k_distances[elbow_index])
    print(f"Using epsilon = {eps_decade:.2f}")

    # Density-based clustering on the reduced data; label -1 marks noise
    clusters_decade = DBSCAN(eps=eps_decade, min_samples=10).fit_predict(pca_result_decade)
    decade_games['Decade_Cluster'] = clusters_decade

    # Separate the real cluster labels from the noise label and count both
    real_cluster_ids = sorted(set(clusters_decade) - {-1})
    n_clusters = len(real_cluster_ids)
    n_noise = int(np.count_nonzero(clusters_decade == -1))
    noise_pct = (n_noise / len(clusters_decade)) * 100
    print(f"Found {n_clusters} clusters and {n_noise} noise points ({noise_pct:.1f}%)")

    # Record the noise share of this decade for the noise percentage plot
    noise_data.append({
        'Decade': f"{decade}s",
        'Total Games': len(decade_games),
        'Noise Percentage': noise_pct,
        'Clustered Percentage': 100 - noise_pct
    })

    # Nothing more to report when every game was labelled as noise
    if n_clusters == 0:
        continue

    # Mean rating and size of every label (the noise label included)
    cluster_stats = decade_games.groupby('Decade_Cluster').agg({
        'AvgRating': ['mean', 'count'],
    })
    print("Cluster statistics:")
    print(cluster_stats)

    # Keep clusters with at least 10 games for the rating comparison plots
    for cluster_id in real_cluster_ids:
        members = decade_games[decade_games['Decade_Cluster'] == cluster_id]
        if len(members) >= 10:
            decade_clusters.append({
                'Decade': f"{decade}s",
                'Cluster': f"{decade}s Cluster {cluster_id}",
                'AvgRating': members['AvgRating'].mean(),
                'Size': len(members)
            })

    # Print a short summary and the three best-rated games of each cluster
    for i in real_cluster_ids:
        cluster_decade_games = decade_games[decade_games['Decade_Cluster'] == i]

        print(f"\nCluster {i} ({len(cluster_decade_games)} games, avg rating: {cluster_decade_games['AvgRating'].mean():.2f}):")

        by_rating = cluster_decade_games.sort_values('AvgRating', ascending=False)
        top_games = by_rating.head(3)
        print("Top games:")
        for _, game in top_games.iterrows():
            print(f"  {game['Name']} ({game['YearPublished']}) - Rating: {game['AvgRating']:.2f}")

# Visualize Results

In [None]:
# Cluster rating plots: drawn only when the clustering step collected at
# least one cluster record in decade_clusters
if decade_clusters:
    decade_cluster_df = pd.DataFrame(decade_clusters)

    # --- Bar chart: average rating of every cluster, grouped by decade ---
    plt.figure(figsize=(16, 8))
    sns.barplot(x='Decade', y='AvgRating', hue='Cluster', data=decade_cluster_df)
    plt.title('Average Rating by Decade and Cluster')
    plt.xticks(rotation=45)
    plt.ylim(5, 8)  # Adjust based on your data
    plt.grid(axis='y', alpha=0.3)
    plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    # Save BEFORE show(): once show() has displayed the figure, the current
    # figure is gone and a later savefig() would write an empty image.
    plt.savefig('../plots/decade_cluster_ratings.png')
    plt.show()
    plt.close()

    # --- Scatter plot: cluster size against average rating ---
    plt.figure(figsize=(16, 8))

    # One tab10 colour per decade (colours repeat after ten decades)
    decades = decade_cluster_df['Decade'].unique()
    color_map = {dec: plt.cm.tab10(i % 10) for i, dec in enumerate(decades)}

    # One scatter call per decade so that each decade gets a legend entry
    for dec in decades:
        dec_data = decade_cluster_df[decade_cluster_df['Decade'] == dec]
        plt.scatter(
            x=dec_data['Size'],
            y=dec_data['AvgRating'],
            color=color_map[dec],
            s=100,
            alpha=0.7,
            label=dec
        )

    # Label every point with its cluster id, i.e. the last word of the
    # 'Cluster' value
    for _, row in decade_cluster_df.iterrows():
        plt.annotate(
            f"Cluster {row['Cluster'].split()[-1]}",
            (row['Size'], row['AvgRating']),
            xytext=(5, 0),
            textcoords='offset points',
            fontsize=8
        )

    plt.title('Cluster Size vs. Average Rating by Decade')
    plt.xlabel('Number of Games in Cluster')
    plt.ylabel('Average Rating')
    plt.grid(True, alpha=0.3)
    plt.legend(title='Decade')
    plt.tight_layout()
    # Same ordering as above: write the file first, then display
    plt.savefig('../plots/decade_cluster_size_vs_rating.png')
    plt.show()
    plt.close()

# Noise percentage plot: drawn only when at least one decade was clustered
# and therefore has a record in noise_data
if noise_data:
    noise_df = pd.DataFrame(noise_data)

    plt.figure(figsize=(14, 6))
    bars = plt.bar(noise_df['Decade'], noise_df['Noise Percentage'], color='lightgray')
    plt.title('Noise Percentage in Decade Clustering')
    plt.xlabel('Decade')
    plt.ylabel('Percentage of Games Classified as Noise')
    plt.xticks(rotation=45)
    plt.ylim(0, 100)
    plt.grid(axis='y', alpha=0.3)

    # Write each bar's value just above it, centred horizontally
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2.,
            height + 1,
            f'{height:.1f}%',
            ha='center'
        )

    plt.tight_layout()
    # Save BEFORE show(): once show() has displayed the figure, the current
    # figure is gone and a later savefig() would write an empty image.
    plt.savefig('../plots/decade_noise_percentage.png')
    plt.show()
    plt.close()

# Additional visualization: PCA components needed per decade, next to the
# number of games in that decade. Drawn only when at least one decade was
# clustered (noise_data is non-empty).
if noise_data:
    # Recompute, for every decade with at least 100 games, how many leading
    # PCA components are needed to reach 80% cumulative explained variance
    components_by_decade = []
    for decade in sorted(realistic_years_df['Decade'].unique()):
        decade_games = realistic_years_df[realistic_years_df['Decade'] == decade].copy()

        if len(decade_games) < 100:  # Skip decades with too few games
            continue

        # Calculate components for 80% variance
        pca_temp = PCA()
        pca_temp.fit(decade_games[all_binary_cols])
        explained_var = np.cumsum(pca_temp.explained_variance_ratio_)
        n_comp = np.argmax(explained_var >= 0.8) + 1

        components_by_decade.append({
            'Decade': f"{decade}s",
            'Components': n_comp,
            'Games': len(decade_games)
        })

    # Create dataframe and plot
    comp_df = pd.DataFrame(components_by_decade)

    plt.figure(figsize=(14, 6))
    ax1 = plt.gca()
    ax2 = ax1.twinx()  # second y-axis sharing the x-axis of ax1

    # Bars on the left axis: number of components
    bars = ax1.bar(comp_df['Decade'], comp_df['Components'], color='steelblue', alpha=0.7)
    ax1.set_ylabel('PCA Components for 80% Variance', color='steelblue')
    ax1.tick_params(axis='y', labelcolor='steelblue')

    # Line on the right axis: number of games
    ax2.plot(comp_df['Decade'], comp_df['Games'], 'ro-', linewidth=2)
    ax2.set_ylabel('Number of Games', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    plt.title('PCA Dimensionality and Game Count by Decade')
    # Rotate the x tick labels on ax1 explicitly. After twinx() the current
    # axes is the twin, whose x-axis is hidden, so plt.xticks(rotation=45)
    # would not rotate the labels that are actually drawn.
    ax1.tick_params(axis='x', labelrotation=45)
    plt.grid(True, axis='y', alpha=0.3)

    # Write each bar's component count just above it, centred horizontally
    for bar in bars:
        height = bar.get_height()
        ax1.text(
            bar.get_x() + bar.get_width() / 2.,
            height + 1,
            f'{int(height)}',
            ha='center',
            color='steelblue'
        )

    plt.tight_layout()
    # Save BEFORE show(): once show() has displayed the figure, the current
    # figure is gone and a later savefig() would write an empty image.
    plt.savefig('../plots/decade_component_complexity.png')
    plt.show()
    plt.close()