# 8. Market Segment Analysis for High Ratings
Deep dive into clusters of highly-rated board games.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Set display options and styling
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('seaborn-v0_8-whitegrid')

# Create the output directories for plots and intermediate frames.
# exist_ok=True makes the call a no-op when the directory is already
# present, which replaces the race-prone "check, then create" pattern.
import os
for output_dir in ('../plots', '../frames'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read the processed game table and the saved array of binary feature names
# from the shared frames directory.
# NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
# can execute arbitrary code -- only load .npy files this project produced.
filtered_df = pd.read_csv('../frames/filtered_games.csv')
all_binary_cols = np.load('../frames/all_binary_cols.npy', allow_pickle=True)

n_games_loaded, n_binary_features = len(filtered_df), len(all_binary_cols)
print(f"Loaded {n_games_loaded} games with {n_binary_features} binary features")

In [None]:
# Read the Random Forest feature-importance table from the frames directory
feature_importance_path = '../frames/feature_importance.csv'
feature_importance = pd.read_csv(feature_importance_path)

print("Loaded Random Forest feature importance results")

# Find optimal eps through k-distance plot

In [None]:
# Select high-rated games (average rating of 7.5 or more). The copy keeps
# later column assignments from touching filtered_df.
high_rated = filtered_df[filtered_df['AvgRating'] >= 7.5].copy()
print(f"Found {len(high_rated)} highly-rated games (7.5+)")

# Apply PCA with variance-based component selection: fit once with every
# component, then keep the smallest number of components whose cumulative
# explained variance reaches 80%.
pca_high = PCA()
pca_high.fit(high_rated[all_binary_cols])
explained_variance = np.cumsum(pca_high.explained_variance_ratio_)
# argmax on a boolean array returns the index of the first True
n_components_high = np.argmax(explained_variance >= 0.8) + 1
print(f"Using {n_components_high} components (80% variance) for high-rated games")

pca_high = PCA(n_components=n_components_high)
pca_result_high = pca_high.fit_transform(high_rated[all_binary_cols])

# Find appropriate epsilon from the k-distance curve (k = 10).
# NOTE(review): the DBSCAN cell uses min_samples=15 while k is 10 here; the
# k-distance heuristic normally uses the same value -- confirm intent.
neighbors = NearestNeighbors(n_neighbors=10)
neighbors_fit = neighbors.fit(pca_result_high)
distances, _ = neighbors_fit.kneighbors(pca_result_high)
# Last column holds the distance to the farthest of the 10 returned neighbours
sorted_distances = np.sort(distances[:, 9])

# Choose epsilon with a simple heuristic: the k-distance found one tenth of
# the way along the sorted curve, never smaller than 0.1.
elbow_index = max(1, len(sorted_distances) // 10)
eps_high = max(0.1, sorted_distances[elbow_index])
print(f"Using epsilon = {eps_high:.2f}")

# Plot k-distance graph, marking the epsilon that will actually be used
# (previously a fixed line at 0.7 that ignored the computed value).
plt.figure(figsize=(10, 6))
plt.plot(sorted_distances)
plt.axhline(y=eps_high, color='r', linestyle='--',
            label=f'Chosen epsilon = {eps_high:.2f}')
plt.title('K-Distance Graph for High-Rated Games')
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to 10th nearest neighbor')
plt.grid(True)
plt.legend()
# Save before showing: once show() has displayed the figure, a following
# savefig() can write an empty canvas.
plt.savefig('../plots/k_distance_high_rated.png')
plt.show()
plt.close()

# Run DBSCAN With Chosen Hyperparams

In [None]:
# Apply DBSCAN with the epsilon computed in the k-distance cell.
# To override it manually after inspecting the k-distance plot, assign a
# new value to eps_high here before the clustering call.
min_samples_high = 15
print(f"Applying DBSCAN with eps={eps_high}, min_samples={min_samples_high}")

dbscan_high = DBSCAN(eps=eps_high, min_samples=min_samples_high)
high_clusters = dbscan_high.fit_predict(pca_result_high)

# Add clusters to dataframe
high_rated['Cluster'] = high_clusters

# Analyze clusters. DBSCAN labels noise points -1, so they are counted
# separately and do not count as a cluster.
mask = high_clusters != -1  # True for points assigned to a real cluster
n_noise = int((~mask).sum())
n_clusters = len(set(high_clusters)) - (1 if n_noise else 0)
print(f"DBSCAN found {n_clusters} clusters and {n_noise} noise points ({n_noise/len(high_clusters)*100:.1f}%)")

# The silhouette score needs at least two clusters; noise points are
# excluded from both the coordinates and the labels.
if n_clusters > 1:
    silhouette_avg = silhouette_score(pca_result_high[mask], high_clusters[mask])
    print(f"Silhouette score (excluding noise): {silhouette_avg:.4f}")

if n_clusters > 0:
    # Visualize clusters in 2D PCA space (PC1 vs PC2)
    plt.figure(figsize=(12, 10))
    scatter = plt.scatter(
        pca_result_high[:, 0],
        pca_result_high[:, 1],
        c=high_clusters,
        cmap='viridis',
        alpha=0.6,
        s=30,
    )
    plt.colorbar(scatter, label='Cluster')
    plt.title('High-Rated Board Game Clusters in PCA Space')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.grid(True)
    # Save before showing: once show() has displayed the figure, a
    # following savefig() can write an empty canvas.
    plt.savefig('../plots/high_rated_clusters.png')
    plt.show()
    plt.close()

# Defining Features For Each Cluster

In [None]:
if n_clusters > 0:
    # For every real cluster, rank the binary features by how much their
    # prevalence inside the cluster differs from the remaining high-rated
    # games, then list the ten largest gaps and three example games.
    print("\nDefining features for each high-rated cluster:")
    real_labels = [label for label in sorted(set(high_clusters)) if label != -1]
    for label in real_labels:
        in_cluster = high_rated['Cluster'] == label
        members = high_rated[in_cluster]
        rest = high_rated[~in_cluster]

        # (feature, share inside the cluster, share outside the cluster)
        prevalence = [
            (col, members[col].mean(), rest[col].mean())
            for col in all_binary_cols
        ]
        # Largest absolute prevalence gap first
        ranked = sorted(prevalence, key=lambda p: abs(p[1] - p[2]), reverse=True)

        mean_rating = members['AvgRating'].mean()
        print(f"\nCluster {label} ({len(members)} games, avg rating: {mean_rating:.2f}):")
        print("Distinctive features:")
        for feat, c_mean, o_mean in ranked[:10]:
            diff = c_mean - o_mean
            print(f"  {feat}: {diff:.4f} ({c_mean*100:.1f}% vs {o_mean*100:.1f}%)")

        # Show the three best-rated games of the cluster as examples
        best = members.sort_values('AvgRating', ascending=False).head(3)
        print("Example games:")
        for title, year, rating in zip(best['Name'], best['YearPublished'], best['AvgRating']):
            print(f"  {title} ({year}) - Rating: {rating:.2f}")

# Analyze Success Factors in Segments

In [None]:
# Keep only games assigned to a real cluster (noise carries label -1)
cluster_column = 'Cluster'
clustered_games = high_rated[high_rated[cluster_column] >= 0]

# groupby yields the clusters in ascending label order
for cluster_id, cluster_games in clustered_games.groupby(cluster_column):
    print(f"Cluster {cluster_id} ({len(cluster_games)} games):")

    available = high_rated.columns

    # Complexity and playing time are reported only when the columns exist
    if 'AvgComplexity' in available:
        mean_complexity = cluster_games['AvgComplexity'].mean()
        print(f"  Average Complexity: {mean_complexity:.2f}")

    if 'PlayingTime' in available:
        mean_playtime = cluster_games['PlayingTime'].mean()
        print(f"  Average Playing Time: {mean_playtime:.2f} minutes")

    # Mean minimum and maximum player counts, when both columns exist
    if {'MinPlayers', 'MaxPlayers'}.issubset(available):
        low = cluster_games['MinPlayers'].mean()
        high = cluster_games['MaxPlayers'].mean()
        print(f"  Player Count: {low:.1f}-{high:.1f} players")

    # Rating statistics
    ratings = cluster_games['AvgRating']
    print(f"  Average Rating: {ratings.mean():.2f}")
    print(f"  Rating Range: {ratings.min():.2f}-{ratings.max():.2f}")

    # Mean number of user ratings, used here as a popularity metric
    if 'NumUserRatings' in available:
        mean_votes = cluster_games['NumUserRatings'].mean()
        print(f"  Average User Ratings: {mean_votes:.1f}")

    # Three best-rated games of the cluster
    best = cluster_games.sort_values('AvgRating', ascending=False).head(3)
    print("  Top rated games:")
    for title, year, rating in zip(best['Name'], best['YearPublished'], best['AvgRating']):
        print(f"    {title} ({year}) - {rating:.2f}")

# Analyze Trends Within Each Segment

In [None]:
# Filter out noise points (-1). An explicit copy is taken so that adding
# the YearBin column below writes to an independent frame rather than to a
# slice of high_rated (avoids pandas' SettingWithCopyWarning).
clustered_games = high_rated[high_rated[cluster_column] >= 0].copy()

# Create year bins for analysis. Bins are left-closed ([start, end)) so the
# first year of each period falls under its own label (1990 -> '1990s',
# 2020 -> '2020+'), and the last bin is open-ended to match '2020+'.
# Years before 1900 get no bin (NaN) and are left out of the per-bin table.
clustered_games['YearBin'] = pd.cut(
    clustered_games['YearPublished'],
    bins=[1900, 1990, 2000, 2010, 2015, 2020, np.inf],
    right=False,
    labels=['Pre-1990', '1990s', '2000s', '2010-2015', '2015-2020', '2020+'],
)

for cluster_id in sorted(clustered_games[cluster_column].unique()):
    cluster_games = clustered_games[clustered_games[cluster_column] == cluster_id]

    print(f"Cluster {cluster_id} ({len(cluster_games)} games):")

    # Pearson correlation between publication year and rating
    year_rating_corr = cluster_games['YearPublished'].corr(cluster_games['AvgRating'])
    print(f"  Year-Rating Correlation: {year_rating_corr:.3f}")

    # Average rating by year range. observed=False is stated explicitly so
    # every label is returned (empty ones with count 0) regardless of the
    # pandas version's default; empty bins are skipped when printing.
    year_ratings = cluster_games.groupby('YearBin', observed=False)['AvgRating'].agg(['mean', 'count'])
    print("  Ratings by year range:")
    for year_bin, row in year_ratings.iterrows():
        if row['count'] > 0:  # Only show non-empty years
            print(f"    {year_bin}: {row['mean']:.2f} ({int(row['count'])} games)")

    # Plot trend if there are enough games
    if len(cluster_games) >= 10:
        plt.figure(figsize=(10, 6))
        sns.regplot(x='YearPublished', y='AvgRating', data=cluster_games,
                    scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
        plt.title(f'Rating Trend Over Time for Cluster {cluster_id}')
        plt.xlabel('Year Published')
        plt.ylabel('Average Rating')
        plt.grid(True, alpha=0.3)
        # Save before showing: once show() has displayed the figure, a
        # following savefig() can write an empty canvas.
        plt.savefig(f'../plots/cluster_{cluster_id}_trend.png')
        plt.show()
        plt.close()

# Comparison with Regression Features

In [None]:
# Features to compare against: the first six rows of the feature-importance
# table plus two hand-picked negative predictors
top_positive_features = feature_importance['Feature'].head(6).tolist()
top_negative_features = ['Roll / Spin and Move', 'Card Game']

# Keep only games assigned to a real cluster (noise carries label -1)
clustered_games = high_rated[high_rated[cluster_column] >= 0]

# Baseline: mean of each feature over all high-rated games (noise included)
overall_presence = {
    feature: high_rated[feature].mean()
    for feature in top_positive_features + top_negative_features
    if feature in high_rated.columns
}

# The positive and negative sections share the same report layout, so both
# are driven from one table of (heading, feature list) pairs.
predictor_groups = (
    ("  Positive predictors of high ratings:", top_positive_features),
    ("  Negative predictors of high ratings:", top_negative_features),
)

for cluster_id in sorted(clustered_games[cluster_column].unique()):
    cluster_games = clustered_games[clustered_games[cluster_column] == cluster_id]

    print(f"Cluster {cluster_id} ({len(cluster_games)} games):")

    for heading, features in predictor_groups:
        print(heading)
        for feature in features:
            if feature not in high_rated.columns:
                continue  # feature is not a column of the game table
            presence = cluster_games[feature].mean() * 100
            overall = overall_presence[feature] * 100
            diff = presence - overall
            print(f"    {feature}: {presence:.1f}% (overall: {overall:.1f}%, diff: {diff:+.1f}%)")

# Find Potential Outliers in Segments

In [None]:
# Filter out noise points (-1)
clustered_games = high_rated[high_rated[cluster_column] >= 0]

for cluster_id in sorted(clustered_games[cluster_column].unique()):
    cluster_games = clustered_games[clustered_games[cluster_column] == cluster_id]
    # Comparison group: every other high-rated game, noise points included.
    # Built once per cluster instead of once per feature column.
    other_games = high_rated[high_rated[cluster_column] != cluster_id]

    print(f"Cluster {cluster_id} ({len(cluster_games)} games):")

    # Find distinctive features for this cluster: those whose prevalence
    # differs from the comparison group by more than 0.3 in absolute terms.
    # Each entry is (feature, difference, prevalence inside the cluster).
    distinctive_features = []
    for col in all_binary_cols:
        cluster_mean = cluster_games[col].mean()
        diff = cluster_mean - other_games[col].mean()
        if abs(diff) > 0.3:
            distinctive_features.append((col, diff, cluster_mean))

    # Sort by absolute difference, largest first
    distinctive_features.sort(key=lambda x: abs(x[1]), reverse=True)

    # Positive features: over-represented AND present in more than 80% of
    # the cluster. Negative features: under-represented in the cluster.
    pos_features = [f for f, d, m in distinctive_features if d > 0 and m > 0.8]
    neg_features = [f for f, d, _m in distinctive_features if d < 0]

    # Only the three strongest features of each kind are checked per game
    expected_features = pos_features[:3]
    unexpected_features = neg_features[:3]

    # Find games that don't fit the cluster pattern
    outliers = []
    for _, game in cluster_games.iterrows():
        reasons = []

        # Missing a feature that almost every game in the cluster has
        for feature in expected_features:
            if game[feature] == 0:
                reasons.append(f"Missing {feature}")

        # Has a feature that the cluster typically lacks
        for feature in unexpected_features:
            if game[feature] == 1:
                reasons.append(f"Has unexpected {feature}")

        if reasons:
            outliers.append((game['Name'], game['YearPublished'], game['AvgRating'], reasons))

    # Sort by rating (highest first) and show top outliers
    outliers.sort(key=lambda x: x[2], reverse=True)
    if outliers:
        print("  Potential innovation outliers:")
        for name, year, rating, reasons in outliers[:3]:
            print(f"    {name} ({year}) - Rating: {rating:.2f}")
            print(f"      Reasons: {', '.join(reasons)}")
    else:
        print("  No significant outliers found in this cluster")