# 3. Random Forest Regression Analysis
Identifying key features that predict board game ratings.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Display configuration: never truncate DataFrame columns and allow wide output.
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Plot styling. The order of these three calls is kept deliberately (seaborn
# theme, then default figure size, then the matplotlib style sheet) because
# later calls may override settings made by earlier ones.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})
plt.style.use('seaborn-v0_8-whitegrid')

# Create the output directories used by this notebook (plots and data frames)
# if they do not exist yet.
import os

# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
for output_dir in ('../plots', '../frames'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read the processed inputs: the filtered games table and the array of binary
# feature column names that is used to select model inputs from that table.
filtered_df = pd.read_csv(filepath_or_buffer='../frames/filtered_games.csv')

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load .npy files produced by a trusted step of this project.
all_binary_cols = np.load(file='../frames/all_binary_cols.npy', allow_pickle=True)

# Quick sanity report of what was loaded.
print(f"Loaded {len(filtered_df)} games with {len(all_binary_cols)} binary features")

# Train RandomForestRegressor Model

In [None]:
# Regression target is the average rating; inputs are the binary feature columns.
y = filtered_df['AvgRating']
X = filtered_df[all_binary_cols]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split

# Fit a 100-tree random forest on the training portion (fit() returns the
# estimator itself, so construction and fitting can be chained).
print("Training Random Forest Regressor...")
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = rf.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE: {mse:.4f}")
print(f"R² score: {r2:.4f}")

# Analyze Feature Importance

In [None]:
# Pair every feature column with the fitted forest's importance score and
# rank the features from most to least important.
feature_importance = pd.DataFrame({
    'Feature': all_binary_cols,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

# The 20 highest-ranked features are both printed and plotted, so take them once.
# NOTE(review): importance scores rank how much a feature contributes to the
# prediction; they presumably do not say whether it raises or lowers the
# rating, so the "high/higher ratings" wording below may overstate — confirm.
top_features = feature_importance.head(20)

print("\nTop 20 features that predict high ratings:")
print(top_features)

# Visualize feature importance
plt.figure(figsize=(12, 10))
sns.barplot(x='Importance', y='Feature', data=top_features)
plt.title('Top 20 Features That Predict Higher Board Game Ratings')
plt.tight_layout()
# Save BEFORE showing: on many backends plt.show() releases the current
# figure, so a savefig() issued afterwards writes an empty image.
plt.savefig('../plots/feature_importance.png')
plt.show()
plt.close()

In [None]:
# For each of the 10 most important features, report and plot how often the
# feature is present in each rating bracket.
print("\nAnalyzing top features across rating brackets:")
top_regression_features = feature_importance.head(10)['Feature'].tolist()

# Bracket sizes do not depend on the feature, so compute them once up front
# instead of re-grouping the whole frame on every loop iteration.
total_in_bracket = filtered_df.groupby('Rating_Bracket').size()

for feature in top_regression_features:
    print(f"\nFeature: {feature}")
    feature_by_rating = filtered_df.groupby('Rating_Bracket')[feature].mean()
    print(feature_by_rating)

    # Count of games with this feature in each bracket
    games_with_feature = filtered_df[filtered_df[feature] == 1].groupby('Rating_Bracket').size()
    # Brackets with no matching games are missing from games_with_feature;
    # the index-aligned division yields NaN there, which fillna(0) turns into 0%.
    percentage = (games_with_feature / total_in_bracket * 100).fillna(0)
    print("Percentage of games with this feature:")
    print(percentage)

    # Plot feature presence by rating bracket
    plt.figure(figsize=(10, 6))
    percentage.plot(kind='bar')
    plt.title(f'Presence of {feature} by Rating Bracket')
    plt.ylabel('Percentage of Games')
    plt.xlabel('Rating Bracket')
    plt.ylim(0, 100)
    plt.grid(True, axis='y')
    # Save BEFORE showing: on many backends plt.show() releases the current
    # figure, so a savefig() issued afterwards writes an empty image.
    # "/" and " " in the feature name are replaced to keep the file name valid.
    plt.savefig(f'../plots/feature_{feature.replace("/", "_").replace(" ", "_")}_by_rating.png')
    plt.show()
    plt.close()

In [None]:
# Persist the ranked feature-importance table (without the row index) so that
# other notebooks can load it.
feature_importance.to_csv(
    path_or_buf='../frames/feature_importance.csv',
    index=False,
)