In [1]:
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from IPython.display import display
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR

In [2]:
# Read the player game statistics CSV into a DataFrame and list its columns
df = pd.read_csv(filepath_or_buffer='../../data/player_game_statistics.csv')
display(df.columns)

Index(['stat_id', 'player_id', 'player_name', 'age', 'gender', 'country',
       'game_id', 'game_name', 'total_games_played', 'total_wins',
       'total_losses', 'total_moves', 'total_time_played_minutes', 'win_ratio',
       'rating', 'last_played'],
      dtype='object')

In [4]:
# Show the distinct game titles present in the dataset
display(df['game_name'].unique())

array(['battleship', 'chess', 'connect four', 'tic tac toe',
       'dots and boxes'], dtype=object)

In [None]:
# Preview the first five rows of the raw frame
# (the engagement target is only derived in a later cell)
display(df.head(n=5))

In [None]:
# Engineer per-game features.
# A zero in total_games_played would turn the ratios below into inf (or NaN for 0/0),
# and scikit-learn estimators refuse to fit on non-finite values. Players with no
# recorded games therefore get 0 for every per-game ratio; values that are genuinely
# missing (NaN) are left as NaN so they are not silently hidden.
games_played = df['total_games_played']
has_games = games_played != 0
df['avg_session_duration'] = (df['total_time_played_minutes'] / games_played).where(has_games, 0.0)
df['win_rate'] = (df['total_wins'] / games_played).where(has_games, 0.0)
df['avg_moves_per_game'] = (df['total_moves'] / games_played).where(has_games, 0.0)

# Define target variable (predicted engagement time in minutes)
# NOTE(review): this target is a closed-form function of total_time_played_minutes,
# win_rate, total_games_played and age, all of which are also model features below,
# so the models are fitting a known formula rather than predicting an unseen outcome.
# NOTE(review): an age of 0 would make the target inf - confirm ages are validated upstream.
df['future_engagement'] = df['total_time_played_minutes'] * (1 + df['win_rate']) * (df['total_games_played'] / df['age'])

# Select features for model
features = ['avg_session_duration', 'win_rate', 'avg_moves_per_game',
           'total_games_played', 'age', 'total_time_played_minutes']


In [None]:
# Build the design matrix (selected feature columns) and the regression target
X, y = df[features], df['future_engagement']

### Train Engagement prediction models
-------------------------------------------------------

#### Split data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### Scale Features

In [None]:
# Standardize features: learn mean/variance on the training split only,
# then apply that same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


#### Linear Regression

In [None]:
# Linear Regression: fit on the scaled training split, predict the held-out split
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
lin_reg_pred = lin_reg.predict(X_test_scaled)

# Score the held-out predictions
lin_reg_r2 = r2_score(y_test, lin_reg_pred)
lin_reg_rmse = np.sqrt(mean_squared_error(y_test, lin_reg_pred))

print("Linear Regression Performance:")
print(f'R2 Score: {lin_reg_r2:.4f}')
print(f'RMSE: {lin_reg_rmse:.4f}')

#### Random Forest Regressor

In [None]:
# Random Forest Regressor: 100 trees, seeded for reproducibility
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)

# Score the held-out predictions
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print("\nRandom Forest Performance:")
print(f'R2 Score: {rf_r2:.4f}')
print(f'RMSE: {rf_rmse:.4f}')

#### XGBoost Regressor

In [None]:
# XGBoost Regressor with a squared-error objective, seeded for reproducibility
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)
xgb_model.fit(X_train_scaled, y_train)
xgb_pred = xgb_model.predict(X_test_scaled)

# Score the held-out predictions
xgb_r2 = r2_score(y_test, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))

print("\nXGBoost Performance:")
print(f'R2 Score: {xgb_r2:.4f}')
print(f'RMSE: {xgb_rmse:.4f}')


#### Support Vector Regressor

In [None]:
# Support Vector Regressor with an RBF kernel (all other hyper-parameters left at defaults)
svr_model = SVR(kernel='rbf').fit(X_train_scaled, y_train)
svr_pred = svr_model.predict(X_test_scaled)

# Score the held-out predictions
svr_r2 = r2_score(y_test, svr_pred)
svr_rmse = np.sqrt(mean_squared_error(y_test, svr_pred))

print("\nSVR Performance:")
print(f'R2 Score: {svr_r2:.4f}')
print(f'RMSE: {svr_rmse:.4f}')

#### Gradient Boosting Regressor

In [None]:
# Gradient Boosting Regressor with default hyper-parameters, seeded for reproducibility
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train_scaled, y_train)
gb_pred = gb_model.predict(X_test_scaled)

# Score the held-out predictions
gb_r2 = r2_score(y_test, gb_pred)
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))

print("\nGradient Boosting Performance:")
print(f'R2 Score: {gb_r2:.4f}')
print(f'RMSE: {gb_rmse:.4f}')

#### Model Comparison Plot

In [None]:
# Model Comparison Plot: one bar per model showing its held-out R² score
models = ['Linear Regression', 'Random Forest', 'XGBoost', 'SVR', 'Gradient Boosting']
# Predictions are listed in the same order as the model labels above
r2_scores = [
    r2_score(y_test, pred)
    for pred in (lin_reg_pred, rf_pred, xgb_pred, svr_pred, gb_pred)
]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(models, r2_scores, color=['blue', 'green', 'red', 'purple', 'orange'])
ax.set_xlabel('Model')
ax.set_ylabel('R² Score')
ax.set_title('Model Comparison')
# Tilt the model names so the longer labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


#### Feature Importance Plot (using Random Forest)

In [None]:
# Feature Importance Plot (using Random Forest)
# Pair each feature name with the forest's importance score, largest first
feature_importance = (
    pd.DataFrame({'feature': features, 'importance': rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()


#### Feature Importance Plot (using Linear Regression)

In [None]:
# Feature Importance Plot (using Linear Regression)
# LinearRegression has no feature_importances_ attribute (reading it raises
# AttributeError). Because the model was fit on standardized features, the
# absolute value of each coefficient is used as a comparable importance measure.
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': np.abs(lin_reg.coef_)
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

#### Feature Importance Plot (XGBoost)

In [None]:
# Feature Importance Plot (XGBoost)
# Rank the features by the importance scores reported by the fitted booster
xgb_scores = xgb_model.feature_importances_
feature_importance = pd.DataFrame(
    {'feature': features, 'importance': xgb_scores}
).sort_values('importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, y='feature', x='importance', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

#### Feature Importance Plot (Support Vector Regression)

In [None]:
# Feature Importance Plot (Support Vector Regression)
# SVR has no feature_importances_ attribute (reading it raises AttributeError),
# and coef_ is only available for a linear kernel, not the RBF kernel used here.
# Permutation importance is model-agnostic: it measures how much the held-out
# score drops when a single feature column is shuffled. Values can be negative
# when shuffling a feature happens to improve the score.
svr_importance = permutation_importance(
    svr_model, X_test_scaled, y_test, n_repeats=10, random_state=42
)
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': svr_importance.importances_mean
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()


#### Comparison of all models

In [None]:
# Print a formatted comparison of all models
# Maps each model's display name to (held-out predictions, short qualitative note)
models_comparison = {
    'Linear Regression': (lin_reg_pred, 'Simple, interpretable but may underfit'),
    'Random Forest': (rf_pred, 'Good with non-linear relationships, handles outliers well'),
    'XGBoost': (xgb_pred, 'Usually high performance, may need tuning'),
    'SVR': (svr_pred, 'Good for non-linear data, slower on large datasets'),
    'Gradient Boosting': (gb_pred, 'Often best performance, may overfit')
}

print("\nModel Comparison Summary:")
print("-" * 60)
print(f"{'Model':<20} {'R² Score':<12} {'RMSE':<12} {'Best For'}")
print("-" * 60)

# R² is unbounded below, so start from -inf rather than 0; otherwise no model
# would be selected (best_model stays None) when every model scores a negative R².
best_r2 = float('-inf')
best_rmse = float('inf')
best_model = None

for name, (predictions, description) in models_comparison.items():
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    
    print(f"{name:<20} {r2:>10.4f}  {rmse:>10.2f}  {description}")
    
    # Track best performing model (highest R²); best_rmse is that model's RMSE,
    # not necessarily the lowest RMSE seen
    if r2 > best_r2:
        best_r2 = r2
        best_rmse = rmse
        best_model = name

print("-" * 60)
print(f"\nBest Performing Model: {best_model}")
print(f"Best R² Score: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.2f}")

#### Save the best model

In [None]:
# Persisting the model is currently disabled: everything below is commented out.
# NOTE(review): the block hard-codes rf_model ("assuming Random Forest performs best")
# instead of using the model named by best_model in the comparison cell - confirm
# which model should be saved before re-enabling.
# The scaler is pickled next to the model because every model in this notebook was
# fit on StandardScaler-transformed features, so new inputs need the same transform.
# # Save the best model (assuming Random Forest performs best)
# with open('engagement_model.pkl', 'wb') as f:
#     pickle.dump(rf_model, f)
# with open('engagement_scaler.pkl', 'wb') as f:
#     pickle.dump(scaler, f)