In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Load the player/game statistics CSV (path is relative to this notebook's folder).
dataset_path = '../../data/player_game_statistics.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows of the loaded frame. This shows every column, not just
# the engagement target (`total_time_played_minutes`, selected in a later cell).
# A bare last expression lets the notebook render the frame as a rich table
# instead of the truncated plain-text dump that print() produces.
df.head()

In [None]:
# Parse the `last_played` column into pandas datetimes and store it back on `df`.
last_played_parsed = pd.to_datetime(df['last_played'])
df['last_played'] = last_played_parsed

In [None]:
# Engagement duration (the prediction target).
# Ideally this would be last_played - first_played, but the data has no
# first_played column, so total_time_played_minutes is used as the duration.
target_column = 'total_time_played_minutes'
engagement_duration = df[target_column]

In [None]:
# Work on a copy so the raw frame `df` keeps its original (un-encoded) values.
features_df = df.copy()

# Encode categorical variables.
# Each column gets its OWN LabelEncoder, stored in `label_encoders`. Refitting a
# single shared encoder would overwrite its classes_ on every call, so only the
# last column's mapping would survive and the other columns could never be
# decoded (inverse_transform) or re-encoded consistently for new rows.
categorical_columns = ['player_level', 'gender', 'country', 'game_name']
label_encoders = {}
for column in categorical_columns:
    # `le` is rebound each iteration; after the loop it is the encoder fitted on
    # the last column ('game_name').
    le = LabelEncoder()
    features_df[column] = le.fit_transform(features_df[column])
    label_encoders[column] = le
    
# Columns fed to the model as predictors, in the order they appear in X.
selected_features = [
    'age',
    'total_games_played',
    'total_wins',
    'total_losses',
    'win_ratio',
    'total_moves',
    'highest_score',
    'rating',
    'player_level',
    'game_name',
    'country',
]

In [None]:
# Prepare features
# Take the predictors from `features_df` (the label-encoded copy), not from the
# raw `df`: selecting from `df` would silently discard the encoding step and
# pass the original, un-encoded categorical columns on to the scaler and model.
X = features_df[selected_features]
# Target: the engagement duration series defined earlier.
y = engagement_duration

In [None]:
# 3. Split the Data
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# 4. Scale the Features
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 5. Train the Model
# Random forest regressor with 100 trees and a fixed seed, fitted on the scaled
# training features.
forest_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**forest_params)
rf_model.fit(X_train_scaled, y_train)

In [None]:
# 6. Make Predictions
# Predict the target for the scaled held-out rows.
holdout_inputs = X_test_scaled
y_pred = rf_model.predict(holdout_inputs)

In [None]:
# 7. Evaluate the Model
# R² and RMSE (square root of the mean squared error) on the held-out rows.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

report_lines = [
    "Model Performance Metrics:",
    f"RMSE: {rmse:.2f} minutes",
    f"R² Score: {r2:.3f}",
]
print("\n".join(report_lines))

In [None]:
# 8. Feature Importance Analysis
# Pair each predictor name with the forest's importance score, highest first.
importance_by_feature = {
    'feature': selected_features,
    'importance': rf_model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature).sort_values(
    by='importance', ascending=False
)

In [None]:
# 9. Visualizations
# Two stacked panels on one figure: feature importances on top, predicted vs
# actual engagement underneath.
fig, (importance_ax, fit_ax) = plt.subplots(2, 1, figsize=(15, 10))

# Feature Importance Plot
sns.barplot(x='importance', y='feature', data=feature_importance, ax=importance_ax)
importance_ax.set_title('Feature Importance for Engagement Prediction')
importance_ax.set_xlabel('Importance Score')

# Predicted vs Actual Plot
fit_ax.scatter(y_test, y_pred, alpha=0.5)
# Dashed red diagonal spanning the actual-value range: points on it are perfect predictions.
actual_range = [y_test.min(), y_test.max()]
fit_ax.plot(actual_range, actual_range, 'r--', lw=2)
fit_ax.set_xlabel('Actual Engagement Duration (minutes)')
fit_ax.set_ylabel('Predicted Engagement Duration (minutes)')
fit_ax.set_title('Predicted vs Actual Engagement Duration')

fig.tight_layout()
plt.show()

In [None]:
# 10. Predictions for Example Players
def scale_and_predict(position):
    """Scale the player at integer row `position` of `X` and predict engagement.

    Parameters
    ----------
    position : int
        Positional (iloc) index of the player row in `X`.

    Returns
    -------
    tuple
        (scaled_row, prediction): the scaler output for that single row and the
        model's prediction array for it (one element).
    """
    # Double brackets keep a one-row DataFrame, so the scaler sees the same
    # column names it was fitted with. Wrapping a Series in a list would strip
    # the names, which makes scikit-learn warn and skips its column validation.
    player_frame = X.iloc[[position]]
    scaled_row = scaler.transform(player_frame)
    return scaled_row, rf_model.predict(scaled_row)


print("\nExample Predictions for Different Player Segments:")

# High-performing player: the row with the largest win_ratio.
high_position = df['win_ratio'].argmax()
high_performer = X.iloc[high_position]
high_performer_scaled, pred_high = scale_and_predict(high_position)
print(f"Predicted engagement for high performer: {pred_high[0]:.0f} minutes")

# New player: the row with the fewest total games played.
new_position = df['total_games_played'].argmin()
new_player = X.iloc[new_position]
new_player_scaled, pred_new = scale_and_predict(new_position)
print(f"Predicted engagement for new player: {pred_new[0]:.0f} minutes")