In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
import pickle

In [2]:
# Read the player statistics CSV (path is relative to the notebook's working directory).
DATA_PATH = 'player_game_statistics.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Simple feature preparation
# Encode categorical variables. Each column gets its own LabelEncoder so that
# every encoder object holds the mapping for exactly one column.
le_gender = LabelEncoder()
le_country = LabelEncoder()
df['gender_encoded'] = le_gender.fit_transform(df['gender'])
# Fixed: this line used le_gender, which refit the gender encoder on country
# values (discarding the gender mapping) and left le_country unfitted.
df['country_encoded'] = le_country.fit_transform(df['country'])

# Convert last_played to datetime and calculate days since last played.
# NOTE(review): pd.Timestamp.now() ties this feature (and the label derived
# from it below) to the moment the cell runs, so reruns on different days can
# produce different values -- consider a fixed reference date.
df['last_played'] = pd.to_datetime(df['last_played'])
df['days_since_last_play'] = (pd.Timestamp.now() - df['last_played']).dt.days

# Define churn (player is considered churned if they haven't played in 30 days
# and have a below-average win ratio).
avg_win_ratio = df['win_ratio'].mean()
df['churned'] = ((df['days_since_last_play'] > 30) & (df['win_ratio'] < avg_win_ratio)).astype(int)

# Select features.
# NOTE(review): 'win_ratio' and 'days_since_last_play' are the two columns the
# 'churned' label is computed from, so the target is a deterministic function
# of the inputs (target leakage). Scores obtained with these features measure
# how well a model recovers the labelling rule, not predictive skill.
features = [
    'total_games_played',
    'total_wins',
    'total_losses',
    'total_moves',
    'total_time_played_minutes',
    'win_ratio',
    'rating',
    'age',
    'gender_encoded',
    'country_encoded',
    'days_since_last_play'
]

# Model inputs and binary target (1 = churned, 0 = retained).
X = df[features]
y = df['churned']


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [5]:
# Standardize features: the scaler's statistics are learned from the training
# split only, then the same transform is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Candidate classifiers keyed by display name, all seeded with 42.
# Insertion order is kept as-is because later cells iterate this dict.
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    # probability=True turns on probability estimates for the SVM
    ('SVM', SVC(random_state=42, probability=True)),
])


In [7]:
# Results per model name: the fitted estimator, its test accuracy, and the
# classification report in dict form.
model_performances = {}

# Fit every candidate on the scaled training split and score it on the scaled
# test split, printing a summary as each one finishes.
for name in models:
    model = models[name]
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    model_performances[name] = dict(model=model, accuracy=accuracy, report=report)

    # One print call with a newline separator emits the same text as the
    # equivalent sequence of individual prints.
    print(
        f"\nResults for {name}:",
        f"Accuracy: {accuracy:.4f}",
        "\nClassification Report:",
        classification_report(y_test, y_pred),
        sep="\n",
    )



Training Random Forest...

Results for Random Forest:
Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       168
           1       1.00      1.00      1.00        32

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Training Logistic Regression...

Results for Logistic Regression:
Accuracy: 0.8400

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       168
           1       0.50      0.38      0.43        32

    accuracy                           0.84       200
   macro avg       0.69      0.65      0.67       200
weighted avg       0.82      0.84      0.83       200


Training XGBoost...

Results for XGBoost:
Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

   

In [8]:
# Pick the model with the highest test accuracy. max() returns the first
# maximal key it encounters, so ties go to the earliest-inserted model.
best_model_name = max(model_performances, key=lambda k: model_performances[k]['accuracy'])
best_entry = model_performances[best_model_name]
best_model = best_entry['model']

print(f"\nBest performing model: {best_model_name}")
print(f"Best Accuracy: {best_entry['accuracy']:.4f}")
print("\nDetailed Classification Report for Best Model:")
best_predictions = best_model.predict(X_test_scaled)
print(classification_report(y_test, best_predictions))



Best performing model: Random Forest
Best Accuracy: 1.0000

Detailed Classification Report for Best Model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       168
           1       1.00      1.00      1.00        32

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [9]:
# Persist the best model, the scaler, and both label encoders as pickle files
# in the current working directory, one file per object.
artifacts = [
    ('churn_model.pkl', best_model),
    ('churn_scaler.pkl', scaler),
    ('gender_encoder.pkl', le_gender),
    ('country_encoder.pkl', le_country),
]

for filename, obj in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)


In [10]:
# Report per-feature weights for the best model when it exposes them:
# feature_importances_ is preferred, otherwise the first row of coef_.
# Models exposing neither attribute print nothing.
if hasattr(best_model, 'feature_importances_'):
    header = "\nFeature importances:"
    weights = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    header = "\nFeature coefficients:"
    weights = best_model.coef_[0]
else:
    header = None
    weights = ()

if header is not None:
    print(header)
    for feat, weight in zip(features, weights):
        print(f"{feat}: {weight:.4f}")



Feature importances:
total_games_played: 0.0344
total_wins: 0.0532
total_losses: 0.0489
total_moves: 0.0371
total_time_played_minutes: 0.0420
win_ratio: 0.2443
rating: 0.0377
age: 0.0177
gender_encoded: 0.0047
country_encoded: 0.0135
days_since_last_play: 0.4665
