In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

import warnings

# Suppress UserWarning messages that originate from the xgboost module so they
# do not clutter cell output; all other warnings are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")

In [2]:
# Read the player game statistics CSV into a DataFrame.
# The path is relative to the notebook's working directory.
DATA_PATH = '../../data/player_game_statistics.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Inspect the target column
TARGET_COLUMN = 'player_level'

# Fail fast with an actionable message: every later cell depends on this
# column, and a bare KeyError does not say which columns the CSV really has.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in the dataset; "
        f"available columns: {list(df.columns)}"
    )

# Class distribution of the target (useful for spotting class imbalance)
print(df[TARGET_COLUMN].value_counts())

KeyError: 'player_level'

In [None]:
# Handle missing values
TARGET_COLUMN = 'player_level'

# Rows without a label cannot be used for supervised training, and
# mean-imputing a class label would invent classes that do not exist,
# so unlabeled rows are dropped instead of imputed.
df = df.dropna(subset=[TARGET_COLUMN]).copy()

# Mean-impute every numeric column except the target.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split below, so test-set statistics leak into training;
# consider fitting it on the training split only.
imputer = SimpleImputer(strategy='mean')
numerical_columns = (
    df.select_dtypes(include=np.number).columns.drop(TARGET_COLUMN, errors='ignore')
)
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
# player_id / game_id are kept here; they are removed when the feature
# matrix is built.

# Encode categorical columns (one-hot; first level dropped to avoid redundancy)
categorical_columns = ['country', 'gender', 'game_name']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Encode the churn feature with its own encoder so its fitted classes_ are
# not overwritten when the target is encoded afterwards.
churn_encoder = LabelEncoder()
df['churned'] = churn_encoder.fit_transform(df['churned'])  # Convert churned to binary

# Encode target variable; label_encoder.classes_ maps the integer codes
# back to the original player levels.
label_encoder = LabelEncoder()
df[TARGET_COLUMN] = label_encoder.fit_transform(df[TARGET_COLUMN])

In [None]:
# Build the feature matrix and target, then hold out 20% of rows for testing
target_column = 'player_level'
# The target plus identifier / non-informative columns are excluded from X
excluded_columns = [target_column, 'game_id', 'player_id', 'username', 'last_played']

y = df[target_column]
X = df.drop(columns=excluded_columns)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)


In [None]:
# Standardize features: the scaler learns its statistics from the training
# split only and then applies the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Instantiate the four base classifiers with a shared seed
RANDOM_STATE = 42

log_reg = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100)
gb_clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
# probability=True enables probability estimates on the SVC
svm_clf = SVC(random_state=RANDOM_STATE, probability=True)


In [None]:
# Fit every base model on the training split and print its classification
# report on the test split.
models = dict(
    zip(
        ["Logistic Regression", "Random Forest", "Gradient Boosting", "SVM"],
        [log_reg, rf_clf, gb_clf, svm_clf],
    )
)

for name in models:
    model = models[name]
    # fit() returns the estimator itself, so prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f"Performance of {name}:")
    print(classification_report(y_test, y_pred))

In [None]:
# Collecting all models' results
# Accuracies are computed from the models fitted above rather than typed in
# by hand, so the comparison always reflects the current data and estimators.
# The voting ensemble is built in a later cell and is evaluated there.
model_accuracies = {
    name: accuracy_score(y_test, fitted_model.predict(X_test))
    for name, fitted_model in models.items()
}
# Parallel lists (same order) consumed by the comparison plot
models_list = list(model_accuracies)
accuracies = list(model_accuracies.values())

In [None]:
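# # Collecting all models' results (disabled: log_reg_pred, xgb_pred, svm_pred, rf_pred and voting_pred are not defined in any cell above)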
# models_list = ['Logistic Regression', 'GBoost', 'SVM', 'Random Forest', 'Voting Classifier']
# accuracies = [
#     accuracy_score(y_test, log_reg_pred),
#     accuracy_score(y_test, xgb_pred),
#     accuracy_score(y_test, svm_pred),
#     accuracy_score(y_test, rf_pred),
#     accuracy_score(y_test, voting_pred)
# ]

In [None]:
import matplotlib.pyplot as plt

# Plotting the comparison
bar_colors = ['blue', 'orange', 'green', 'red', 'purple']

# Keep the usual 0.6 floor to emphasise differences between models, but lower
# it when any model scores below that so no bar is clipped out of view.
y_floor = max(0.0, min(0.6, min(accuracies, default=0.6) - 0.05))

fig, ax = plt.subplots(figsize=(10, 6))
# Slice the palette so it matches however many models are being compared
ax.bar(models_list, accuracies, color=bar_colors[:len(models_list)])
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison')
ax.set_ylim(y_floor, 1.0)
ax.tick_params(axis='x', labelrotation=15)
plt.show()

In [None]:
# Ensemble Model (Voting Classifier) built from the four base classifiers
base_estimators = [
    ('log_reg', log_reg),
    ('rf', rf_clf),
    ('gb', gb_clf),
    ('svm', svm_clf),
]
# Soft voting uses predicted probabilities
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')
# fit() returns the fitted ensemble, so prediction is chained onto it
y_pred_ensemble = voting_clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Print the per-class precision / recall / F1 report for the voting ensemble
print("Performance of Ensemble Model (Voting Classifier):")
ensemble_report = classification_report(y_test, y_pred_ensemble)
print(ensemble_report)

In [None]:
# Overall accuracy of the voting ensemble on the test split
ensemble_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_ensemble,
)
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.2f}")

#### Model Testing on new data