In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, log_loss
import numpy as np

In [None]:
# Load the dataset from a CSV file into a DataFrame.
file_path = 'Gaming1.csv'  # Relative path; update it if the file lives elsewhere
data = pd.read_csv(file_path)

# Show the first rows, the column/dtype overview and the summary statistics in one cell.
# NOTE(review): data.info() prints its report and returns None, so the displayed
# tuple carries None as its middle element.
data.head(), data.info(), data.describe()

In [None]:
# Define features and target variable.
# 'Game' and the target column 'TotalPlayers' are removed from the feature frame.
X = data.drop(columns=['Game', 'TotalPlayers'])
y = data['TotalPlayers']

# To convert the problem to a classification problem, we discretize the target variable
# into 3 quantile-based bins labelled 0 (low), 1 (medium) and 2 (high).
# NOTE(review): pd.qcut raises by default when quantile edges are not unique
# (e.g. many tied values in y) - confirm the target has enough distinct values.
y_binned = pd.qcut(y, q=3, labels=[0, 1, 2])

In [None]:
# Define preprocessing for numeric columns: fill missing values with the column
# median, then standardize with StandardScaler.
# NOTE(review): assumes every listed column is numeric in the CSV (including
# 'ReleaseDate') - TODO confirm against the data.
numeric_features = ['ReleaseDate', 'TotalEarnings', 'OfflineEarnings', 'PercentOffline', 'TotalTournaments']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical columns: replace missing values with the
# literal category 'missing', then one-hot encode. handle_unknown='ignore' lets
# transform accept categories that were not present during fit.
categorical_features = ['Genre']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps: each transformer is applied only to its own list
# of columns. No `remainder` is passed, so the ColumnTransformer default applies
# to any column of X that is not listed above.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Split the data into training and testing sets: 80% train / 20% test, with a
# fixed random_state so the same rows land in each split on every run.
# NOTE(review): the split is not stratified, so the class proportions of
# y_binned may differ between train and test - consider stratify=y_binned.
X_train, X_test, y_train, y_test = train_test_split(X, y_binned, test_size=0.2, random_state=42)

In [None]:
# Define models: display name -> unfitted scikit-learn classifier.
# Every classifier here exposes predict_proba, which the evaluation cell needs
# to compute log loss.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # The tree permutes features randomly at each split; without a fixed seed
    # two fits on the same data can differ, so metrics would change between
    # runs. The seed matches the one used for the train/test split.
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [None]:
# Train and evaluate models.
# For each candidate the shared preprocessor and the classifier are chained in
# a Pipeline, fitted on the training split and scored on the test split.
# `results` maps model name -> {'Accuracy', 'F1 Score', 'Precision', 'Log Loss'}.
results = {}
for model_name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # 'weighted' averages the per-class scores weighted by class support.
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    # The probability columns follow clf.classes_. Passing those labels
    # explicitly keeps log_loss from inferring them from y_test, which raises
    # when the test split happens to miss one of the trained classes.
    loss = log_loss(y_test, y_pred_proba, labels=clf.classes_)

    results[model_name] = {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Precision': precision,
        'Log Loss': loss
    }

In [None]:
# Display the results: a nested dict keyed by model name, each value holding
# that model's 'Accuracy', 'F1 Score', 'Precision' and 'Log Loss'.
results

In [None]:
import matplotlib.pyplot as plt

# Function to plot the evaluation metrics
def plot_results(results):
    """Draw one bar chart per evaluation metric on a 2x2 grid of axes.

    Parameters
    ----------
    results : dict
        Mapping of model name -> dict of scores. Every inner dict must provide
        the keys 'Accuracy', 'F1 Score', 'Precision' and 'Log Loss'.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    metric_names = ['Accuracy', 'F1 Score', 'Precision', 'Log Loss']
    model_names = list(results.keys())

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    for position, metric_name in enumerate(metric_names):
        scores = [results[name][metric_name] for name in model_names]
        # Fill the grid row by row: positions 0, 1 on top and 2, 3 below.
        grid_row, grid_col = divmod(position, 2)
        panel = axes[grid_row, grid_col]
        panel.bar(model_names, scores, color=['blue', 'green', 'orange'])
        panel.set_title(metric_name)
        panel.set_ylabel(metric_name)

        # Log loss is not capped at 1, so its panel gets 10% headroom above the
        # tallest bar; every other metric uses the fixed 0..1 range.
        if metric_name != 'Log Loss':
            upper_limit = 1
        else:
            upper_limit = max(scores) * 1.1
        panel.set_ylim(0, upper_limit)

        # Print each value just above its bar.
        for bar_position, score in enumerate(scores):
            panel.text(bar_position, score, f'{score:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Call the function to plot the results: one panel per metric, one bar per model.
plot_results(results)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Function to plot confusion matrices
def plot_confusion_matrix(y_test, y_pred, model_name,
                          class_labels=(0, 1, 2),
                          class_names=('Low', 'Medium', 'High')):
    """Show a labelled confusion-matrix heatmap for one model.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    model_name : str
        Name inserted into the figure title.
    class_labels : sequence, default (0, 1, 2)
        Label values, in the order of the matrix rows and columns.
    class_names : sequence of str, default ('Low', 'Medium', 'High')
        Tick text for each entry of ``class_labels``.

    Raises
    ------
    ValueError
        If ``class_labels`` and ``class_names`` differ in length.
    """
    if len(class_labels) != len(class_names):
        raise ValueError('class_labels and class_names must have the same length')

    # Fix the label set explicitly. Left to infer it, confusion_matrix only
    # includes labels that occur in y_test or y_pred, so a missing class would
    # shrink the matrix and the tick names would no longer match its cells.
    cm = confusion_matrix(y_test, y_pred, labels=list(class_labels))
    tick_names = list(class_names)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_names, yticklabels=tick_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

# Plot confusion matrices for each model.
# Every model is wrapped in a new Pipeline with the shared preprocessor and
# fitted again on the training split before predicting on the test split.
# NOTE(review): this repeats the training done in the evaluation cell; an
# estimator without a fixed random_state may produce predictions here that
# differ from the ones behind the reported metrics.
for model_name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    plot_confusion_matrix(y_test, y_pred, model_name)

In [None]:
# Function to plot comparative performance
def plot_comparative_performance(results):
    """Plot every metric of every model as a single grouped bar chart.

    Parameters
    ----------
    results : dict
        Mapping of model name -> dict of scores. Every inner dict must provide
        the keys 'Accuracy', 'F1 Score', 'Precision' and 'Log Loss'.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    metrics = ['Accuracy', 'F1 Score', 'Precision', 'Log Loss']

    # Long format (one row per model/metric pair) is what seaborn needs to
    # group the bars by metric and colour them by model.
    records = [
        [model_name, metric, results[model_name][metric]]
        for metric in metrics
        for model_name in results
    ]
    df = pd.DataFrame(records, columns=['Model', 'Metric', 'Value'])

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x='Metric', y='Value', hue='Model', data=df, palette='Set1')
    plt.title('Comparative Performance of Models')
    # Log loss is not bounded by 1, so the upper limit stays automatic whenever
    # that metric is part of the chart.
    plt.ylim(0, 1 if 'Log Loss' not in df['Metric'].values else None)

    # Label each bar from its own drawn rectangle. The bars of one metric are
    # dodged around the metric's tick, so a position derived from the DataFrame
    # row index does not line up with them.
    for bar in ax.patches:
        height = bar.get_height()
        # Skip zero-width rectangles (some seaborn versions add them as legend
        # proxies) and bars without a value.
        if bar.get_width() == 0 or pd.isna(height):
            continue
        ax.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}', ha='center', va='bottom')
    plt.show()

# Call the function to plot comparative performance: all metrics and models in one grouped bar chart.
plot_comparative_performance(results)