In [46]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('data_creation_cleaned_no_duplicates.csv')

# Convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Create new columns for Year, Month, and Day
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Drop the original 'Date' column
df = df.drop('Date', axis=1)

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=categorical_columns)

# Define your target variables
targets = ['TotalEarnings', 'TotalPlayers', 'TotalTournaments', 'Earnings_YoY_Growth', 'Players_YoY_Growth', 'Tournaments_YoY_Growth']

# Define the models
models = {
    'Linear Regression': LinearRegression(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    'Logistic Regression': LogisticRegression(max_iter=1000)  # Adjust the number of iterations as needed
}

# For each target, train the models and evaluate
for target in targets:
    print(f'Target: {target}')
    print('------------------------')
    
    X = df.drop(target, axis=1)
    y = df[target]

    # Binarize the target variable for logistic regression
    y_binary = (y > y.median()).astype(int)  # You can adjust the threshold as needed

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Evaluate logistic regression using classification metrics
        if name == 'Logistic Regression':
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            
            print(f'{name}:')
            print(f'Accuracy: {accuracy}')
            print(f'Precision: {precision}')
            print(f'Recall: {recall}')
            print(f'F1 Score: {f1}')
        else:
            # For other regression models, calculate regression metrics
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)
            
            print(f'{name}:')
            print(f'Mean Squared Error (MSE): {mse}')
            print(f'Root Mean Squared Error (RMSE): {rmse}')
            print(f'R-squared (R2): {r2}')

        print('------------------------')


Target: TotalEarnings
------------------------
Linear Regression:
Mean Squared Error (MSE): 0.0004652918419456858
Root Mean Squared Error (RMSE): 0.021570624514503187
R-squared (R2): 0.9981385925244436
------------------------
Gradient Boosting Regression:
Mean Squared Error (MSE): 0.02113216206530329
Root Mean Squared Error (RMSE): 0.1453690547032046
R-squared (R2): 0.9154604467627506
------------------------
Logistic Regression:
Accuracy: 0.8842617631151974
Precision: 0.929678188319428
Recall: 0.8342245989304813
F1 Score: 0.8793686583990982
------------------------
Target: TotalPlayers
------------------------
Linear Regression:
Mean Squared Error (MSE): 0.0003044253035805937
Root Mean Squared Error (RMSE): 0.01744778792800376
R-squared (R2): 0.9987821701918926
------------------------
Gradient Boosting Regression:
Mean Squared Error (MSE): 0.0072194958414672295
Root Mean Squared Error (RMSE): 0.08496761642806763
R-squared (R2): 0.9711189670114976
------------------------
Logistic Re