In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the data
# NOTE: absolute path on the author's machine -- edit this one constant to run elsewhere.
DATA_PATH = '/Users/kkelley/Desktop/CBB Results/CBB-Results/SLCBBResults223.csv'

# Features: Select relevant columns for prediction
features = ['delta OR', 'delta DR', 'delta AT']

# Target: Predict Away Win (1 = win, 0 = loss)
target_col = 'Away win'

# Drop only the rows that are missing a value the models actually consume.
# A bare dropna() discards every row with a gap in ANY column of the CSV,
# which silently throws away usable games when an unrelated column is blank.
# Built as a new frame (no inplace mutation) so re-running the cell is idempotent.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=features + [target_col])
    .round(2)  # numeric columns rounded to 2 decimals, as before
)

X = df[features]
y = df[target_col]

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Random Forest: build a 100-tree ensemble with a fixed seed and fit it on the
# training split in one step (fit() returns the fitted estimator itself).
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out split.
rf_pred = rf_model.predict(X_test)

# Score the predictions: accuracy, confusion matrix, and per-class text report.
rf_accuracy, rf_conf_matrix, rf_class_report = (
    metric(y_test, rf_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Logistic Regression with scikit-learn defaults, fitted on the training split
# (fit() returns the fitted estimator itself).
log_reg_model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out split.
log_reg_pred = log_reg_model.predict(X_test)

# Score the predictions: accuracy, confusion matrix, and per-class text report.
log_reg_accuracy, log_reg_conf_matrix, log_reg_class_report = (
    metric(y_test, log_reg_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Pull the fitted parameters out of the model: row 0 of coef_ holds one weight
# per entry in `features`; element 0 of intercept_ is the bias term.
coefficients, intercept = log_reg_model.coef_[0], log_reg_model.intercept_[0]

def _print_evaluation(title, accuracy, conf_matrix, class_report):
    """Print one model's evaluation: title, 2-decimal accuracy, confusion matrix, report."""
    print(title)
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", class_report)


# Random Forest first, then Logistic Regression (its title carries a leading
# newline so the two sections are separated by a blank line).
_print_evaluation("Random Forest Model Evaluation:", rf_accuracy, rf_conf_matrix, rf_class_report)
_print_evaluation(
    "\nLogistic Regression Model Evaluation:",
    log_reg_accuracy,
    log_reg_conf_matrix,
    log_reg_class_report,
)

# Logistic Regression parameters, 4 decimals each: intercept, then one line per feature.
print("\nLogistic Regression Model Coefficients:")
print(f"Intercept: {intercept:.4f}")
for column, weight in zip(features, coefficients):
    print(f"Coefficient for {column}: {weight:.4f}")

# Use the intercept and coefficients printed above in an Excel formula to compute the away-team win probability: =1 / (1 + EXP(-(Intercept + (Coefficient1 * DeltaOR) + (Coefficient2 * DeltaDR) + (Coefficient3 * DeltaAT))))





Random Forest Model Evaluation:
Accuracy: 0.64
Confusion Matrix:
 [[100  30]
 [ 54  52]]
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.77      0.70       130
           1       0.63      0.49      0.55       106

    accuracy                           0.64       236
   macro avg       0.64      0.63      0.63       236
weighted avg       0.64      0.64      0.64       236


Logistic Regression Model Evaluation:
Accuracy: 0.67
Confusion Matrix:
 [[106  24]
 [ 55  51]]
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.82      0.73       130
           1       0.68      0.48      0.56       106

    accuracy                           0.67       236
   macro avg       0.67      0.65      0.65       236
weighted avg       0.67      0.67      0.65       236


Logistic Regression Model Coefficients:
Intercept: -0.5357
Coefficient for delta OR: -0.0894
Coefficient for del