In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the data
df = pd.read_csv('/Users/kkelley/Desktop/CHE4230Project/SL311.csv')

df.dropna(inplace=True)

df = df.round(2)

# Features: Select relevant columns for prediction
features = ['Away OR', 'Away DR', 'Away AT', 'Home OR', 'Home DR', 'Home AT']
X = df[features]

# Target: Predict Away Win (1 = win, 0 = loss)
y = df['Away Win']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Initialize a classifier (Random Forest)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(X_train, y_train)

# Make predictions with Random Forest
rf_pred = rf_model.predict(X_test)

# Evaluate Random Forest model
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_conf_matrix = confusion_matrix(y_test, rf_pred)
rf_class_report = classification_report(y_test, rf_pred)

# Initialize Logistic Regression
log_reg_model = LogisticRegression()

# Train the Logistic Regression model
log_reg_model.fit(X_train, y_train)

# Make predictions with Logistic Regression
log_reg_pred = log_reg_model.predict(X_test)

# Evaluate Logistic Regression model
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)
log_reg_conf_matrix = confusion_matrix(y_test, log_reg_pred)
log_reg_class_report = classification_report(y_test, log_reg_pred)

# Get the coefficients and intercept from the trained Logistic Regression model
coefficients = log_reg_model.coef_[0]  # Coefficients for each feature
intercept = log_reg_model.intercept_[0]  # Intercept of the model

# Print Random Forest evaluation
print("Random Forest Model Evaluation:")
print(f"Accuracy: {rf_accuracy:.2f}")
print("Confusion Matrix:\n", rf_conf_matrix)
print("Classification Report:\n", rf_class_report)

# Print Logistic Regression evaluation
print("\nLogistic Regression Model Evaluation:")
print(f"Accuracy: {log_reg_accuracy:.2f}")
print("Confusion Matrix:\n", log_reg_conf_matrix)
print("Classification Report:\n", log_reg_class_report)

# Print Logistic Regression Coefficients
print("\nLogistic Regression Model Coefficients:")
print(f"Intercept: {intercept:.4f}")
for feature, coef in zip(features, coefficients):
    print(f"Coefficient for {feature}: {coef:.4f}")

#Use these in an excel function to calculate win probability: =1 / (1 + EXP(-(Intercept + (Coefficient1 * DeltaOR) + (Coefficient2 * DeltaDR) + (Coefficient3 * DeltaAT))))





Random Forest Model Evaluation:
Accuracy: 0.66
Confusion Matrix:
 [[159  54]
 [ 67  73]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.75      0.72       213
           1       0.57      0.52      0.55       140

    accuracy                           0.66       353
   macro avg       0.64      0.63      0.64       353
weighted avg       0.65      0.66      0.65       353


Logistic Regression Model Evaluation:
Accuracy: 0.70
Confusion Matrix:
 [[165  48]
 [ 59  81]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.77      0.76       213
           1       0.63      0.58      0.60       140

    accuracy                           0.70       353
   macro avg       0.68      0.68      0.68       353
weighted avg       0.69      0.70      0.69       353


Logistic Regression Model Coefficients:
Intercept: 0.0004
Coefficient for Away OR: 0.0984
Coefficient for Away D

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score


# Load the data
df = pd.read_csv('/Users/kkelley/Desktop/CHE4230Project/SL311.csv')

# Data cleaning
df.dropna(inplace=True)
df = df.round(2)

# Features: Select relevant columns for prediction
features = ['delta OR', 'delta DR', 'delta AT']
X = df[features]

# Target: Predict Away Win (1 = win, 0 = loss)
y = df['Away Win']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Standardize the features for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize classifiers
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
log_reg_model = LogisticRegression()
gb_model = GradientBoostingClassifier(random_state=42)

# Train the models
rf_model.fit(X_train, y_train)
log_reg_model.fit(X_train_scaled, y_train)  # Standardized features for Logistic Regression
gb_model.fit(X_train, y_train)

# Make predictions
rf_pred = rf_model.predict(X_test)
log_reg_pred = log_reg_model.predict(X_test_scaled)
gb_pred = gb_model.predict(X_test)

# Evaluate models
def evaluate_model(pred, y_test):
    accuracy = accuracy_score(y_test, pred)
    conf_matrix = confusion_matrix(y_test, pred)
    class_report = classification_report(y_test, pred)
    auc = roc_auc_score(y_test, pred)  # AUC score
    return accuracy, conf_matrix, class_report, auc

rf_accuracy, rf_conf_matrix, rf_class_report, rf_auc = evaluate_model(rf_pred, y_test)
log_reg_accuracy, log_reg_conf_matrix, log_reg_class_report, log_reg_auc = evaluate_model(log_reg_pred, y_test)
gb_accuracy, gb_conf_matrix, gb_class_report, gb_auc = evaluate_model(gb_pred, y_test)

# Print evaluation results
print("Random Forest Model Evaluation:")
print(f"Accuracy: {rf_accuracy:.2f}, AUC: {rf_auc:.2f}")
print("Confusion Matrix:\n", rf_conf_matrix)
print("Classification Report:\n", rf_class_report)

print("\nLogistic Regression Model Evaluation:")
print(f"Accuracy: {log_reg_accuracy:.2f}, AUC: {log_reg_auc:.2f}")
print("Confusion Matrix:\n", log_reg_conf_matrix)
print("Classification Report:\n", log_reg_class_report)

print("\nGradient Boosting Model Evaluation:")
print(f"Accuracy: {gb_accuracy:.2f}, AUC: {gb_auc:.2f}")
print("Confusion Matrix:\n", gb_conf_matrix)
print("Classification Report:\n", gb_class_report)

# Optionally, cross-validation to evaluate stability of the model
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')
cv_scores_log_reg = cross_val_score(log_reg_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
cv_scores_gb = cross_val_score(gb_model, X, y, cv=5, scoring='accuracy')

print("\nCross-validation Accuracy Scores:")
print(f"Random Forest: {cv_scores_rf.mean():.2f}")
print(f"Logistic Regression: {cv_scores_log_reg.mean():.2f}")
print(f"Gradient Boosting: {cv_scores_gb.mean():.2f}")


Random Forest Model Evaluation:
Accuracy: 0.66, AUC: 0.64
Confusion Matrix:
 [[156  57]
 [ 62  78]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.73      0.72       213
           1       0.58      0.56      0.57       140

    accuracy                           0.66       353
   macro avg       0.65      0.64      0.65       353
weighted avg       0.66      0.66      0.66       353


Logistic Regression Model Evaluation:
Accuracy: 0.70, AUC: 0.68
Confusion Matrix:
 [[165  48]
 [ 59  81]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.77      0.76       213
           1       0.63      0.58      0.60       140

    accuracy                           0.70       353
   macro avg       0.68      0.68      0.68       353
weighted avg       0.69      0.70      0.69       353


Gradient Boosting Model Evaluation:
Accuracy: 0.69, AUC: 0.67
Confusion Matrix:
 [[170  43

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score


# Load the data
df = pd.read_csv('/Users/kkelley/Desktop/CHE4230Project/SL311.csv')

# Data cleaning
df.dropna(inplace=True)
df = df.round(2)

# Features: Select relevant columns for prediction
features = ['delta OR', 'delta DR', 'delta AT']
X = df[features]

# Target: Predict Away Win (1 = win, 0 = loss)
y = df['Away Win']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Standardize the features for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize classifiers
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
log_reg_model = LogisticRegression()
gb_model = GradientBoostingClassifier(random_state=42)

# Train the models
rf_model.fit(X_train, y_train)
log_reg_model.fit(X_train_scaled, y_train)  # Standardized features for Logistic Regression
gb_model.fit(X_train, y_train)

# Make predictions
rf_pred = rf_model.predict(X_test)
log_reg_pred = log_reg_model.predict(X_test_scaled)
gb_pred = gb_model.predict(X_test)

# Evaluate models
def evaluate_model(pred, y_test):
    accuracy = accuracy_score(y_test, pred)
    conf_matrix = confusion_matrix(y_test, pred)
    class_report = classification_report(y_test, pred)
    auc = roc_auc_score(y_test, pred)  # AUC score
    return accuracy, conf_matrix, class_report, auc

rf_accuracy, rf_conf_matrix, rf_class_report, rf_auc = evaluate_model(rf_pred, y_test)
log_reg_accuracy, log_reg_conf_matrix, log_reg_class_report, log_reg_auc = evaluate_model(log_reg_pred, y_test)
gb_accuracy, gb_conf_matrix, gb_class_report, gb_auc = evaluate_model(gb_pred, y_test)

# Print evaluation results
print("Random Forest Model Evaluation:")
print(f"Accuracy: {rf_accuracy:.2f}, AUC: {rf_auc:.2f}")
print("Confusion Matrix:\n", rf_conf_matrix)
print("Classification Report:\n", rf_class_report)

print("\nLogistic Regression Model Evaluation:")
print(f"Accuracy: {log_reg_accuracy:.2f}, AUC: {log_reg_auc:.2f}")
print("Confusion Matrix:\n", log_reg_conf_matrix)
print("Classification Report:\n", log_reg_class_report)

print("\nGradient Boosting Model Evaluation:")
print(f"Accuracy: {gb_accuracy:.2f}, AUC: {gb_auc:.2f}")
print("Confusion Matrix:\n", gb_conf_matrix)
print("Classification Report:\n", gb_class_report)

# Optionally, cross-validation to evaluate stability of the model
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')
cv_scores_log_reg = cross_val_score(log_reg_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
cv_scores_gb = cross_val_score(gb_model, X, y, cv=5, scoring='accuracy')

print("\nCross-validation Accuracy Scores:")
print(f"Random Forest: {cv_scores_rf.mean():.2f}")
print(f"Logistic Regression: {cv_scores_log_reg.mean():.2f}")
print(f"Gradient Boosting: {cv_scores_gb.mean():.2f}")


Random Forest Model Evaluation:
Accuracy: 0.66, AUC: 0.64
Confusion Matrix:
 [[156  57]
 [ 62  78]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.73      0.72       213
           1       0.58      0.56      0.57       140

    accuracy                           0.66       353
   macro avg       0.65      0.64      0.65       353
weighted avg       0.66      0.66      0.66       353


Logistic Regression Model Evaluation:
Accuracy: 0.70, AUC: 0.68
Confusion Matrix:
 [[165  48]
 [ 59  81]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.77      0.76       213
           1       0.63      0.58      0.60       140

    accuracy                           0.70       353
   macro avg       0.68      0.68      0.68       353
weighted avg       0.69      0.70      0.69       353


Gradient Boosting Model Evaluation:
Accuracy: 0.69, AUC: 0.67
Confusion Matrix:
 [[170  43