<a href="https://colab.research.google.com/github/HarryJ12/AI-Powered_NFL_Team_Performance_Predictor/blob/main/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Include directives
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, brier_score_loss, roc_auc_score
from scipy.stats import norm

In [None]:
# Upload CSV
from google.colab import files
import io

# files.upload() opens the Colab file picker; the loop below treats the result
# as a mapping of filename -> raw bytes.
uploaded = files.upload()

# Fail here, with a clear message, instead of with a NameError on `df` in a
# later cell when the upload dialog was cancelled.
if not uploaded:
    raise ValueError("No file was uploaded; upload the games CSV to continue.")

# Every uploaded file is parsed and previewed. If several files are uploaded,
# `df` ends up bound to the last one.
for filename, content in uploaded.items():
    df = pd.read_csv(io.StringIO(content.decode('utf-8')))
    print(f"DataFrame from {filename}:")
    display(df.head())

Saving theOne.csv to theOne.csv
DataFrame from theOne.csv:


Unnamed: 0,season,week,month,team,opponent,location,win_numeric,score_for,score_against,first_downs,...,turnovers_forced,turnovers_commited,turnover_diff_pct,def_st_td,possession_time,redzone_efficiency,third_down_efficiency,yards_per_play,rush_yards_per_attempt,pass_completion_pct
0,2022,1,9,Bills,Rams,0,1,31,10,23,...,3,4,-1,0,31,100.0,90.0,7.1,4.8,83.9
1,2022,1,9,Rams,Bills,1,0,10,31,19,...,4,3,1,0,29,50.0,46.2,3.7,2.9,70.7
2,2022,1,9,Browns,Panthers,0,1,26,24,23,...,1,0,1,0,38,25.0,44.4,4.8,5.6,52.9
3,2022,1,9,Panthers,Browns,1,0,24,26,15,...,0,1,-1,0,22,66.7,36.4,5.2,2.8,59.3
4,2022,1,9,Jaguars,Commanders,0,0,22,28,24,...,3,1,2,0,27,40.0,25.0,6.2,6.8,57.1


In [None]:
# REGRESSION MODEL
# Fits a linear model that predicts `score_for` from per-game box-score columns,
# training on the 2022-2023 seasons and evaluating on 2024.

# Prepare Data
# NOTE(review): "win_numeric" and "score_against" look like values that are only
# known once a game is over, so this model presumably explains scores from
# post-game stats rather than forecasting from pre-game information - confirm
# that this is intended.
selected_features = [
    "win_numeric",
    "score_against",
    "first_downs",
    "third_down_comp",
    "yards",
    "pass_yards",
    "rush_att",
    "rush_yards",
    "redzone_comp",
    "redzone_att",
    "sacks_num",
    "turnovers_forced",
    "turnover_diff_pct",
    "possession_time",
    "redzone_efficiency",
    "third_down_efficiency",
    "yards_per_play",
    "pass_completion_pct",
]

regression_target = "score_for"


def time_to_seconds(t):
    """Convert an "MM:SS" string to total seconds.

    Returns 0 when the value cannot be parsed, so a single malformed cell does
    not abort training.
    """
    try:
        minutes, seconds = map(int, t.split(":"))
    except (AttributeError, ValueError):
        # AttributeError: value is not a string (e.g. NaN).
        # ValueError: non-numeric parts, or not exactly two ":"-separated parts.
        return 0
    return minutes * 60 + seconds


# Split train and test data (season-based split, no shuffling)
train_df = df[df["season"].isin([2022, 2023])]
test_df = df[df["season"] == 2024]

# .copy() so the possession_time conversion below does not write into `df`
X_train = train_df[selected_features].copy()
X_test = test_df[selected_features].copy()

y_train = train_df[regression_target].copy()
y_test = test_df[regression_target].copy()

# possession_time is only converted when it was read as text ("MM:SS");
# a numeric column is used as-is.
if X_train["possession_time"].dtype == object:
    X_train["possession_time"] = X_train["possession_time"].apply(time_to_seconds)
    X_test["possession_time"] = X_test["possession_time"].apply(time_to_seconds)

# Train Model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Clip negative predictions (a score cannot be below zero)
y_pred_lr = np.clip(y_pred_lr, 0, None)

# Evaluate & print evaluation metrics
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print("Linear Regression R²:", r2_score(y_test, y_pred_lr))

Linear Regression RMSE: 4.349027838123368
Linear Regression R²: 0.8047872752472223


In [19]:
# LOGISTIC MODEL
# Fits a win/loss classifier on the 2022-2023 seasons and evaluates it on 2024.
# This cell adds two columns to `df`: possession_time_minutes and turnover_impact.

# Express possession time in minutes: parse "MM:SS" text when the column was
# read as strings, otherwise reuse the numeric column unchanged.
if df["possession_time"].dtype == object:
    mmss = df["possession_time"].str.split(":", expand=True).astype(int)
    df["possession_time_minutes"] = mmss[0] + mmss[1] / 60
else:
    df["possession_time_minutes"] = df["possession_time"]

# Engineer turnover_impact (turnover differential weighted by time of possession)
df["turnover_impact"] = df["turnover_diff_pct"] * df["possession_time_minutes"]

# Final feature list
features = [
    "turnovers_forced", "turnover_diff_pct",
    "redzone_comp", "redzone_att", "possession_time_minutes",
    "pass_att", "rush_att", "location",
    "redzone_efficiency", "turnover_impact"
]

target = "win_numeric"

# Split train and test data (re-sliced so the frames include the new columns)
train_df = df[df["season"].isin([2022, 2023])]
test_df  = df[df["season"] == 2024]

X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]

# Scale features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# Train Model
lr2 = LogisticRegression(max_iter=5000, class_weight="balanced")
lr2.fit(X_train_scaled, y_train)

# Probability of the positive class (win = 1), computed once and reused by
# every metric; hard labels use a 0.5 threshold.
logi_proba = lr2.predict_proba(X_test_scaled)[:, 1]
logi_pred = (logi_proba >= 0.5).astype(int)

# Evaluate & print evaluation metrics
print("Logistical Regression Accuracy:", accuracy_score(y_test, logi_pred))
print("Logistical Regression F1 Score:", f1_score(y_test, logi_pred))
print("Logistical Regression Brier Score:", brier_score_loss(y_test, logi_proba))
print("Logistical Regression ROC-AUC:", roc_auc_score(y_test, logi_proba))

Logistical Regression Accuracy: 0.7982456140350878
Logistical Regression F1 Score: 0.7985989492119089
Logistical Regression Brier Score: 0.13864144000973638
Logistical Regression ROC-AUC: 0.8837426900584795


In [25]:
# META MODEL

# Get probability of the positive class (win = 1) from logistic regression,
# one value per 2024 test row, using the scaled test matrix the classifier
# was evaluated on
cl = lr2.predict_proba(X_test_scaled)[:, 1]

# Vegas-style bias: shift a predicted score toward the side the classifier favours
def vegas_meta(score_pred, win_prob):
    """Return `score_pred` shifted by a win-probability bias, floored at zero.

    The shift is 2 * (win_prob - 0.5): zero at a 50% win probability and
    between -1 and +1 points for probabilities in [0, 1]. The result is
    always a plain Python float.
    """
    shift = 2 * (win_prob - 0.5)
    adjusted = np.maximum(score_pred + shift, 0)
    return float(adjusted)

# Apply meta-model to each (score prediction, win probability) pair.
# zip() silently truncates to the shorter input, so first make sure both
# models scored the same number of rows.
assert len(y_pred_lr) == len(cl), "regression and classifier predictions cover different rows"
meta_scores = np.array([
    vegas_meta(score, prob)
    for score, prob in zip(y_pred_lr, cl)
])

# Actual points scored on the 2024 test rows. `y_test` cannot be used here:
# the logistic cell rebinds it to the win/loss target.
y_reg_test = test_df[regression_target]

# Evaluate & print evaluation metrics
print(f"Meta Model RMSE: {np.sqrt(mean_squared_error(y_reg_test, meta_scores)):.3f}")
print(f"Meta Model R²: {r2_score(y_reg_test, meta_scores):.3f}")

Meta Model RMSE: 4.401
Meta Model R²: 0.800


In [29]:
# MODEL TESTER

def predict_matchup(team1, team2, spread_sd=13.86):
    """Predict scores and win probabilities for a matchup from team averages.

    Each team is represented by the mean of its rows in `df` over every season
    present. Those averages are fed to the linear score model (`lr`) and the
    logistic win model (`lr2`, via `scaler`); each raw score is then adjusted
    with `vegas_meta`.

    Parameters
    ----------
    team1, team2 : str
        Team names as they appear in the "team" column of `df`.
    spread_sd : float, default 13.86
        Scale of the normal distribution used to turn the predicted point
        spread into a win probability (presumably the standard deviation of
        game margins - TODO confirm the source of the default).

    Returns
    -------
    dict
        "matchup" label, "predicted_scores" rounded to one decimal, and
        "win_probabilities" formatted as percentage strings.

    Raises
    ------
    ValueError
        If either team has no rows in `df`.
    """

    def _team_average(games, columns):
        # One-row frame holding the per-column mean; non-numeric cells count as 0
        return (
            games[columns]
            .apply(pd.to_numeric, errors="coerce")
            .fillna(0)
            .mean()
            .to_frame()
            .T
        )

    # All available history for each team
    t1_games = df[df["team"] == team1]
    t2_games = df[df["team"] == team2]
    for name, games in ((team1, t1_games), (team2, t2_games)):
        if games.empty:
            # Otherwise the averages are all-NaN and the model fails with an
            # unrelated "input contains NaN" error.
            raise ValueError(f"No games found for team {name!r}")

    # REGRESSION INPUTS
    # NOTE(review): if possession_time is stored as "MM:SS" text it is coerced
    # to 0 here, whereas the regression cell converts it to seconds - verify.
    t1_reg = _team_average(t1_games, selected_features)
    t2_reg = _team_average(t2_games, selected_features)

    t1_score_raw = float(np.clip(lr.predict(t1_reg)[0], 0, None))
    t2_score_raw = float(np.clip(lr.predict(t2_reg)[0], 0, None))

    # CLASSIFIER INPUTS (scaled with the scaler fitted on the training rows)
    t1_clf_df = _team_average(t1_games, features)
    t2_clf_df = _team_average(t2_games, features)

    t1_win_raw = lr2.predict_proba(scaler.transform(t1_clf_df))[0, 1]
    t2_win_raw = lr2.predict_proba(scaler.transform(t2_clf_df))[0, 1]

    # META-ADJUSTED SCORES
    t1_score = vegas_meta(t1_score_raw, t1_win_raw)
    t2_score = vegas_meta(t2_score_raw, t2_win_raw)

    # WIN PROBABILITY CALCULATIONS
    # The spread is mapped through a zero-mean normal CDF, so the two
    # probabilities always sum to 1.
    spread = t1_score - t2_score
    t1_prob = float(norm.cdf(spread / spread_sd))
    t2_prob = 1 - t1_prob

    # OUTPUT
    return {
        "matchup": f"{team1} vs {team2}",
        "predicted_scores": {
            team1: round(t1_score, 1),
            team2: round(t2_score, 1)
        },
        "win_probabilities": {
            team1: f"{round(t1_prob * 100):02d}%",
            team2: f"{round(t2_prob * 100):02d}%"
        }
    }

# Example: run one matchup; as the last expression in the cell, the returned
# dict is displayed as the cell output.
predict_matchup("Rams", "Bills")

{'matchup': 'Rams vs Bills',
 'predicted_scores': {'Rams': 21.7, 'Bills': 28.4},
 'win_probabilities': {'Rams': '32%', 'Bills': '68%'}}