In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import shap

In [2]:
raw_data = pd.read_excel('ncaa_data.xlsx')
required_columns = {'Team Display Name', 'Date', 'Opponent', 'PTS', 'FGM', 'FGA', 
                    'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'Win', 'Game Score'}
if not required_columns.issubset(raw_data.columns):
    print("Error: The input data is missing one or more required columns.")
raw_data['Date'] = pd.to_datetime(raw_data['Date'])

In [3]:
team_stats = raw_data.groupby(['Team Display Name', 'Date', 'Opponent']).agg(
    PTS=('PTS', 'sum'),
    FGM=('FGM', 'sum'),
    FGA=('FGA', 'sum'),
    FTM=('FTM', 'sum'),
    FTA=('FTA', 'sum'),
    OREB=('OREB', 'sum'),
    DREB=('DREB', 'sum'),
    REB=('REB', 'sum'),
    AST=('AST', 'sum'),
    STL=('STL', 'sum'),
    BLK=('BLK', 'sum'),
    TO=('TO', 'sum'),
    Win=('Win', 'first'),
    Game_Score=('Game Score', 'sum')
).reset_index()

team_stats['Possessions'] = team_stats['FGA'] - team_stats['OREB'] + team_stats['TO'] + (0.4 * team_stats['FTA'])

# Calculate offensive rating (points per 100 possessions)
team_stats['ORtg'] = (team_stats['PTS'] / team_stats['Possessions']) * 100

# Map opponent points to calculate defensive rating
team_points = team_stats[['Team Display Name', 'Date', 'Opponent', 'PTS']].copy()
team_points.rename(columns={'Team Display Name': 'Opponent', 
                           'Opponent': 'Team Display Name', 
                           'PTS': 'PTS_Allowed'}, inplace=True)

# Merge opponent points to calculate defensive rating
team_stats = team_stats.merge(team_points, on=['Team Display Name', 'Date', 'Opponent'], how='left')
team_stats['DRtg'] = (team_stats['PTS_Allowed'] / team_stats['Possessions']) * 100


In [4]:
new_features = ['ORtg', 'DRtg', 'Game_Score']
for feature in new_features:
    team_stats[f'Last_5_{feature}_avg'] = team_stats.groupby('Team Display Name')[feature]\
        .transform(lambda x: x.rolling(window=5, min_periods=1).mean())

In [5]:
matchup_data = []
for _, row in team_stats.iterrows():
    team = row['Team Display Name']
    opponent = row['Opponent']
    date = row['Date']

    # Get team's recent performance before this game
    team_history = team_stats[(team_stats['Team Display Name'] == team) & 
                              (team_stats['Date'] < date)]
    if team_history.empty:
        continue
    team_features = team_history.iloc[-1]

    # Get opponent's recent performance before this game
    opponent_history = team_stats[(team_stats['Team Display Name'] == opponent) & 
                                 (team_stats['Date'] < date)]
    if opponent_history.empty:
        continue
    opponent_features = opponent_history.iloc[-1]

    # Create matchup feature vector using advanced metrics
    matchup_data.append({
        'date': date,
        'team_ortg_avg': team_features['Last_5_ORtg_avg'],
        'opp_ortg_avg': opponent_features['Last_5_ORtg_avg'],
        'team_drtg_avg': team_features['Last_5_DRtg_avg'],
        'opp_drtg_avg': opponent_features['Last_5_DRtg_avg'],
        'team_game_score_avg': team_features['Last_5_Game_Score_avg'],
        'opp_game_score_avg': opponent_features['Last_5_Game_Score_avg'],
        'outcome': row['Win']
    })


In [6]:
matchup_df = pd.DataFrame(matchup_data)
X = matchup_df.drop(columns=['date', 'outcome'])
y = matchup_df['outcome']

split_index = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

In [10]:
base_models = [
    ('logreg', LogisticRegression(max_iter=1000, C=0.1)),
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=5)),
    ('xgb', XGBClassifier(learning_rate=0.1, max_depth=3))
]

meta_model = LogisticRegression(random_state=42)
ensemble = StackingClassifier(estimators=base_models, final_estimator=meta_model, stack_method='predict_proba')

In [11]:
ensemble.fit(X_train, y_train)

y_pred = ensemble.predict(X_test)
y_pred_proba = ensemble.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("Ensemble Accuracy:", accuracy)
print("ROC AUC:", roc_auc)

Ensemble Accuracy: 0.7142857142857143
ROC AUC: 0.7779073912057288
