In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('results/ncaa_scores_2024-09-23.csv')

In [3]:
import pandas as pd
import numpy as np

# Initialize Elo ratings (usually starting from 1500)
ELO_DEFAULT = 1500
K_FACTOR = 30  # Determines how much Elo changes after each game

def initialize_elo(df):
    teams = pd.concat([df['home_team'], df['away_team']]).unique()
    elo_ratings = {team: ELO_DEFAULT for team in teams}
    return elo_ratings

def expected_result(elo_a, elo_b):
    """
    Calculate expected result for home team vs away team based on current Elo ratings.
    """
    return 1 / (1 + 10 ** ((elo_b - elo_a) / 400))

def update_elo(elo_ratings, home_team, away_team, home_goals, away_goals):
    """
    Update Elo ratings after each game based on the actual result.
    """
    home_elo = elo_ratings[home_team]
    away_elo = elo_ratings[away_team]
    
    # Calculate expected results
    expected_home_win = expected_result(home_elo, away_elo)
    expected_away_win = 1 - expected_home_win
    
    # Actual result
    if home_goals > away_goals:
        actual_home = 1  # Home win
        actual_away = 0  # Away loss
    elif home_goals < away_goals:
        actual_home = 0  # Home loss
        actual_away = 1  # Away win
    else:
        actual_home = 0.5  # Draw
        actual_away = 0.5
    
    # Update ratings
    new_home_elo = home_elo + K_FACTOR * (actual_home - expected_home_win)
    new_away_elo = away_elo + K_FACTOR * (actual_away - expected_away_win)
    
    # Update elo_ratings
    elo_ratings[home_team] = new_home_elo
    elo_ratings[away_team] = new_away_elo
    
    return elo_ratings, new_home_elo, new_away_elo

def parse_score(score):
    """
    Parse the score from string format 'home_score-away_score' to integers.
    """
    home_score, away_score = map(int, score.split('-'))
    return home_score, away_score

# Main function to compute Elo ratings over a season and store them in a DataFrame
def compute_elo_ratings(df):
    # Initialize Elo ratings
    elo_ratings = initialize_elo(df)
    
    # Create a list to store Elo rating history
    rating_history = []

    # Iterate through each match and update Elo ratings
    for index, row in df.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']
        home_goals, away_goals = parse_score(row['score'])
        
        # Update Elo ratings
        elo_ratings, home_elo, away_elo = update_elo(elo_ratings, home_team, away_team, home_goals, away_goals)
        
        # Append the current match and updated Elo ratings to the history
        rating_history.append({
            'match_index': index,
            'home_team': home_team,
            'away_team': away_team,
            'home_goals': home_goals,
            'away_goals': away_goals,
            'home_elo_before': elo_ratings[home_team] - K_FACTOR * (home_goals > away_goals),  # Elo before the match
            'away_elo_before': elo_ratings[away_team] - K_FACTOR * (away_goals > home_goals),
            'home_elo_after': home_elo,
            'away_elo_after': away_elo
        })
    
    # Convert the rating history to a DataFrame
    elo_history_df = pd.DataFrame(rating_history)
    return elo_history_df

# Compute Elo ratings and store them in a DataFrame
elo_history_df = compute_elo_ratings(df)

In [4]:
import pandas as pd

# Initialize Elo ratings (usually starting from 1500)
ELO_DEFAULT = 1500
K_FACTOR = 30  # Determines how much Elo changes after each game

# Function to initialize Elo ratings in a dataframe
def initialize_team_elo(df):
    teams = pd.concat([df['home_team'], df['away_team']]).unique()
    elo_df = pd.DataFrame({'team': teams, 'elo': ELO_DEFAULT})
    return elo_df.set_index('team')

# Calculate expected result
def expected_result(elo_a, elo_b):
    return 1 / (1 + 10 ** ((elo_b - elo_a) / 400))

# Update Elo ratings after each match
def update_elo_ratings(elo_df, home_team, away_team, home_goals, away_goals):
    home_elo = elo_df.at[home_team, 'elo']
    away_elo = elo_df.at[away_team, 'elo']
    
    # Expected results
    expected_home_win = expected_result(home_elo, away_elo)
    
    # Actual results
    if home_goals > away_goals:
        actual_home = 1
        actual_away = 0
    elif home_goals < away_goals:
        actual_home = 0
        actual_away = 1
    else:
        actual_home = 0.5
        actual_away = 0.5
    
    # Update Elo ratings
    new_home_elo = home_elo + K_FACTOR * (actual_home - expected_home_win)
    new_away_elo = away_elo + K_FACTOR * (actual_away - (1 - expected_home_win))
    
    # Update dataframe
    elo_df.at[home_team, 'elo'] = new_home_elo
    elo_df.at[away_team, 'elo'] = new_away_elo
    
    return elo_df

# Parse score
def parse_score(score):
    home_score, away_score = map(int, score.split('-'))
    return home_score, away_score

# Main function to track and update team Elo ratings after every match
def track_team_elo(df):
    # Initialize team Elo ratings
    elo_df = initialize_team_elo(df)
    
    # Iterate through each match and update the Elo ratings
    for index, row in df.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']
        home_goals, away_goals = parse_score(row['score'])
        
        # Update Elo ratings for the home and away teams
        elo_df = update_elo_ratings(elo_df, home_team, away_team, home_goals, away_goals)
    
    return elo_df



# Track Elo ratings for each team throughout the season
final_elo_df = track_team_elo(df)

# Display the final Elo ratings

  elo_df.at[home_team, 'elo'] = new_home_elo


In [5]:
final_elo_df = final_elo_df.sort_values(by='elo', ascending=False)

In [6]:
elo_df = final_elo_df.reset_index()

# Rename the columns if needed (optional)
elo_df.columns = ['team', 'elo']

In [7]:
elo_df

Unnamed: 0,team,elo
0,Ohio St.,1592.405480
1,Elon,1588.858125
2,Pittsburgh,1581.731197
3,UConn,1579.190620
4,Stanford,1579.094073
...,...,...
234,Columbia,1417.159744
235,Central Conn. St.,1411.668800
236,Southern Ind.,1408.385939
237,Radford,1408.064959


In [9]:
df

Unnamed: 0,home_team,score,away_team
0,George Washington,2-4,Old Dominion
1,VMI,4-2,Emory & Henry
2,Delaware,3-0,Howard
3,Boston U.,0-0,New Hampshire
4,Villanova,2-2,Saint Peter's
...,...,...,...
802,Denver,2-1,Seattle U
803,Stanford,2-2,Duke
804,UC Santa Barbara,3-0,Portland
805,San Francisco,4-0,San Jose St.


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Create features for ML
df['home_elo'] = df['home_team'].map(elo_ratings)
df['away_elo'] = df['away_team'].map(elo_ratings)
df['elo_diff'] = df['home_elo'] - df['away_elo']
df['home_win'] = df.apply(lambda row: 1 if int(row['score'].split('-')[0]) > int(row['score'].split('-')[1]) else 0, axis=1)

# Features and target
X = df[['home_elo', 'away_elo', 'elo_diff']]
y = df['home_win']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict outcomes and evaluate
accuracy = model.score(X_test, y_test)
print(f'Model accuracy: {accuracy * 100:.2f}%')


NameError: name 'elo_ratings' is not defined

In [12]:
import numpy as np

# Function to calculate expected result based on Elo ratings
def expected_result(elo_a, elo_b):
    return 1 / (1 + 10 ** ((elo_b - elo_a) / 400))

# Function to compute win, loss, and draw probabilities
def match_probabilities(home_elo, away_elo, draw_factor=0.2):
    """
    Calculate the win, draw, and loss probabilities for a match.
    
    Parameters:
    home_elo (float): The Elo rating of the home team.
    away_elo (float): The Elo rating of the away team.
    draw_factor (float): A scaling factor for the draw probability. Default is 0.2.
    
    Returns:
    dict: A dictionary with home win, away win, and draw probabilities.
    """
    
    # Expected outcome for home and away team
    home_win_prob = expected_result(home_elo, away_elo)
    away_win_prob = 1 - home_win_prob
    
    # Adjust for draw probability
    draw_prob = (home_win_prob * away_win_prob) ** draw_factor
    
    # Normalize probabilities (so they sum to 1)
    total_prob = home_win_prob + away_win_prob + draw_prob
    home_win_prob /= total_prob
    away_win_prob /= total_prob
    draw_prob /= total_prob
    
    return {
        'home_win_prob': home_win_prob,
        'away_win_prob': away_win_prob,
        'draw_prob': draw_prob
    }

# Calculate win, loss, and draw probabilities
home_team = 'Penn'
away_team = 'Pittsburgh'

# Access Elo ratings directly using .loc
home_elo = final_elo_df.loc[home_team, 'elo']
away_elo = final_elo_df.loc[away_team, 'elo']

# Calculate probabilities
probs = match_probabilities(home_elo, away_elo)

# Display the probabilities
print(f"Home Win Probability: {probs['home_win_prob'] * 100:.2f}%")
print(f"Away Win Probability: {probs['away_win_prob'] * 100:.2f}%")
print(f"Draw Probability: {probs['draw_prob'] * 100:.2f}%")


Home Win Probability: 26.26%
Away Win Probability: 30.65%
Draw Probability: 43.08%
