In [1]:
import pandas as pd
import numpy as np

In [27]:
# Load the match results. Later cells read the 'home_team', 'away_team' and
# 'score' columns from this frame.
# The relative path uses a forward slash so it resolves on Windows, macOS and
# Linux alike (a backslash is only a path separator on Windows).
df = pd.read_csv('results/ncaa_scores_2024-09-23.csv')

In [6]:
# Step 1: Create an empty standings table indexed by team name, with running
# totals for goals for (GF), goals against (GA) and points (PTS)
table = pd.DataFrame(columns=['team', 'GF', 'GA', 'PTS']).set_index('team')

# Function to update table
def update_table(team, gf, ga, pts):
    """Add one match's goals and points to ``team``'s row of the global ``table``.

    The first time a team is seen, a row of zeros is created for it; after
    that ``gf``, ``ga`` and ``pts`` are added to the 'GF', 'GA' and 'PTS'
    columns respectively.
    """
    known_teams = table.index
    if team not in known_teams:
        table.loc[team] = [0, 0, 0]  # first appearance: start from zero

    # Accumulate this match's contribution column by column
    for column, increment in (('GF', gf), ('GA', ga), ('PTS', pts)):
        table.at[team, column] += increment

# Step 2: Walk through the fixtures and credit both sides of every match
for home_team, away_team, score in zip(df['home_team'], df['away_team'], df['score']):
    # A score string has the form "<home goals>-<away goals>"
    home_goals, away_goals = (int(part) for part in score.split('-'))

    # Three points for a win, one each for a draw, none for a defeat
    if home_goals == away_goals:
        home_pts = away_pts = 1
    elif home_goals > away_goals:
        home_pts, away_pts = 3, 0
    else:
        home_pts, away_pts = 0, 3

    # Each side's goals against are the other side's goals for
    update_table(home_team, home_goals, away_goals, home_pts)
    update_table(away_team, away_goals, home_goals, away_pts)

# Turn the 'team' index back into an ordinary column before printing
table = table.reset_index()
print(table)


                     team  GF  GA  PTS
0       George Washington   6  21    6
1            Old Dominion  10   8   13
2                     VMI  10  16   10
3           Emory & Henry   2   4    0
4                Delaware  20  13   13
..                    ...  ..  ..  ...
234              Edgewood   0   1    0
235  Our Lady of the Lake   0   4    0
236              Westmont   2   2    1
237        Dominican (CA)   0   5    0
238         Wis.-Parkside   1   2    0

[239 rows x 4 columns]


In [9]:
# Standings ordered from most to fewest points
table.sort_values(by=['PTS'], ascending=False)

Unnamed: 0,team,GF,GA,PTS
170,Stanford,21,7,22
48,Elon,21,7,22
140,Ohio St.,22,4,22
144,Pittsburgh,18,5,21
59,Col. of Charleston,24,12,21
...,...,...,...,...
205,Erskine,0,9,0
151,Evangel,0,3,0
153,Grand View,0,3,0
173,Northwest (WA),1,4,0


In [64]:
# Keep an independent copy of the loaded results under a second name
data = df.copy(deep=True)

In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

# Create the empty extended standings table, indexed by team name: result
# counts, raw and adjusted goal totals, points, adjusted goal difference and
# a slot for the model's expected points (xPTS)
table = pd.DataFrame(
    columns=[
        'team',
        'Wins', 'Draws', 'Losses',
        'GF', 'GA',
        'Adjusted_GF', 'Adjusted_GA',
        'PTS', 'Adjusted_GD', 'xPTS',
    ]
).set_index('team')

# Function to update the table with wins, draws, losses, GF, GA, Adjusted GF, Adjusted GA, PTS, and Adjusted GD
def update_table(team, gf, ga, result, goal_cap=6):
    """Record one match for ``team`` in the module-level ``table``.

    Parameters
    ----------
    team : hashable
        Index label of the team's row. A zero-filled row is created the first
        time a team is seen.
    gf, ga : int
        Goals scored and conceded by ``team`` in this match.
    result : {'win', 'draw', 'loss'}
        Outcome from ``team``'s point of view. A win adds 3 points, a draw 1
        and a loss 0, and the matching Wins/Draws/Losses counter goes up by 1.
    goal_cap : int, optional
        Per-match ceiling applied to ``gf`` and ``ga`` before they are added
        to the ``Adjusted_GF`` / ``Adjusted_GA`` columns (default 6).

    Raises
    ------
    ValueError
        If ``result`` is not one of the three recognised outcomes. The check
        runs before anything is written, so ``table`` is left untouched.
    """
    # result -> (counter column to bump, points awarded)
    outcomes = {
        'win': ('Wins', 3),
        'draw': ('Draws', 1),
        'loss': ('Losses', 0),
    }
    if result not in outcomes:
        raise ValueError(
            f"result must be 'win', 'draw' or 'loss', got {result!r}"
        )
    tally_column, points = outcomes[result]

    if team not in table.index:
        # Initialize team data with all zeros, one per column of the table
        table.loc[team] = [0] * len(table.columns)

    # Uncapped goals for and against
    table.at[team, 'GF'] += gf
    table.at[team, 'GA'] += ga

    # Capped goals, so a single lopsided match cannot dominate the totals
    table.at[team, 'Adjusted_GF'] += min(gf, goal_cap)
    table.at[team, 'Adjusted_GA'] += min(ga, goal_cap)

    # Adjusted goal difference is always recomputed from the capped totals
    table.at[team, 'Adjusted_GD'] = table.at[team, 'Adjusted_GF'] - table.at[team, 'Adjusted_GA']

    # Result counter and points
    table.at[team, tally_column] += 1
    table.at[team, 'PTS'] += points

# Replay every match and record it from both teams' points of view
for home_team, away_team, score in zip(df['home_team'], df['away_team'], df['score']):
    # A score string has the form "<home goals>-<away goals>"
    home_goals, away_goals = (int(part) for part in score.split('-'))

    # Work out the outcome as seen by each side
    if home_goals == away_goals:
        home_result = away_result = 'draw'
    elif home_goals > away_goals:
        home_result, away_result = 'win', 'loss'
    else:
        home_result, away_result = 'loss', 'win'

    # Home side first, then the away side with the goal counts swapped
    update_table(home_team, home_goals, away_goals, home_result)
    update_table(away_team, away_goals, home_goals, away_result)

# Step 1: Prepare the feature set (X) and the target variable (y)
# NOTE(review): Adjusted_GD is computed as Adjusted_GF - Adjusted_GA, so it is
# an exact linear combination of two other features. Predictions are still
# usable, but individual coefficients should not be interpreted - TODO confirm
# whether all three columns are meant to be kept.
X = table[['GF', 'GA', 'Adjusted_GF', 'Adjusted_GA', 'Adjusted_GD']]  # Features: raw and capped goal totals
y = table['PTS']                                                      # Target: Points

# Step 2: Split the data into training and test sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train the linear regression model on the training rows only
model = LinearRegression()
model.fit(X_train, y_train)

# Step 4: Predict expected points for every team (training rows included)
table['xPTS'] = model.predict(X)

# Rank by expected points and turn the 'team' index back into a column
table = table.sort_values(by='xPTS', ascending=False).reset_index()

# Keep only the reporting columns; this selection is what removes
# Adjusted_GF / Adjusted_GA, so no separate drop() call is needed.
# .copy() makes the result an independent frame, so adding columns to it later
# does not write into a slice of the previous table.
table = table[['team', 'Wins', 'Draws', 'Losses', 'GF', 'GA', 'Adjusted_GD', 'PTS', 'xPTS']].copy()

In [80]:
# PD: actual points minus the model's expected points (positive = more points than predicted)
table['PD'] = table['PTS'].sub(table['xPTS'])

In [85]:
# Show the first 25 rows of the table
table.iloc[:25]

Unnamed: 0,team,Wins,Draws,Losses,GF,GA,Adjusted_GD,PTS,xPTS,PD
0,Ohio St.,7,1,0,22,4,18,22,23.103231,-1.103231
1,Col. of Charleston,7,0,1,24,12,12,21,22.355871,-1.355871
2,UC Santa Barbara,6,2,1,21,7,14,20,21.135198,-1.135198
3,Stanford,7,1,1,21,7,14,22,21.135198,0.864802
4,Elon,7,1,0,21,7,14,22,21.135198,0.864802
5,Wisconsin,6,0,1,20,6,14,18,20.505287,-2.505287
6,Gonzaga,6,2,2,22,12,10,20,20.426988,-0.426988
7,Western Mich.,6,3,0,20,7,13,21,20.170757,0.829243
8,Pittsburgh,7,0,1,18,5,13,21,18.910934,2.089066
9,West Virginia,5,2,0,18,6,12,17,18.576404,-1.576404
