In [979]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [980]:
#function to make sure the date is a valid date for sorting data and not one from the future
#this function will add the year onto the date as it currently doesn't have it 
def adjust_date(row):
    
    season_year = row['Season']
    month = row['Date'].split(' ')[0] 
    day = row['Date'].split(' ')[1]
    
    month_numbers = {
        "January": 1, "February": 2, "March": 3, "April": 4,
        "May": 5, "June": 6, "July": 7, "August": 8,
        "September": 9, "October": 10, "November": 11, "December": 12
    }
    month_number = month_numbers[month]

    #adjusting for january and february games as the season rolls into the next year
    year = season_year + 1 if month_number in [1, 2] else season_year

    # Return the adjusted date in MM/DD/YYYY format
    return f"{month_number:02d}/{day}/{year}"

In [981]:
#cleaning data to prepare it for machine learning
#matches.dtypes
matches = pd.read_csv('matches.csv')

#Removing playoff row error due to differing season lengths
#playoffIdx = match_df[ (match_df['Date'] == 'Playoffs')].index
#match_df =  match_df.drop(playoffIdx)
#match_df = match_df.reset_index(drop = 'true')
#match_df.to_csv('matches.csv', index=False, encoding='utf-8')
#match_df


#Removing playoff row error due to differing season lengths
playoffIdx = matches[ (matches['Date'] == 'Playoffs')].index
matches =  matches.drop(playoffIdx)
matches = matches.reset_index(drop = 'true')
matches.to_csv('matches.csv', index=False, encoding='utf-8')

#converting dates to proper dates so that they can be stored as date types
matches['Date'] = matches.apply(adjust_date, axis=1)
matches['Date'] = pd.to_datetime(matches['Date'])


#converting home/away into numeric data
matches['Home/AwayCode'] = matches['Home/Away'].astype('category').cat.codes #0 is away, 1 is home



#converting team into numeric data
#matches['TeamCode'] = matches['Team'].astype('category').cat.codes

#converting opponents into numeric data
#matches['OppCode'] = matches['Opp'].astype('category').cat.codes

#adding hour column 
matches['Hour'] = matches['Time'].str.replace(':.+', '', regex = True).astype('int')


#adding day of the week column
matches['DayCode'] = matches['Date'].dt.dayofweek
#NOTE STILL NEED TO CLEAN UP DATE FIRST


#converting all other stats from objects to ints
for column in matches.columns[10:24]:
    matches[column] = pd.to_numeric(matches[column])

#adding in team year and opp year column to make teams separate by year
matches['TeamYear'] = matches['Team'].astype('str') + '' + matches['Season'].apply(str)
matches['OppYear'] = matches['Opp'].astype('str') + '' + matches['Season'].apply(str)


matches['TeamCode'] = matches['TeamYear'].astype('category').cat.codes

matches['OppCode'] = matches['OppYear'].astype('category').cat.codes

#converting Win/Loss to numbers
matches['Target'] = (matches['Result'] == 'W').astype('int')

In [982]:
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)
training = matches[matches['Date'] < '12-24-2024']
train = matches[matches['Date'] > '04-04-2022'] #delete this and change above line to fix
test = matches[matches['Date'] > '12-24-2024']
predictors = ['Home/AwayCode', 'TeamCode', 'OppCode']
rf.fit(train[predictors], train['Target'])
preds = rf.predict(test[predictors])

In [983]:
acc = accuracy_score(test['Target'], preds)
acc

0.75

In [984]:
precision_score(test['Target'],preds)

0.2

In [985]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('Date')
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [998]:
#cols = ['TmSc', 'OppSc', 'Off1stD', 'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd', 'DefPassY', 'DefRushY', 'DefTO', 'EPOff', 'EPDef', 'EPSp.']
#cols = ['TmSc', 'OppSc', 'Off1stD', 'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd', 'DefPassY', 'DefRushY', 'DefTO']
cols = ['TmSc', 'OppSc', 'OffTotYd', 'DefTotYd', 'OffRushY', 'OffTO']
#cols = ['TmSc', 'OppSc']
#cols = ['EPOff', 'EPDef', 'EPSp.']
predictors = ['Home/AwayCode', 'OppCode', 'TeamCode']

new_cols = [f'{col}_rolling' for col in cols]
matches_rolling = matches.groupby('Team').apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('Team')
matches_rolling.index = range(matches_rolling.shape[0])




def make_predictions(data, predictors):
    rf = RandomForestClassifier(n_estimators = 100, min_samples_split = 10, random_state = 1)
    training = data[data['Date'] < '04-04-2024']
    train = training[training['Date'] > '04-04-2022'] #delete this and change above line to fix
    test = data[data['Date'] > '04-04-2024']
    rf.fit(train[predictors], train['Target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['Target'], prediction=preds))
    precision = precision_score(test['Target'],preds)
    print("Accuracy", accuracy_score(test['Target'], preds))
    return combined, precision, rf

new_predictors = predictors + new_cols
combined, precision, rf = make_predictions(matches_rolling, predictors)
print("Precision", precision)

Accuracy 0.63671875
Precision 0.6085271317829457


  matches_rolling = matches.groupby('Team').apply(lambda x: rolling_averages(x, cols, new_cols))


In [987]:
combined = combined.merge(matches_rolling[['Date','Team','Opp','Result']], left_index = True, right_index = True)
merged = combined.merge(combined, left_on = ['Date', 'Team'], right_on = ['Date', 'Opp'])
merged[(merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)]['actual_x'].value_counts()

actual_x
1    149
0     93
Name: count, dtype: int64

In [988]:
combined.to_csv('predictions.csv')

In [989]:
feature_importances = rf.feature_importances_
for feature, importance in zip(new_predictors, feature_importances):

    print(f"{feature}: {importance}") 


Home/AwayCode: 0.03523801372361643
OppCode: 0.4755374819047376
TeamCode: 0.48922450437164605


In [1003]:
#get week 17 games
week17 = matches[matches['Season'] == 2024].copy(deep = True)
week17 = week17[week17['Week'] == 17]
week17
week17_preds = rf.predict(week17[predictors])
combined = pd.DataFrame(dict(actual=test['Target'], prediction=preds))
week17_table = combined.merge(week17[['Date','Team','Opp']], left_index = True, right_index = True)
week17_table = week17_table[ week17_table['Date'] <= pd.Timestamp.now()]
week17_table

Unnamed: 0,actual,prediction,Date,Team,Opp
79,1,1,2024-12-25,Baltimore Ravens,Houston Texans
95,0,0,2024-12-25,Pittsburgh Steelers,Kansas City Chiefs
143,0,0,2024-12-25,Houston Texans,Baltimore Ravens
207,1,1,2024-12-25,Kansas City Chiefs,Pittsburgh Steelers
383,0,0,2024-12-26,Chicago Bears,Seattle Seahawks
479,0,0,2024-12-26,Seattle Seahawks,Chicago Bears
