In [516]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [517]:
#function to make sure the date is a valid date for sorting data and not one from the future
#this function will add the year onto the date as it currently doesn't have it 
def adjust_date(row):
    
    season_year = row['Season']
    month = row['Date'].split(' ')[0] 
    day = row['Date'].split(' ')[1]
    
    month_numbers = {
        "January": 1, "February": 2, "March": 3, "April": 4,
        "May": 5, "June": 6, "July": 7, "August": 8,
        "September": 9, "October": 10, "November": 11, "December": 12
    }
    month_number = month_numbers[month]

    #adjusting for january and february games as the season rolls into the next year
    year = season_year + 1 if month_number in [1, 2] else season_year

    # Return the adjusted date in MM/DD/YYYY format
    return f"{month_number:02d}/{day}/{year}"

In [767]:
#cleaning data to prepare it for machine learning
#matches.dtypes
matches = pd.read_csv('matches.csv')

#Removing playoff row error due to differing season lengths
#playoffIdx = match_df[ (match_df['Date'] == 'Playoffs')].index
#match_df =  match_df.drop(playoffIdx)
#match_df = match_df.reset_index(drop = 'true')
#match_df.to_csv('matches.csv', index=False, encoding='utf-8')
#match_df


#Removing playoff row error due to differing season lengths
playoffIdx = matches[ (matches['Date'] == 'Playoffs')].index
matches =  matches.drop(playoffIdx)
matches = matches.reset_index(drop = 'true')
matches.to_csv('matches.csv', index=False, encoding='utf-8')

#converting dates to proper dates so that they can be stored as date types
matches['Date'] = matches.apply(adjust_date, axis=1)
matches['Date'] = pd.to_datetime(matches['Date'])


#converting home/away into numeric data
matches['Home/AwayCode'] = matches['Home/Away'].astype('category').cat.codes
matches['Home/AwayCode']


#converting opponents into numeric data
matches['OppCode'] = matches['Opp'].astype('category').cat.codes

#adding hour column 
matches['Hour'] = matches['Time'].str.replace(':.+', '', regex = True).astype('int')


#adding day of the week column
matches['DayCode'] = matches['Date'].dt.dayofweek
#NOTE STILL NEED TO CLEAN UP DATE FIRST


#converting all other stats from objects to ints
for column in matches.columns[10:24]:
    matches[column] = pd.to_numeric(matches[column])


#converting Win/Loss to numbers
matches['Target'] = (matches['Result'] == 'W').astype('int')


In [783]:
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)
training = matches[matches['Date'] < '04-04-2024']
train = matches[matches['Date'] > '04-04-2023'] #delete this and change above line to fix
test = matches[matches['Date'] > '04-04-2024']
predictors = ['Home/AwayCode', 'OppCode', 'DayCode']
rf.fit(train[predictors], train['Target'])
preds = rf.predict(test[predictors])

In [777]:
acc = accuracy_score(test['Target'], preds)
acc

0.7040441176470589

In [778]:
combined = pd.DataFrame(dict(actual=test['Target'], prediction=preds))
pd.crosstab(index=combined['actual'], columns = combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,221,81
1,80,162


In [779]:
precision_score(test['Target'],preds)

0.6666666666666666

In [771]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('Date')
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [790]:
#cols = ['TmSc', 'OppSc', 'Off1stD', 'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd', 'DefPassY', 'DefRushY', 'DefTO', 'EPOff', 'EPDef', 'EPSp.']
#cols = ['TmSc', 'OppSc', 'Off1stD', 'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd', 'DefPassY', 'DefRushY', 'DefTO']
cols = ['TmSc', 'OppSc', 'OffTotYd', 'DefTotYd', 'OffRushY', 'OffTO']
#cols = ['TmSc', 'OppSc']
#cols = ['EPOff', 'EPDef', 'EPSp.']
predictors = ['Home/AwayCode', 'OppCode', 'DayCode']

new_cols = [f'{col}_rolling' for col in cols]
rolling_averages(group, cols, new_cols)
matches_rolling = matches.groupby('Team').apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('Team')
matches_rolling.index = range(matches_rolling.shape[0])




def make_predictions(data, predictors):
    rf = RandomForestClassifier(n_estimators = 100, min_samples_split = 10, random_state = 1)
    training = data[data['Date'] < '04-04-2024']
    train = training[training['Date'] > '04-04-2022'] #delete this and change above line to fix
    test = data[data['Date'] > '04-04-2024']
    rf.fit(train[predictors], train['Target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['Target'], prediction=preds))
    precision = precision_score(test['Target'],preds)
    print("Accuracy", accuracy_score(test['Target'], preds))
    return combined, precision, rf

new_predictors = predictors + new_cols
combined, precision, rf = make_predictions(matches_rolling, new_predictors)
print("Precision", precision)

Accuracy 0.5968992248062015
Precision 0.5726495726495726


  matches_rolling = matches.groupby('Team').apply(lambda x: rolling_averages(x, cols, new_cols))


Accuracy 0.5968992248062015
Precision 0.56640625


In [791]:
combined = combined.merge(matches_rolling[['Date','Team','Opp','Result']], left_index = True, right_index = True)
merged = combined.merge(combined, left_on = ['Date', 'Team'], right_on = ['Date', 'Opp'])

In [792]:
merged[(merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)]['actual_x'].value_counts()

actual_x
1    87
0    48
Name: count, dtype: int64

In [793]:
combined.to_csv('predictions.csv')

In [794]:
feature_importances = rf.feature_importances_
for feature, importance in zip(new_predictors, feature_importances):

    print(f"{feature}: {importance}") 


Home/AwayCode: 0.026211059861506847
OppCode: 0.13522933657130096
DayCode: 0.03071874978397106
TmSc_rolling: 0.14322105065594806
OppSc_rolling: 0.13404380817621103
OffTotYd_rolling: 0.1668782009192601
DefTotYd_rolling: 0.14462225168429274
OffRushY_rolling: 0.14890925036049446
OffTO_rolling: 0.07016629198701486
