In [70]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [165]:
#Build a full, sortable date for each game. The raw 'Date' column has no year,
#so the year is taken from 'Season'; January and February games fall in the following calendar year.
# Full English month name -> month number; built once instead of once per row.
_MONTH_NUMBERS = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12
}


def adjust_date(row):
    """Build a full ``MM/DD/YYYY`` date string for one match row.

    The raw ``Date`` value holds only a month name and a day (for example
    ``"September 8"``), so the year is taken from ``Season``. Games played in
    January or February are assigned to the calendar year after ``Season``
    because the season rolls over into the next year.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide ``row['Season']`` (a year that supports ``+ 1``) and
        ``row['Date']`` (a string of the form ``"<MonthName> <day>"``).

    Returns
    -------
    str
        The date as zero-padded ``MM/DD/YYYY``.

    Raises
    ------
    IndexError
        If ``Date`` does not contain both a month and a day.
    KeyError
        If the month is not a full English month name.
    ValueError
        If the day is not an integer.
    """
    season_year = row['Season']

    # split() with no argument tolerates repeated or surrounding whitespace.
    parts = row['Date'].split()
    month_number = _MONTH_NUMBERS[parts[0]]
    day = int(parts[1])

    # January and February games belong to the next calendar year.
    year = season_year + 1 if month_number <= 2 else season_year

    # Zero-pad the day as well so the result really is MM/DD/YYYY.
    return f"{month_number:02d}/{day:02d}/{year}"

In [166]:
# Clean the raw match log and derive the numeric columns used for modelling.
# Re-reading the CSV at the top keeps this cell safe to re-run.
matches = pd.read_csv('matches.csv')

# 'Date' has no year in the raw file; adjust_date rebuilds it as MM/DD/YYYY
# from 'Season'. The explicit format avoids relying on format inference.
matches['Date'] = matches.apply(adjust_date, axis=1)
matches['Date'] = pd.to_datetime(matches['Date'], format='%m/%d/%Y')

# Encode home/away as integer category codes.
matches['Home/AwayCode'] = matches['Home/Away'].astype('category').cat.codes

# Encode each opponent as an integer category code.
matches['OppCode'] = matches['Opp'].astype('category').cat.codes

# Hour column: keep only the text before the first ':' in 'Time'.
matches['Hour'] = matches['Time'].str.replace(':.+', '', regex=True).astype('int')

# Day of week from the parsed date (pandas numbering: Monday=0 ... Sunday=6).
matches['DayCode'] = matches['Date'].dt.dayofweek

# Convert the stat columns from objects to numeric dtypes.
# NOTE(review): the positional slice 10:24 depends on the CSV column order --
# confirm it still covers exactly the intended stat columns.
for column in matches.columns[10:24]:
    matches[column] = pd.to_numeric(matches[column])

# Binary target: 1 when 'Result' is 'W', 0 otherwise.
matches['Target'] = (matches['Result'] == 'W').astype('int')

In [306]:
# Baseline model: predict the result from venue and opponent codes only.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# Chronological split: rows dated before the cutoff train the model, rows dated
# after it are held out. Rows dated exactly on the cutoff fall in neither set.
# The training frame must be named `train`; it was previously assigned to
# `training` while `train` was used below, which fails on a fresh kernel.
train = matches[matches['Date'] < '04-04-2024']
test = matches[matches['Date'] > '04-04-2024']

predictors = ['Home/AwayCode', 'OppCode']
rf.fit(train[predictors], train['Target'])
preds = rf.predict(test[predictors])

In [307]:
# Accuracy of the baseline predictions against the held-out 'Target' values.
acc = accuracy_score(test['Target'], preds)
acc

0.6066176470588235

In [308]:
# Pair each held-out row's actual target with its prediction, then tabulate
# actual (rows) against predicted (columns).
combined = pd.DataFrame({'actual': test['Target'], 'prediction': preds})
pd.crosstab(combined['actual'], combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,189,113
1,101,141


In [309]:
# Precision of the baseline predictions on the held-out rows.
precision_score(test['Target'],preds)

0.5551181102362205

In [310]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of earlier rows to one group of matches.

    The group is sorted by ``'Date'``. For each row, the mean of ``cols`` over
    the preceding ``window`` rows is stored in ``new_cols``. ``closed='left'``
    excludes the current row, so a row's own stats never feed its own average.
    Rows whose rolling value is missing in any of ``new_cols`` (for example
    the first ``window`` rows of the group) are dropped.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a ``'Date'`` column and every name in
        ``cols``. The input frame is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the rolling-average columns, in the same order as ``cols``.
    window : int, default 3
        Number of preceding rows in each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added and incomplete
        rows removed; the original index labels are kept.
    """
    # sort_values returns a new frame, so the assignment below does not touch
    # the caller's data.
    ordered = group.sort_values('Date')
    ordered[new_cols] = ordered[cols].rolling(window, closed='left').mean()
    return ordered.dropna(subset=new_cols)

In [351]:
# Stats to average over each team's earlier games, and the names of the
# resulting feature columns.
cols = ['TmSc', 'OppSc', 'Off1stD', 'OffTotYd', 'OffPassY', 'OffRushY', 'OffTO', 'Def1stD', 'DefTotYd', 'DefPassY', 'DefRushY', 'DefTO', 'EPOff', 'EPDef', 'EPSp.']
new_cols = [f'{col}_rolling' for col in cols]

# Compute the rolling features one team at a time and stack the results with a
# fresh 0..n-1 index. Iterating the groups (instead of groupby.apply) keeps
# the 'Team' column in every pandas version and avoids the deprecation warning
# about apply operating on the grouping columns.
# The stray top-level call rolling_averages(group, ...) was removed: `group`
# is not defined at this level.
matches_rolling = pd.concat(
    [rolling_averages(team_games, cols, new_cols) for _, team_games in matches.groupby('Team')],
    ignore_index=True,
)

  matches_rolling = matches.groupby('Team').apply(lambda x: rolling_averages(x, cols, new_cols))


In [352]:
def make_predictions(data, predictors, cutoff='04-04-2024', model=None):
    """Fit a classifier on rows before ``cutoff`` and score it on rows after.

    Prints the accuracy on the held-out rows as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'Date'``, ``'Target'`` and every column in
        ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str, default '04-04-2024'
        Rows with ``Date`` strictly before this value are used for training,
        rows strictly after it for testing. Rows dated exactly on the cutoff
        are used for neither.
    model : estimator with ``fit``/``predict``, optional
        Model to train. Defaults to the notebook-level ``rf``, which is
        refitted in place.

    Returns
    -------
    tuple
        ``(combined, precision)``: ``combined`` is a DataFrame with columns
        ``'actual'`` and ``'prediction'`` indexed like the held-out rows;
        ``precision`` is the value returned by ``precision_score``.
    """
    # Fall back to the shared notebook model so existing calls behave as before.
    clf = rf if model is None else model

    train = data[data['Date'] < cutoff]
    test = data[data['Date'] > cutoff]

    clf.fit(train[predictors], train['Target'])
    preds = clf.predict(test[predictors])

    combined = pd.DataFrame({'actual': test['Target'], 'prediction': preds})
    precision = precision_score(test['Target'], preds)
    print(accuracy_score(test['Target'], preds))
    return combined, precision

In [353]:
# Retrain with the rolling-average features added to the baseline predictors.
new_predictors = [*predictors, *new_cols]
combined, precision = make_predictions(data=matches_rolling, predictors=new_predictors)

0.5872093023255814


In [354]:
# Display the precision value returned by make_predictions.
precision

0.5568627450980392

In [336]:
# Attach date, team, opponent and result to each prediction, joined on the row
# index. Only 'actual' and 'prediction' are taken from `combined`, so
# re-running this cell does not pile up suffixed duplicates (Date_x, Date_y...).
combined = combined[['actual', 'prediction']].merge(
    matches_rolling[['Date', 'Team', 'Opp', 'Result']],
    left_index=True,
    right_index=True,
)
combined

Unnamed: 0,actual,prediction,Date,Team,Opp,Result
31,0,0,2024-09-08,Arizona Cardinals,Buffalo Bills,L
32,1,0,2024-09-15,Arizona Cardinals,Los Angeles Rams,W
33,0,1,2024-09-22,Arizona Cardinals,Detroit Lions,L
34,0,0,2024-09-29,Arizona Cardinals,Washington Commanders,L
35,1,0,2024-10-06,Arizona Cardinals,San Francisco 49ers,W
...,...,...,...,...,...,...
1501,0,0,2024-11-24,Washington Commanders,Dallas Cowboys,L
1502,1,0,2024-12-01,Washington Commanders,Tennessee Titans,W
1503,1,0,2024-12-15,Washington Commanders,New Orleans Saints,W
1504,1,1,2024-12-22,Washington Commanders,Philadelphia Eagles,W


In [303]:
# Self-join: match each row to the row of the same date whose 'Opp' equals this
# row's 'Team'. Left-side columns get the _x suffix, right-side columns _y.
merged = combined.merge(
    right=combined,
    left_on=['Date', 'Team'],
    right_on=['Date', 'Opp'],
)
merged

Unnamed: 0,actual_x,prediction_x,Date,Team_x,Opp_x,Result_x,actual_y,prediction_y,Team_y,Opp_y,Result_y
0,0,0,2024-09-08,Arizona Cardinals,Buffalo Bills,L,1,1,Buffalo Bills,Arizona Cardinals,W
1,1,0,2024-09-15,Arizona Cardinals,Los Angeles Rams,W,0,1,Los Angeles Rams,Arizona Cardinals,L
2,0,1,2024-09-22,Arizona Cardinals,Detroit Lions,L,1,1,Detroit Lions,Arizona Cardinals,W
3,0,0,2024-09-29,Arizona Cardinals,Washington Commanders,L,1,0,Washington Commanders,Arizona Cardinals,W
4,1,0,2024-10-06,Arizona Cardinals,San Francisco 49ers,W,0,1,San Francisco 49ers,Arizona Cardinals,L
...,...,...,...,...,...,...,...,...,...,...,...
507,0,0,2024-11-24,Washington Commanders,Dallas Cowboys,L,1,0,Dallas Cowboys,Washington Commanders,W
508,1,0,2024-12-01,Washington Commanders,Tennessee Titans,W,0,0,Tennessee Titans,Washington Commanders,L
509,1,0,2024-12-15,Washington Commanders,New Orleans Saints,W,0,0,New Orleans Saints,Washington Commanders,L
510,1,1,2024-12-22,Washington Commanders,Philadelphia Eagles,W,0,0,Philadelphia Eagles,Washington Commanders,L


In [280]:
# Actual outcomes for rows where the left side is predicted 1 and the matched
# right side is predicted 0.
merged.loc[
    (merged['prediction_x'] == 1) & (merged['prediction_y'] == 0),
    'actual_x',
].value_counts()

actual_x
1    79
0    41
Name: count, dtype: int64

In [281]:
# Write the predictions table to predictions.csv (relative path); the frame's
# index is written too, since to_csv is called with its defaults.
combined.to_csv('predictions.csv')

In [304]:
# Print one "name: importance" line per predictor for the fitted `rf` model.
# NOTE(review): the labels are only correct if `rf` was last fitted on
# `new_predictors`, in that order -- confirm.
feature_importances = rf.feature_importances_
for name, score in zip(new_predictors, feature_importances):
    print(f"{name}: {score}")


Home/AwayCode: 0.018200863301765888
OppCode: 0.06883345247362141
TmSc_rolling: 0.06325527445344928
OppSc_rolling: 0.05907965739221923
Off1stD_rolling: 0.05756275795680412
OffTotYd_rolling: 0.07199893974520674
OffPassY_rolling: 0.07146615053789389
OffRushY_rolling: 0.0665568222425732
OffTO_rolling: 0.03251318925275419
Def1stD_rolling: 0.046372605835869746
DefTotYd_rolling: 0.06319444959548429
DefPassY_rolling: 0.059846053114983574
DefRushY_rolling: 0.0756024093374357
DefTO_rolling: 0.039105838970269056
EPOff_rolling: 0.08094120256996698
EPDef_rolling: 0.059455925619214374
EPSp._rolling: 0.06601440760048821
