In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor


from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Lift the row cap on rendered frames/series: None means every row is displayed.
pd.options.display.max_rows = None

In [2]:

def run_experiment(model):
    """Fit ``model`` on the current training split and print test-set metrics.

    The split is not passed in: ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` are read from the notebook's global namespace at call time, so
    the numbers reflect whichever split those names hold when this runs.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.

    Returns:
        None. R^2, MAE and RMSE for the test split are printed.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    r_squared = r2_score(y_test, predicted)
    print("R^2 : ", r_squared)

    mean_abs_err = mean_absolute_error(y_test, predicted)
    print("MAE :", mean_abs_err)

    # RMSE is the square root of the mean squared error.
    root_mean_sq_err = np.sqrt(mean_squared_error(y_test, predicted))
    print("RMSE:", root_mean_sq_err)

In [3]:
# Read the combined league table, using the first CSV column as the index.
df = pd.read_csv('../Data/Team Data/Combined/league_stats.csv', index_col=0)

# Trim surrounding whitespace from every column header.
df = df.rename(columns=str.strip)

# Store Year as integers.
df['Year'] = df['Year'].astype(int)

# Display the distinct years present in the table.
df['Year'].unique()


array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [4]:
# Clean the loaded table: tidy labels, remove the 'League' rows, fill gaps and
# derive the win percentage used as a feature and as the basis of the target.

# Trim whitespace from headers and team names so the string matching below is reliable.
df.columns = [name.strip() for name in df.columns]
df['Team'] = df['Team'].str.strip()

# Remove rows whose Team label contains 'League'.
# This is done with a boolean mask instead of df.drop(<index labels>): drop()
# removes every row that carries a matching label, and the index here comes
# straight from the CSV's first column (index_col=0) and is only rebuilt by
# reset_index below, so it is not guaranteed to be unique at this point. With
# repeated labels drop() would also delete team rows that merely share a label
# with a 'League' row. With a unique index both approaches give the same frame.
# .copy() keeps the following column assignment from warning about a view.
df = df.loc[~df['Team'].str.contains('League')].copy()

# Strip '*' characters from both ends of the team name.
df['Team'] = df['Team'].str.strip('*')

# Every missing value, in every column, becomes 0.
df = df.fillna(0)

# Rebuild a clean 0..n-1 index after the row removal.
df = df.reset_index(drop=True)

# Win percentage = W / (W + L); W and L are cast to float first.
df['Win%'] = df['W'].astype(float) / (df['W'].astype(float) + df['L'].astype(float))

In [5]:

# Columns of df that are fed to the models as inputs (selected via df[feature_list]).
# The order below is the column order of the feature matrix.
feature_list = [
    'FG%',
    '3P%',
    '2P%',
    'FT%',
    'TRB',
    'AST',
    'STL',
    'BLK',
    'TOV',
    'PF',
    'PTS',
    'Age',
    'ORtg',
    'DRtg',
    'Pace',
    'FTr',
    '3PAr',
    'TS%',
    'Win%',
]


In [6]:
# Build the prediction targets. Within each Team group, every row receives the
# Win% ('W_next') and W ('num_W_next') found two rows further down the frame;
# the last two rows of each team have no such row and are left as NaN.
#
# One grouped shift replaces the former per-team loop, which rescanned the whole
# frame with a boolean mask for every team and created no columns at all when
# the frame was empty.
#
# NOTE(review): shift(-2) looks two rows ahead, i.e. two seasons ahead when a
# team's rows are consecutive seasons in chronological order. Confirm that a
# two-season horizon is intended (the column is named 'W_next') and that rows
# are ordered by Year within each team.
per_team = df.groupby('Team', sort=False)
df['W_next'] = per_team['Win%'].shift(-2)
df['num_W_next'] = per_team['W'].shift(-2)

In [7]:
# Keep only the rows that have a target value (W_next is not NaN).
df = df.dropna(subset=['W_next'])

In [8]:
# Feature matrix: the feature_list columns. Target vector: the W_next column.
X = df.loc[:, feature_list]
y = df.loc[:, 'W_next']

In [9]:
# Random hold-out split: 15% of the rows go to the test set; the seed is fixed
# so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [47]:
# Optionally replace the random split with a by-season hold-out: every row whose
# Year equals `year` becomes the test set, all other rows the training set.
overwrite_test_set = True
year = 2021

if overwrite_test_set:
    held_out = df['Year'].eq(year)

    X_train, X_test = df.loc[~held_out, feature_list], df.loc[held_out, feature_list]
    y_train, y_test = df.loc[~held_out, 'W_next'], df.loc[held_out, 'W_next']

In [48]:
# Fit and score three regressors on the current train/test split.
# The tree-based models are randomised; a fixed random_state (42, the same seed
# the notebook uses for train_test_split) makes the printed metrics repeatable
# from run to run instead of changing on every execution.
print('linear reg')
run_experiment(LinearRegression())
print('DecisionTree')
run_experiment(tree.DecisionTreeRegressor(random_state=42))
print('Random Forest')
run_experiment(RandomForestRegressor(random_state=42))

linear reg
R^2 :  0.09378362967192322
MAE : 0.06430510908917536
RMSE: 0.08740988109943272
DecisionTree
R^2 :  -4.539821369725709
MAE : 0.17682926829268295
RMSE: 0.21611875572710237
Random Forest
R^2 :  -0.010508619733433644
MAE : 0.07443599329376177
RMSE: 0.09230273912330338


In [49]:
# Fit the forest used for the per-team predictions below. The seed is fixed so
# the fitted model, and every table derived from it, is the same on each run.
reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)

In [50]:
# Predict the target for every row of the current test split with the fitted forest.
y_pred = reg.predict(X_test)


In [51]:
# Collect the identifying and outcome columns for the test rows and attach the
# model output; y_pred is in X_test row order, which is the order selected here.
res = (
    df.loc[X_test.index, ['Team','Year','G','W','W_next','num_W_next']]
    .assign(Predicted=y_pred)
)

In [52]:
# Convert the predicted win fraction into a win count using the row's own G.
# astype(int) truncates toward zero; it does not round.
res['pred_W'] = (res['Predicted'] * res['G']).astype(int)

In [53]:
# Store the shifted win count as integers (via float first, then int).
res['num_W_next'] = res['num_W_next'].astype(float).astype(int)

In [54]:
# Shifted actual win count minus predicted win count, per row.
# NOTE(review): num_W_next is the W value of a later row, while pred_W is scaled
# by this row's G. If the two seasons had different numbers of games the two
# sides are not on the same scale; confirm this comparison is intended.
res['Predicted_W_Differential'] = res['num_W_next'].sub(res['pred_W'])

In [55]:
# Ranked prediction table: predicted wins (highest first) next to the actual
# target converted to wins with the same G, plus actual-minus-predicted.
# y_test * res['G'] aligns on the row index; astype(int) truncates toward zero.
predictions = (
    res[['Team', 'Year', 'pred_W']]
    .sort_values('pred_W', ascending=False)
    .assign(Actual=(y_test * res['G']).astype(float).astype(int))
    .assign(Differential=lambda table: table['Actual'] - table['pred_W'])
)
predictions

Unnamed: 0,Team,Year,pred_W,Actual,Differential
992,Philadelphia 76ers,2021,43,47,4
1005,Boston Celtics,2021,41,50,9
990,Utah Jazz,2021,39,32,-7
991,Phoenix Suns,2021,39,39,0
1012,Minnesota Timberwolves,2021,39,36,-3
1001,Atlanta Hawks,2021,39,36,-3
994,Denver Nuggets,2021,39,46,7
993,Brooklyn Nets,2021,39,39,0
1003,Golden State Warriors,2021,38,38,0
1004,Memphis Grizzlies,2021,38,44,6


In [177]:
# Average of the per-row win differential over the test rows.
res['Predicted_W_Differential'].mean()

-0.9545454545454546

In [183]:
# Display the Year column. Because display.max_rows was set to None in the
# first cell, every row of the column is printed rather than a truncated view.
df['Year']

0       1980
1       1980
2       1980
3       1980
4       1980
5       1980
6       1980
7       1980
8       1980
9       1980
10      1980
11      1980
12      1980
13      1980
14      1980
15      1980
16      1980
17      1980
18      1980
19      1980
20      1980
21      1980
22      1981
23      1981
24      1981
25      1981
26      1981
27      1981
28      1981
29      1981
30      1981
31      1981
32      1981
33      1981
34      1981
35      1981
36      1981
37      1981
38      1981
39      1981
40      1981
41      1981
42      1981
43      1981
44      1982
45      1982
46      1982
47      1982
48      1982
49      1982
50      1982
51      1982
52      1982
53      1982
54      1982
55      1982
56      1982
57      1982
58      1982
59      1982
60      1982
61      1982
62      1982
63      1982
64      1982
65      1982
66      1983
67      1983
68      1983
69      1983
70      1983
71      1983
72      1983
73      1983
74      1983
75      1983
76      1983

In [195]:
# Feature rows for the 2023 season, excluding any row whose Team contains 'League'.
# NOTE(review): an earlier cell drops rows whose W_next is NaN, which removes the
# last two rows of every team; confirm this runs against a df that still
# contains 2023 rows, otherwise the selection is empty.
X_test = df.loc[
    df['Year'].eq(2023) & ~df['Team'].str.contains('League'),
    feature_list,
]