<h1 align="center"><font color='green'>NBA Predictions</font></h1>

### <font color='289C4E'>Table of contents<font><a class='anchor' id='top'></a>
- [Processing Data Cleaning](#1)
- [Feature Selection](#2)
- [Modelling](#3)
- [Conclusion](#4)

<h2 align="center"> <font color='grey'>Processing Data Cleaning</font></h2>

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector


 # setting output lengths for panda DFs
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_row',50)


In [44]:
df = pd.read_csv('Clean.csv').set_index('date').sort_index()
dkeep=['Season','team','team_opp','home']
keep=df.columns[~df.columns.isin(dkeep)]

In [45]:
Splits = {'ATL':df[df['team']=='ATL'], 'CHI':df[df['team']=='CHI'], 'DET':df[df['team']=='DET'], 'CLE':df[df['team']=='CLE'], 'NOP':df[df['team']=='NOP'], 'GSW':df[df['team']=='GSW'], 'WAS':df[df['team']=='WAS'], 'ORL':df[df['team']=='ORL'], 'TOR':df[df['team']=='TOR'],
       'IND':df[df['team']=='IND'], 'MIA':df[df['team']=='MIA'], 'BOS':df[df['team']=='BOS'], 'PHI':df[df['team']=='PHI'], 'CHO':df[df['team']=='CHO'], 'UTA':df[df['team']=='UTA'], 'BRK':df[df['team']=='BRK'], 'OKC':df[df['team']=='OKC'], 'MEM': df[df['team']=='MEM'],
       'NYK':df[df['team']=='NYK'], 'MIL':df[df['team']=='MIL'], 'SAS':df[df['team']=='SAS'], 'DEN':df[df['team']=='DEN'], 'HOU':df[df['team']=='HOU'], 'DAL':df[df['team']=='DAL'], 'LAC':df[df['team']=='LAC'], 'SAC':df[df['team']=='SAC'], 'PHO':df[df['team']=='PHO'],
       'POR':df[df['team']=='POR'], 'MIN':df[df['team']=='MIN'], 'LAL':df[df['team']=='LAL']}
       

In [46]:
SeasonSplit ={}

In [47]:
for season in range(2016,2024):
    for key in Splits:
        tempDF = Splits[key]
        tempDF = tempDF[tempDF['Season']==season]
        SeasonSplit[key+str(season)]= tempDF  


In [48]:
df = df.reset_index()
AVG = pd.DataFrame(columns=df.columns)
tempDFs = []

for key in SeasonSplit.keys():
    tempDF = pd.DataFrame(SeasonSplit[key]).reset_index().sort_values('date')
    tempDF2 = tempDF.copy()
    for index, row in tempDF.iterrows():
        tempDF2.loc[index, keep] = tempDF.loc[:index, keep].mean()
    tempDFs.append(tempDF2)

AVG = pd.concat(tempDFs, axis=0, ignore_index=True)


In [49]:
def Won_Next(team):
    team['won_next']=team['won'].shift(-1)
    return team
def Next_team(team):
    team['next_opp'] = team['team_opp'].shift(-1)
    return team
def Next_date(team):
    team['next_date'] = team['date'].shift(-1)
    return team
def Next_home(team):
    team['home_next'] = team['home'].shift(-1)
    return team

In [50]:
df.sort_values('date',inplace=True)
df =df.groupby(['team','Season'],group_keys=False).apply(Won_Next)
df =df.groupby(['team','Season'],group_keys=False).apply(Next_date)
df =df.groupby(['team','Season'],group_keys=False).apply(Next_team)
df =df.groupby(['team','Season'],group_keys=False).apply(Next_home)

In [51]:
dkeep.append('next_opp')
dkeep.append('next_date')
dkeep.append('won_next')
dkeep.append('home_next')

In [52]:
def getAvg10(team):
    rolling = team.rolling(10).mean()
    return rolling

In [53]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [54]:
df.set_index('date',inplace=True)
df.sort_index(inplace=True)
AVG.set_index('date',inplace=True)
AVG.sort_index(inplace=True)

In [55]:
rolling10 = df.copy()

In [56]:
numeric = df.select_dtypes(include=np.number).columns.tolist()

In [57]:
rolling10[numeric] = rolling10.groupby(['team','Season'],group_keys=False)[numeric].apply(getAvg10)

In [58]:
rolling10.columns = [f'{col}_10' for col in rolling10.columns]
AVG.columns = [f'{col}_S_avg' for col in AVG.columns]
AVGcols =list(AVG.columns.drop(['Season_S_avg','home_S_avg','team_S_avg']))
rolcol = list(rolling10.columns.drop('Season_10'))

In [59]:
df = df.sort_index()
rolling10 =rolling10.sort_index()
AVG = AVG.sort_index()

In [60]:
Alldf = pd.concat([rolling10,AVG,df], axis=1).dropna()

In [61]:
print('The data have ', Alldf.shape[0], ' rows and ', Alldf.shape[1], ' columns\n')
print('column names: \n')
print('\n'.join(list(Alldf.columns)))

The data have  16811  rows and  443  columns

column names: 

mp_total_10
mp_total.1_10
fg_total_10
fga_total_10
fg%_total_10
3p_total_10
3pa_total_10
3p%_total_10
ft_total_10
fta_total_10
ft%_total_10
orb_total_10
drb_total_10
trb_total_10
ast_total_10
stl_total_10
blk_total_10
tov_total_10
pf_total_10
pts_total_10
ts%_total_10
efg%_total_10
3par_total_10
ftr_total_10
orb%_total_10
drb%_total_10
trb%_total_10
ast%_total_10
stl%_total_10
blk%_total_10
tov%_total_10
usg%_total_10
ortg_total_10
drtg_total_10
fg_max_10
fga_max_10
fg%_max_10
3p_max_10
3pa_max_10
3p%_max_10
ft_max_10
fta_max_10
ft%_max_10
orb_max_10
drb_max_10
trb_max_10
ast_max_10
stl_max_10
blk_max_10
tov_max_10
pf_max_10
pts_max_10
+/-_max_10
ts%_max_10
efg%_max_10
3par_max_10
ftr_max_10
orb%_max_10
drb%_max_10
trb%_max_10
ast%_max_10
stl%_max_10
blk%_max_10
tov%_max_10
usg%_max_10
ortg_max_10
drtg_max_10
team_10
total_10
home_10
index_opp_10
mp_total_opp_10
mp_total_opp.1_10
fg_total_opp_10
fga_total_opp_10
fg%_total_op

In [62]:
get_rid=['Season_10','Season_S_avg','home_S_avg','team_S_avg']
keep=Alldf.columns[~Alldf.columns.isin(get_rid)]
Alldf= Alldf[keep].dropna()

In [63]:
Alldf.reset_index(inplace=True)

In [64]:
AVGcols.extend(rolcol)
AVGcols.extend(['next_opp','team','next_date','ELO','home_next'])

In [65]:
Alldf =Alldf.reset_index()

In [66]:
BothTeams = Alldf.merge(Alldf[AVGcols],left_on=['team','next_date'],right_on=['next_opp','next_date'])

In [67]:
Alldf = Alldf.set_index('date').sort_index()
BothTeams = BothTeams.set_index('date').sort_index()

In [68]:
print('The data have ', BothTeams.shape[0], ' rows and ', BothTeams.shape[1], ' columns\n')
print('column names: \n')
print('\n'.join(list(BothTeams.columns)))

The data have  16584  rows and  734  columns

column names: 

index
mp_total_10_x
mp_total.1_10_x
fg_total_10_x
fga_total_10_x
fg%_total_10_x
3p_total_10_x
3pa_total_10_x
3p%_total_10_x
ft_total_10_x
fta_total_10_x
ft%_total_10_x
orb_total_10_x
drb_total_10_x
trb_total_10_x
ast_total_10_x
stl_total_10_x
blk_total_10_x
tov_total_10_x
pf_total_10_x
pts_total_10_x
ts%_total_10_x
efg%_total_10_x
3par_total_10_x
ftr_total_10_x
orb%_total_10_x
drb%_total_10_x
trb%_total_10_x
ast%_total_10_x
stl%_total_10_x
blk%_total_10_x
tov%_total_10_x
usg%_total_10_x
ortg_total_10_x
drtg_total_10_x
fg_max_10_x
fga_max_10_x
fg%_max_10_x
3p_max_10_x
3pa_max_10_x
3p%_max_10_x
ft_max_10_x
fta_max_10_x
ft%_max_10_x
orb_max_10_x
drb_max_10_x
trb_max_10_x
ast_max_10_x
stl_max_10_x
blk_max_10_x
tov_max_10_x
pf_max_10_x
pts_max_10_x
+/-_max_10_x
ts%_max_10_x
efg%_max_10_x
3par_max_10_x
ftr_max_10_x
orb%_max_10_x
drb%_max_10_x
trb%_max_10_x
ast%_max_10_x
stl%_max_10_x
blk%_max_10_x
tov%_max_10_x
usg%_max_10_x
ortg_

In [69]:
dkeep =['next_opp_y','team_opp_10_x','team_opp_10_y','next_opp_10_y','next_opp_10_x','team_y','won_next_10_y','home_10_y','next_opp_x','team_x','won_next_10_x','home_10_x','next_date','won_next','Season','next_date','team_opp','team_10_x','team_10_y','next_date_10_y','next_date_10_x','index_opp_S_avg_x','team_opp_S_avg_x','index_opp_S_avg_y','team_opp_S_avg_y']
BothTeams[dkeep+['won']].sort_values(['team_x'])

Unnamed: 0_level_0,next_opp_y,team_opp_10_x,team_opp_10_y,next_opp_10_y,next_opp_10_x,team_y,won_next_10_y,home_10_y,next_opp_x,team_x,won_next_10_x,home_10_x,next_date,won_next,Season,next_date,team_opp,team_10_x,team_10_y,next_date_10_y,next_date_10_x,index_opp_S_avg_x,team_opp_S_avg_x,index_opp_S_avg_y,team_opp_S_avg_y,won
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2017-04-19 19:00:00,ATL,WAS,ATL,ATL,WAS,WAS,0.5,0.3,WAS,ATL,0.5,0.4,2017-04-22 17:30:00,1.0,2017,2017-04-22 17:30:00,WAS,ATL,WAS,2017-04-22 17:30:00,2017-04-22 17:30:00,0.488095,ATL,0.511905,WAS,0
2019-02-25 20:00:00,ATL,HOU,SAC,ATL,MIN,MIN,0.4,0.7,MIN,ATL,0.9,0.4,2019-02-27 19:30:00,1.0,2019,2019-02-27 19:30:00,HOU,ATL,MIN,2019-02-27 19:30:00,2019-02-27 19:30:00,0.516129,PHI,0.500000,MIN,0
2017-01-15 15:00:00,ATL,MIL,TOR,ATL,NYK,NYK,0.2,0.4,NYK,ATL,0.6,0.5,2017-01-16 13:00:00,1.0,2017,2017-01-16 13:00:00,MIL,ATL,NYK,2017-01-16 13:00:00,2017-01-16 13:00:00,0.475000,NYK,0.525000,MIL,1
2021-03-28 21:00:00,ATL,DEN,CHO,ATL,PHO,PHO,0.6,0.5,PHO,ATL,0.6,0.3,2021-03-30 22:00:00,0.0,2021,2021-03-30 22:00:00,DEN,ATL,PHO,2021-03-30 22:00:00,2021-03-30 22:00:00,0.520548,ATL,0.480769,CHO,0
2017-01-16 13:00:00,ATL,NYK,LAL,ATL,DET,DET,0.5,0.3,DET,ATL,0.3,0.5,2017-01-18 20:00:00,0.0,2017,2017-01-18 20:00:00,NYK,ATL,DET,2017-01-18 20:00:00,2017-01-18 20:00:00,0.500000,ATL,0.511111,DET,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-11-25 19:00:00,WAS,POR,PHO,WAS,MIN,MIN,0.6,0.3,MIN,WAS,0.6,0.5,2017-11-28 20:00:00,1.0,2018,2017-11-28 20:00:00,POR,WAS,MIN,2017-11-28 20:00:00,2017-11-28 20:00:00,0.555556,ORL,0.500000,PHO,0
2019-02-27 19:30:00,WAS,BRK,POR,WAS,BOS,BOS,0.4,0.5,BOS,WAS,0.4,0.4,2019-03-01 20:00:00,0.0,2019,2019-03-01 20:00:00,BRK,WAS,BOS,2019-03-01 20:00:00,2019-03-01 20:00:00,0.508197,ATL,0.492063,CHI,1
2017-01-10 19:00:00,WAS,CHI,TOR,WAS,BOS,BOS,0.1,0.6,BOS,WAS,0.5,0.6,2017-01-11 20:00:00,0.0,2017,2017-01-11 20:00:00,CHI,WAS,BOS,2017-01-11 20:00:00,2017-01-11 20:00:00,0.432432,CHI,0.552632,TOR,1
2015-12-26 16:00:00,WAS,BRK,UTA,WAS,LAC,LAC,0.4,0.6,LAC,WAS,0.3,0.8,2015-12-28 19:00:00,0.0,2016,2015-12-28 19:00:00,BRK,WAS,LAC,2015-12-28 19:00:00,2015-12-28 19:00:00,0.466667,WAS,0.483871,UTA,1


In [70]:
keep=BothTeams.columns[~BothTeams.columns.isin(dkeep)]

In [71]:
BothTeams.sort_index(inplace=True)
X=BothTeams[keep]
y = BothTeams['won_next']
logreg=LogisticRegression()

In [72]:
Alldf[['team','team_opp','next_opp','next_date','won','won_next']].iloc[:20]

Unnamed: 0_level_0,team,team_opp,next_opp,next_date,won,won_next
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-11-11 20:00:00,MEM,GSW,POR,2015-11-13 20:00:00,0,1.0
2015-11-12 20:00:00,MIN,GSW,IND,2015-11-13 19:00:00,0,0.0
2015-11-13 19:00:00,ORL,UTA,WAS,2015-11-14 19:00:00,1,0.0
2015-11-13 19:00:00,IND,MIN,CHI,2015-11-16 20:00:00,1,0.0
2015-11-13 19:30:00,NOP,TOR,NYK,2015-11-15 12:00:00,0,0.0
2015-11-13 19:30:00,CLE,NYK,MIL,2015-11-14 20:30:00,1,0.0
2015-11-13 19:30:00,TOR,NOP,SAC,2015-11-15 21:00:00,1,0.0
2015-11-13 20:00:00,CHO,CHI,POR,2015-11-15 17:00:00,0,1.0
2015-11-13 20:00:00,OKC,PHI,BOS,2015-11-15 19:00:00,1,0.0
2015-11-13 22:00:00,BRK,SAC,GSW,2015-11-14 22:30:00,0,0.0


In [73]:
BothTeams[['team_x','team_opp','team_y','next_date','won','won_next']].iloc[:20]

Unnamed: 0_level_0,team_x,team_opp,team_y,next_date,won,won_next
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-11-14 19:00:00,ORL,WAS,MIN,2015-11-18 19:00:00,0,1.0
2015-11-14 20:00:00,DAL,HOU,PHI,2015-11-16 19:00:00,1,1.0
2015-11-14 20:30:00,SAS,PHI,POR,2015-11-16 20:30:00,1,1.0
2015-11-14 20:30:00,PHI,SAS,DAL,2015-11-16 19:00:00,0,0.0
2015-11-14 21:00:00,PHO,DEN,LAL,2015-11-16 21:00:00,1,1.0
2015-11-14 22:30:00,GSW,BRK,TOR,2015-11-17 22:30:00,1,1.0
2015-11-14 22:30:00,BRK,GSW,ATL,2015-11-17 19:30:00,0,1.0
2015-11-15 12:00:00,NYK,NOP,CHO,2015-11-17 19:30:00,1,1.0
2015-11-15 15:30:00,MEM,MIN,OKC,2015-11-16 20:00:00,1,1.0
2015-11-15 17:00:00,POR,CHO,SAS,2015-11-16 20:30:00,0,0.0


In [74]:
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(logreg,  
                                n_features_to_select=30, 
                                direction="backward",
                                cv=split,
                                n_jobs=1
                               )

In [75]:
sfs.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [None]:
predictors = list(keep[sfs.get_support()])
predictors

['drb_total_10_x',
 'usg%_total_10_x',
 'usg%_total_opp_10_x',
 'stl_max_opp_10_x',
 'fg_total_S_avg',
 'fta_total_S_avg',
 'stl_total_S_avg',
 'usg%_total_S_avg',
 'stl%_max_S_avg',
 'ortg_max_S_avg',
 'usg%_total_opp_S_avg',
 'ft%_max_opp_S_avg',
 'trb_max_opp_S_avg',
 'stl%_max_opp_S_avg',
 'drtg_max_opp_S_avg',
 'usg%_total',
 'pts_max',
 'usg%_total_opp',
 'ft%_max_opp',
 'stl%_max_opp',
 'won',
 'ft_total_10_y',
 'drb_total_10_y',
 'usg%_total_10_y',
 'efg%_max_10_y',
 'ast%_total_opp_10_y',
 'usg%_total_opp_10_y',
 '3p_max_opp_10_y',
 'stl%_max_opp_10_y',
 'usg%_max_opp_10_y']

In [85]:
predictors =['efg%_total_10_x',
 'blk_total_opp_10_x',
 'trb%_total_opp_10_x',
 'fga_max_opp_10_x',
 '3p_max_opp_10_x',
 'ts%_total_S_avg_x',
 '3p%_max_S_avg_x',
 'ft_max_S_avg_x',
 'drtg_max_S_avg_x',
 '3par_total_opp_S_avg_x',
 'fg_total',
 'blk_total',
 '3par_total',
 'fga_max',
 'ft_max',
 'stl_max',
 '3p_total_opp',
 '3pa_total_opp',
 'orb_max_opp',
 'trb_max_opp',
 'total_opp',
 '3pa_total_opp_S_avg_y',
 'won_S_avg_y',
 'ts%_total_10_y',
 'blk%_total_10_y',
 'usg%_max_10_y',
 'pts_total_opp_10_y',
 '3p_max_opp_10_y',
 'pf_max_opp_10_y',
 'home_next_x','home_next_y',
 'ELO_x','ELO_y',
 'won_next',
 'Season']

In [86]:
BothTeams[predictors].to_csv('Model.csv')