In [149]:

import os

import joblib
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

sns.set_style("darkgrid")

In [2]:
def trim_set(frame):
    return frame[((frame['TRB'] > 2) | (frame['AST'] > 5)) & (frame['PTS'] > 15) & (frame['G'] > 5)]
    # return frame[((frame['TRB'] > 5) | (frame['AST'] > 5)) & (frame['PTS'] > 15) & (frame['Win%'] > .5)]

In [3]:
player_data_dir = '../Basketball Reference Stat Scraper/player_stats'

In [27]:
full_data = []

for year in range(1980, 2024):
    # print(year)
    player = pd.read_csv(os.path.join(player_data_dir, f'{year}_player_stats.csv'), index_col=0)
    player.Player = player.Player.str.strip('*')

    player = player.fillna(0)
    full_data.append(player)

len(full_data)
df = pd.concat(full_data)
df = df.reset_index(drop=True)
print(len(df))

18827


In [28]:
df.Year.unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [29]:
features_to_predict = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
                       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
                       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER', 'TS%',
                       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
                       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
                       ]

for j in features_to_predict:

    for i in df.Player.unique():
        df.loc[df['Player'] == i, f'{j}_Diff'] = df.loc[df['Player'] == i, j].diff().shift(-1)

    df.loc[df['Player'] == i, 'Years_Exp'] = range(len(df.loc[df['Player'] == i, 'Player']))
# train.loc[train['Player'] == 'Michael Jordan', train.columns.str.contains('Diff')]
len(df)

18827

In [30]:
df.to_csv('1980_2023_Stats_Diff.csv')

In [31]:
df = trim_set(df)

# train = train[~train['PTS_Diff'].isna()]

df.rename(columns={"WS/48": "WS_48", 'WS/48_Diff': 'WS_48_Diff'},
          inplace=True)

In [63]:
train = df[df['Year'] < 2022]
print(len(train))
train = train.dropna(subset=['PTS_Diff'], axis=0)
train = train.drop(columns=['Years_Exp'])
test = df[df['Year'] == 2022]

2681


In [64]:
train.isna().sum().sort_values(ascending=False)

Player       0
Pos          0
AST_Diff     0
TRB_Diff     0
DRB_Diff     0
            ..
PER          0
PTS          0
PF           0
TOV          0
VORP_Diff    0
Length: 96, dtype: int64

In [66]:

feature_list = [
    'Age', 'G', 'GS', 'MP', 'FG', 'FGA',
    'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
    'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
    'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS_48', 'OBPM', 'DBPM', 'BPM', 'VORP'
]

features_to_predict = [
    # 'Age',
    'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
    '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
    'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER', 'TS%',
    '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
    'USG%', 'OWS', 'DWS', 'WS', 'WS_48', 'OBPM', 'DBPM', 'BPM', 'VORP',
]

x = train[feature_list]

for i in features_to_predict:
    y = train.loc[:, f'{i}_Diff', ]

    pipe = Pipeline(

        [('feat_selection', SelectPercentile(percentile=60),),
         ('scaler', MinMaxScaler()),
         ('reg', RandomForestRegressor())]
    )

    pipe.fit(x, y)

    filename = f'All_Stat_Models/{i}.pkl'
    joblib.dump(pipe, filename)
    print(filename)

All_Stat_Models/Age.pkl
All_Stat_Models/G.pkl
All_Stat_Models/GS.pkl
All_Stat_Models/MP.pkl
All_Stat_Models/FG.pkl
All_Stat_Models/FGA.pkl
All_Stat_Models/FG%.pkl
All_Stat_Models/3P.pkl
All_Stat_Models/3PA.pkl
All_Stat_Models/3P%.pkl
All_Stat_Models/2P.pkl
All_Stat_Models/2PA.pkl
All_Stat_Models/2P%.pkl
All_Stat_Models/eFG%.pkl
All_Stat_Models/FT.pkl
All_Stat_Models/FTA.pkl
All_Stat_Models/FT%.pkl
All_Stat_Models/ORB.pkl
All_Stat_Models/DRB.pkl
All_Stat_Models/TRB.pkl
All_Stat_Models/AST.pkl
All_Stat_Models/STL.pkl
All_Stat_Models/BLK.pkl
All_Stat_Models/TOV.pkl
All_Stat_Models/PF.pkl
All_Stat_Models/PTS.pkl
All_Stat_Models/PER.pkl
All_Stat_Models/TS%.pkl
All_Stat_Models/3PAr.pkl
All_Stat_Models/FTr.pkl
All_Stat_Models/ORB%.pkl
All_Stat_Models/DRB%.pkl
All_Stat_Models/TRB%.pkl
All_Stat_Models/AST%.pkl
All_Stat_Models/STL%.pkl
All_Stat_Models/BLK%.pkl
All_Stat_Models/TOV%.pkl
All_Stat_Models/USG%.pkl
All_Stat_Models/OWS.pkl
All_Stat_Models/DWS.pkl
All_Stat_Models/WS.pkl
All_Stat_Models/

In [70]:
results = pd.DataFrame()

for i in feature_list:
    print(i)
    filename = f'All_Stat_Models/{i}.pkl'
    pipe = joblib.load(filename)

    results[f'{i}_Diff'] = pipe.predict(test.loc[:, feature_list])

Age
G
GS
MP
FG
FGA
FG%
3P
3PA
3P%
2P
2PA
2P%
eFG%
FT
FTA
FT%
ORB
DRB
TRB
AST
STL
BLK
TOV
PF
PTS
PER
TS%
3PAr
FTr
ORB%
DRB%
TRB%
AST%
STL%
BLK%
TOV%
USG%
OWS
DWS
WS
WS_48
OBPM
DBPM
BPM
VORP


In [94]:
predicted_stats = pd.DataFrame()

In [141]:
d = pd.concat([test.reset_index(drop=True), results], axis=1)
next_years_predicted_stats = pd.DataFrame()
for i in feature_list:
    # print(i)
    next_years_predicted_stats.loc[:, i] = d[[f'{i}_Diff', i]].sum(axis=1)

In [142]:
next_years_predicted_stats = pd.concat([test[['Player', ]].reset_index(drop=True), next_years_predicted_stats], axis=1)

# Beginning to predict the MVP

In [145]:
df = pd.read_csv('train_set_full.csv', index_col=0)
df.rename(columns={'WS/48': 'WS_48'}, inplace=True)
print(df.columns)

df = df.drop(columns=['STAR', 'DPOY_Rank', 'DPOY_First', 'DPOY_Pts Won', 'DPOY_Pts Max', 'DPOY_Share',
                      'DPOY', 'MIP_Rank', 'MIP_First', 'MIP_Pts Won', 'MIP_Pts Max',
                      'MIP_Share', 'MIP', 'MVP_First', 'MVP_Pts Won',
                      'MVP_Pts Max', 'ROTY_Rank', 'ROTY_First',
                      'ROTY_Pts Won', 'ROTY_Pts Max', 'ROTY_Share', 'ROTY', 'SMOTY_Rank',
                      'SMOTY_First', 'SMOTY_Pts Won', 'SMOTY_Pts Max', 'SMOTY_Share',
                      'SMOTY'], errors='ignore')

# df = trim_set(df)
df.Year.unique()

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS_48', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'MVP_Rank', 'MVP_First', 'MVP_Pts Won', 'MVP_Pts Max',
       'MVP_Share'],
      dtype='object')


array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [146]:
train_set = df.copy()
test_set = next_years_predicted_stats.copy()
test_set = trim_set(test_set)

x_train = df.drop(columns=['MVP_Rank', 'MVP_Share', ])
y_train = train_set['MVP_Share']

test_set.Player.unique()

array(['Bam Adebayo', 'Jarrett Allen', 'Giannis Antetokounmpo',
       'OG Anunoby', 'Deandre Ayton', 'LaMelo Ball', 'Desmond Bane',
       'Scottie Barnes', 'RJ Barrett', 'Bradley Beal', 'Bojan Bogdanović',
       'Devin Booker', 'Miles Bridges', 'Jaylen Brown', 'Jalen Brunson',
       'Jimmy Butler', 'Jordan Clarkson', 'Stephen Curry',
       'Anthony Davis', 'DeMar DeRozan', 'Luka Dončić', 'Kevin Durant',
       'Anthony Edwards', 'Joel Embiid', "De'Aaron Fox", 'Darius Garland',
       'Paul George', 'Shai Gilgeous-Alexander', 'Jerami Grant',
       'Jalen Green', 'Tyrese Haliburton', 'James Harden', 'Tyler Herro',
       'Jrue Holiday', 'Brandon Ingram', 'Kyrie Irving',
       'Jaren Jackson Jr.', 'LeBron James', 'Keldon Johnson',
       'Nikola Jokić', 'Kyle Kuzma', 'Zach LaVine', 'Damian Lillard',
       'Tyrese Maxey', 'CJ McCollum', 'Donovan Mitchell', 'Ja Morant',
       'Dejounte Murray', 'Jordan Poole', 'Kevin Porter Jr.',
       'Kristaps Porziņģis', 'Julius Randle', 'Terry

In [147]:
feature_list = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA',
                'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
                'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
                'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
                'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS_48', 'OBPM', 'DBPM',
                'BPM', 'VORP']

x_train = train_set[feature_list]
x_test = test_set[feature_list]

In [150]:
pipe = Pipeline([('feat_selection', SelectPercentile(),),
                 ('scaler', MinMaxScaler()),
                 ('reg', RandomForestRegressor())])

param_grid = [
    {
        'feat_selection': [SelectKBest(), SelectPercentile(percentile=55), ],
        # 'feat_selection__percentile':range(45,80,5),
        'scaler': [MinMaxScaler(), StandardScaler()],
        'reg': [RandomForestRegressor(), xgb.XGBRegressor(objective="reg:linear"), SVR(), GradientBoostingRegressor()]

    },
]
ppln_fitted = GridSearchCV(pipe, param_grid, cv=5, verbose=5, n_jobs=-1, return_train_score=True)

ppln_fitted = ppln_fitted.fit(x_train, y_train, )

print(ppln_fitted.best_params_)
print(ppln_fitted.best_score_)
best = ppln_fitted.best_params_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'feat_selection': SelectPercentile(percentile=55), 'reg': RandomForestRegressor(), 'scaler': StandardScaler()}
0.6013401843266883


In [151]:
y_pred = ppln_fitted.predict(x_test)

test_set['pred'] = y_pred
test_set['pred_scaled'] = MinMaxScaler().fit_transform(y_pred.reshape(-1, 1))
test_set['pred_rank'] = test_set.pred.rank(ascending=False).astype("int")

In [152]:
test_set.sort_values('pred_rank', ascending=True).head(10)[
    ['Player', 'G', 'PTS', 'FG%', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'pred']]

Unnamed: 0,Player,G,PTS,FG%,AST,TRB,STL,BLK,TOV,pred
53,Nikola Jokić,52.35,24.471,0.61688,9.458,9.934,1.327,0.702,3.663,0.55019
28,Luka Dončić,62.12,33.377,0.49164,8.222,8.548,1.534,0.575,3.554,0.47884
36,Shai Gilgeous-Alexander,71.75,31.323,0.51212,5.794,4.989,1.592,0.817,2.846,0.36487
32,Joel Embiid,59.08,30.838,0.54317,4.749,9.518,1.048,1.702,3.27,0.28515
2,Giannis Antetokounmpo,56.53,29.996,0.54369,5.912,12.143,0.785,0.763,3.852,0.25241
21,Jimmy Butler,65.81,22.004,0.54776,5.182,5.964,1.726,0.297,1.559,0.08309
57,Damian Lillard,78.51,26.151,0.45994,5.563,4.257,0.977,0.222,2.829,0.04416
76,Jayson Tatum,53.69,30.016,0.46162,4.718,8.373,1.092,0.713,3.009,0.04267
72,Domantas Sabonis,76.42,18.423,0.60181,7.289,11.546,0.839,0.647,2.692,0.03901
26,Anthony Davis,70.2,24.096,0.55418,2.405,12.502,0.986,1.677,2.211,0.02743


In [154]:
test_set = test_set.round(2)

In [157]:
test_set[['Age','G','GS']] = test_set[['Age','G','GS']].astype(int)

In [161]:
test_set.sort_values('pred_rank').reset_index(drop=True).to_csv('NICK BITCH.csv')