In [37]:

import os
from datetime import date

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import joblib

import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "darkgrid" style for any figures drawn later in the notebook.
sns.set_style("darkgrid")


In [38]:
def trim_set(frame, min_trb=2, min_ast=5, min_pts=15, min_games=5, min_win_pct=None):
    """Return the rows of ``frame`` that clear the box-score thresholds.

    A row is kept when
    ``(TRB > min_trb or AST > min_ast) and PTS > min_pts and G > min_games``.
    All comparisons are strict. The defaults reproduce the cut-offs that were
    previously hard-coded, so ``trim_set(frame)`` behaves as before.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'TRB', 'AST', 'PTS' and 'G' (and 'Win%'
        when ``min_win_pct`` is given).
    min_trb, min_ast, min_pts, min_games : numeric
        Exclusive lower bounds for the corresponding columns.
    min_win_pct : numeric or None
        When not None, additionally require ``Win% > min_win_pct``.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Row labels are NOT renumbered: the result keeps
        the index labels the rows had in ``frame``.

    Raises
    ------
    KeyError
        If a required column is missing from ``frame``.
    """
    boards_or_assists = (frame['TRB'] > min_trb) | (frame['AST'] > min_ast)
    keep = boards_or_assists & (frame['PTS'] > min_pts) & (frame['G'] > min_games)
    if min_win_pct is not None:
        keep &= frame['Win%'] > min_win_pct
    return frame[keep]

In [39]:
# Input directories, given relative to the working directory.
# NOTE(review): mvp_data_dir is not referenced by any cell visible in this notebook — confirm it is still needed.
mvp_data_dir = '../Basketball Reference Stat Scraper/Awards/MVP'
# Directory holding the per-season '<year>_player_stats.csv' files read below.
player_data_dir = '../Basketball Reference Stat Scraper/player_stats'

In [104]:
# Read one stats CSV per season, clean it, and stack the seasons into `test`.
full_data = []

for year in [2022]:
    season_csv = os.path.join(player_data_dir, f'{year}_player_stats.csv')
    player = (
        pd.read_csv(season_csv, index_col=0)
        # remove '*' characters from both ends of every player name
        .assign(Player=lambda season: season['Player'].str.strip('*'))
        # missing values become 0
        .fillna(0)
    )
    full_data.append(player)

test = (
    pd.concat(full_data)
    .reset_index(drop=True)               # fresh 0..n-1 row labels
    .rename(columns={'WS/48': 'WS_48'})
)
len(test)

605

In [105]:
# Columns removed before modelling (by their names, mostly award-voting fields).
award_columns = [
    'STAR',
    'DPOY_Rank', 'DPOY_First', 'DPOY_Pts Won', 'DPOY_Pts Max', 'DPOY_Share', 'DPOY',
    'MIP_Rank', 'MIP_First', 'MIP_Pts Won', 'MIP_Pts Max', 'MIP_Share', 'MIP',
    'MVP_First', 'MVP_Pts Won', 'MVP_Pts Max',
    'ROTY_Rank', 'ROTY_First', 'ROTY_Pts Won', 'ROTY_Pts Max', 'ROTY_Share', 'ROTY',
    'SMOTY_Rank', 'SMOTY_First', 'SMOTY_Pts Won', 'SMOTY_Pts Max', 'SMOTY_Share', 'SMOTY',
    'First',
]

# errors='ignore' skips any listed column that is absent from `test`.
df = test.drop(columns=award_columns, errors='ignore')

In [106]:
# Keep only the rows that pass trim_set's thresholds.
# NOTE: boolean filtering keeps the original row labels, so df's index may have gaps from here on.
df = trim_set(df)


In [107]:


# Model input columns, in the order they are passed to the models (includes 'Age').
feature_list = [
    'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS_48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]

x = df[feature_list]

# From here on feature_list names the stats that get a model; 'Age' is an
# input only and is advanced by one separately further down.
feature_list = [column for column in feature_list if column != 'Age']

In [108]:
# Give `results` the same row labels as the model input `x`. The predictions
# assigned to it are plain arrays (positional), while `df` keeps its filtered,
# non-contiguous labels; without a shared index, the later label-aligned
# concat/addition of `df` and `results` would pair rows of different players
# and fill the rest with NaN.
results = pd.DataFrame(index=x.index)

In [109]:

# For every stat, load its saved pipeline and store the predictions for `x`
# in results under '<stat>_Diff'.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
for i in feature_list:
    print(i)
    stat_model = joblib.load(f'All_Stat_Models/{i}.pkl')
    results[f'{i}_Diff'] = stat_model.predict(x)


G
GS
MP
FG
FGA
FG%
3P
3PA
3P%
2P
2PA
2P%
eFG%
FT
FTA
FT%
ORB
DRB
TRB
AST
STL
BLK
TOV
PF
PTS
PER
TS%
3PAr
FTr
ORB%
DRB%
TRB%
AST%
STL%
BLK%
TOV%
USG%
OWS
DWS
WS
WS_48
OBPM
DBPM
BPM
VORP


In [110]:
# Current stats and model outputs side by side; concat with axis=1 matches rows by index label.
a = pd.concat([df, results], axis=1)

# Empty frame that the following cell fills column by column.
predicted_stats = pd.DataFrame()


In [111]:
# Age simply advances by one.
predicted_stats['Age'] = df['Age'].add(1)

for i in feature_list:
    # Current value plus the model output stored as '<stat>_Diff'.
    # Series addition matches rows by index label, so df and results must
    # carry the same labels for the sums to be meaningful.
    predicted_stats[f'{i}'] = df[i].add(results[f'{i}_Diff'])


In [112]:
# Put the identifying columns in front of the projected stats (rows matched by index label).
identity_columns = df[['Player','Pos']]
predicted_stats = pd.concat([identity_columns, predicted_stats], axis=1)

In [113]:
# Write the projections to the working directory; the row index is written as the first, unnamed column.
predicted_stats.to_csv('Next_Year_Predicted_Stats.csv')