In [47]:

import os
from datetime import date

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [34]:
def trim_set(frame: pd.DataFrame, min_trb=2, min_ast=5, min_pts=15) -> pd.DataFrame:
    """Filter a stats frame down to rows that clear the given thresholds.

    A row is kept when its 'PTS' value is strictly greater than ``min_pts``
    AND at least one of 'TRB' > ``min_trb`` or 'AST' > ``min_ast`` holds.

    Args:
        frame: DataFrame containing 'TRB', 'AST' and 'PTS' columns.
        min_trb: Exclusive lower bound on 'TRB' (default 2).
        min_ast: Exclusive lower bound on 'AST' (default 5).
        min_pts: Exclusive lower bound on 'PTS' (default 15).

    Returns:
        The subset of ``frame`` whose rows satisfy the condition; columns and
        index labels are left as they are.

    Raises:
        KeyError: If any of the three required columns is missing.
    """
    rebounds_or_assists = (frame['TRB'] > min_trb) | (frame['AST'] > min_ast)
    scores_enough = frame['PTS'] > min_pts
    return frame[rebounds_or_assists & scores_enough]

In [35]:
# Input directories, given relative to the notebook's working directory.
# Files named '<year>_MVP.csv' are read from this one.
mvp_data_dir = '../Basketball Reference Stat Scraper/Awards/MVP'
# Files named '<year>_player_stats.csv' are read from this one.
player_data_dir = '../Basketball Reference Stat Scraper/player_stats'

In [36]:
# Build one frame per season by joining that season's player stats with its
# MVP voting file, then stack every season into a single training frame.
full_data = []

for year in range(1980, 2023):  # 1980 through 2022 inclusive
    mvp = pd.read_csv(os.path.join(mvp_data_dir, f'{year}_MVP.csv'), index_col=0)
    player = pd.read_csv(os.path.join(player_data_dir, f'{year}_player_stats.csv'), index_col=0)
    # Strip '*' characters from both ends of each player name
    # (presumably so names line up with the MVP file -- TODO confirm).
    player['Player'] = player['Player'].str.strip('*')

    # Left join keeps every player row. No `on=` is given, so pandas joins on
    # all column names the two frames have in common.
    # NOTE: the per-season frame was previously named `all`, which shadowed the
    # builtin for the rest of the kernel session.
    season = pd.merge(player, mvp, how='left')
    # Same 'Pts Max' value for every row of the season, including players
    # with no row in the MVP file.
    season['Pts Max'] = mvp['Pts Max'].min()
    # Any remaining missing values (e.g. voting columns of unmatched players)
    # become 0.
    season = season.fillna(0)
    full_data.append(season)

train = pd.concat(full_data)

In [37]:
# Columns removed from the training frame before modelling. Because the drop
# uses errors='ignore', names that are not present in `train` are skipped
# instead of raising.
columns_to_drop = [
    'STAR',
    'DPOY_Rank', 'DPOY_First', 'DPOY_Pts Won', 'DPOY_Pts Max', 'DPOY_Share', 'DPOY',
    'MIP_Rank', 'MIP_First', 'MIP_Pts Won', 'MIP_Pts Max', 'MIP_Share', 'MIP',
    'MVP_First', 'MVP_Pts Won', 'MVP_Pts Max',
    'ROTY_Rank', 'ROTY_First', 'ROTY_Pts Won', 'ROTY_Pts Max', 'ROTY_Share', 'ROTY',
    'SMOTY_Rank', 'SMOTY_First', 'SMOTY_Pts Won', 'SMOTY_Pts Max', 'SMOTY_Share', 'SMOTY',
    'First',
]
df = train.drop(columns=columns_to_drop, errors='ignore')

In [38]:
# Keep only the rows that pass trim_set's TRB/AST/PTS thresholds.
df = trim_set(df)

In [43]:


# Names of the columns used as model inputs.
feature_list = [
    'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]

# Feature matrix (all rows, selected columns) and the 'Share' target column.
x = df.loc[:, feature_list]
y = df['Share']

In [44]:


# Declare a pipeline with three explicitly named steps:
# univariate feature selection -> min-max scaling -> random-forest regressor.
pipe = Pipeline([
    # Score features with f_regression because the model is a regressor;
    # SelectPercentile's default score_func (f_classif) is meant for
    # classification labels. Keeps the top 60% of features by score.
    ('feat_selection', SelectPercentile(score_func=f_regression, percentile=60)),
    ('scaler', MinMaxScaler()),
    # Fixed seed so refitting on the same data reproduces the same forest.
    ('reg', RandomForestRegressor(random_state=42)),
])

In [45]:
# Fit every step of the pipeline on the feature matrix `x` and target `y`.
pipe.fit(x,y)

In [49]:
# Persist the fitted pipeline with joblib under a fixed file name (a relative
# path, so it lands in the current working directory).
# Date-stamped alternative, kept for reference:
# filename = f'{date.today().strftime("%Y-%m-%d")}_MVP_Predictions.pkl'
# NOTE(review): joblib files are pickle-based -- only load this file from a
# trusted location.
filename = 'MVP_Predictions.pkl'
joblib.dump(pipe, filename)

['MVP_Predictions.pkl']