### *Imports*

In [277]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import statsmodels.api as sm
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LinearRegression, ElasticNetCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVC
from sklearn.dummy import DummyRegressor
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Input
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping
from sklearn.ensemble import GradientBoostingRegressor

In [278]:
import warnings
warnings.filterwarnings('ignore')

### *Baseline Model*

In [279]:
data = pd.read_csv('data/finaldata.csv') #Read in features dataframe 
df = data.select_dtypes([np.number]) #only look at numeric columns
ytarget = pd.read_csv('data/ytargets.csv') #Read in target dataframe

In [280]:
X = df #397 different fangraphs statistics for the features

In [281]:
y = ytarget#trying to predict wins

In [284]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12) #train test split

In [285]:
ss = StandardScaler()
ss.fit(X_train) #Scale and fit data

StandardScaler()

In [286]:
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test) #Scale and Transform data

In [287]:
# instantiate baseline model that predicts the mean of the data every time
base_model = DummyRegressor(strategy = 'mean')

# Fit model
base_model = base_model.fit(X_train_sc, y_train)

# get predictions
base_test_preds = base_model.predict(X_test_sc)


# Evaluate: Let's use RMSE

print(f'Baseline model Test RMSE: {np.sqrt(mean_squared_error((y_test), base_test_preds))}')

Baseline model Test RMSE: 13.061666465815055


Score to beat: 
Baseline model Test RMSE: 13.061666465815055


In [288]:
cor = df.corr()
#Correlation with output variable
cor_target = abs(cor["W./P"])
#Selecting highly correlated features but not too highly correlated
features = cor_target[(cor_target > .3) & (cor_target < .5)]
features.sort_values(ascending= False)

L.1./P        0.487254
K%+./P        0.471963
K/BB+./P      0.471805
xFIP-./P      0.458418
WAR           0.451891
                ...   
wFB/C         0.313673
BB%+          0.309580
TBF./P        0.307666
wFA/C (pi)    0.302626
tERA./P       0.300103
Name: W./P, Length: 84, dtype: float64

In [289]:
featureslist = features.index.tolist() #move these features into a list

In [293]:
#removing some features that repeat/too correlated to others
nicelist = []
for i in featureslist:
    if i not in ('RE24./P','RA9-WAR./P', 'WAR.1','RAR','RE24','ERA.1./P','Off.1', '-WPA./P','RAR./P','WAR.1./P','wRC+.1',
                   'Bat', 'SV.1./P','wFB./P','wFB/C./P','FDP-Wins./P','wRAA','Starting./P','wFA (pfx)./P','wFA/C (pfx)./P',
                   'RS./P','R.1','wFA/C (pi)./P','wFA (pi)./P','RBI.1','FIP.1./P','ERA-./P','FIP-./P','xFIP-./P','AVG+./P',
                   'WHIP+./P','LOB%+./P','K%+./P', 'Off', 'ER./P', 'K/BB+./P', 'H/9+./P','H/9./P', 'xFIP./P', 'L.1./P','OBP.1', 
                    'BB/K', 'Age', 'OBP', 'R','SIERA./P','kwERA./P','K/9+./P','xFIP.1./P','K%+', 'BB%+', 'wFB','wFB/C','wFA/C (pi)',
                    'wRC', 'wOBA.1', 'Balls', 'PA.1', 'PA', 'TBF./P','SO./P', 'tERA./P', 'RS/9./P','BB/9+./P',
                    'K/9.1./P',):
        nicelist.append(i)

In [294]:
nicelist

['K/9./P',
 'ERA./P',
 'FIP./P',
 'WAR./P',
 'H./P',
 'R./P',
 'K/BB./P',
 'AVG./P',
 'WHIP./P',
 'WPA./P',
 'REW./P',
 'WPA/LI./P',
 'RBI',
 'wOBA',
 'wRC+',
 'WAR',
 'OPS',
 'WPA',
 'REW',
 'WPA/LI',
 'OBP+']

In [295]:
X = df[nicelist] #features to predict wins
y = ytarget #wins

In [296]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)
ss = StandardScaler()
ss.fit(X_train) #fit/standardize

StandardScaler()

In [297]:
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test) #transform standardization

In [298]:
model = Sequential()


model.add(BatchNormalization())


model.add(Dense(64, kernel_regularizer=l2(0.01), activation="relu", input_shape=(X_train_sc.shape[1],)))
model.add(Dense(64, kernel_regularizer=l2(0.01), activation="relu", input_shape=(X_train_sc.shape[1],)))
model.add(Dropout(0.05))
model.add(Dense(128, kernel_regularizer=l2(0.01), activation="relu", input_shape=(X_train_sc.shape[1],)))
model.add(Dropout(0.2))


model.add(Dense(1, kernel_initializer='normal'))


model.compile(loss='mean_squared_error', optimizer='adam')


early_stop = EarlyStopping(patience=5)


history = model.fit(X_train_sc, y_train,
                   epochs=100,
                   validation_data=(X_test_sc, y_test),
                   callbacks = [early_stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [299]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_3 (Batch (None, 21)                84        
_________________________________________________________________
dense_12 (Dense)             (None, 64)                1408      
_________________________________________________________________
dense_13 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               8320      
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                

In [302]:
y_preds = model.predict(X_test_sc)
#looking at our RMSE score for this question of best model
print(f'Neural Net model Test RMSE: {np.sqrt(metrics.mean_squared_error(y_test, y_preds))}')

Neural Net model Test RMSE: 12.492042035297633


In [303]:
y_preds #Make sure the predictions below are between 60-110.. (most common number of wins in a season)

array([[ 88.51291 ],
       [ 74.58269 ],
       [ 73.92764 ],
       [ 64.63246 ],
       [ 80.860886],
       [ 75.67685 ],
       [ 81.586494],
       [ 62.25392 ],
       [ 78.13336 ],
       [ 67.049644],
       [111.513374],
       [ 64.767296],
       [ 78.42414 ],
       [ 98.46448 ],
       [ 67.31401 ],
       [ 78.95346 ],
       [ 83.11597 ],
       [ 83.07031 ],
       [ 84.511795],
       [ 62.367588],
       [ 67.70044 ],
       [ 68.36766 ],
       [ 89.25569 ],
       [ 79.34595 ],
       [ 61.260796],
       [ 91.545494],
       [ 78.345245],
       [ 68.156494],
       [ 74.21062 ],
       [ 84.71857 ],
       [ 68.53618 ],
       [ 72.90519 ],
       [ 66.334656],
       [121.86804 ],
       [ 82.56873 ],
       [ 73.96861 ],
       [ 98.67325 ],
       [108.08586 ],
       [ 85.98009 ],
       [ 71.33304 ],
       [ 73.0538  ],
       [ 71.82733 ],
       [ 70.153984],
       [ 69.43558 ],
       [ 80.67774 ],
       [ 80.90561 ],
       [ 75.61869 ],
       [ 65.3

In [304]:
def rmse(regressor):  #created a function that lets you put in your type of regressor you want to use and it tells you the rmse score
    steps = [
        ('scaler', StandardScaler()),
        ('regressor', regressor())
    ]

    pipe = Pipeline(steps = steps)

    model = pipe.fit(X_train_sc, y_train)
    
    train_pred = model.predict(X_train_sc)    
    print(f'train RMSE: {np.sqrt(mean_squared_error(y_train, train_pred))}')
    test_pred = model.predict(X_test_sc)
    print(f'test RMSE: {np.sqrt(mean_squared_error(y_test, test_pred))}')

In [305]:
rmse(LinearRegression)

train RMSE: 9.419272945888014
test RMSE: 10.495933551860258


In [306]:
rmse(KNeighborsRegressor)

train RMSE: 8.716094946196439
test RMSE: 10.886150222492185


In [307]:
rmse(DecisionTreeRegressor)

train RMSE: 0.0
test RMSE: 12.687001221722966


In [308]:
rmse(BaggingRegressor)

train RMSE: 4.599425084846051
test RMSE: 10.499479352171074


In [309]:
rmse(RandomForestRegressor)

train RMSE: 3.8267005224983164
test RMSE: 10.457392664203317


In [310]:
rmse(AdaBoostRegressor)

train RMSE: 7.2539637105478825
test RMSE: 10.286084906152114


In [311]:
rmse(SVR)

train RMSE: 9.954837050368818
test RMSE: 11.213255169484835


In [312]:
rmse(Ridge)

train RMSE: 9.446427596649423
test RMSE: 10.446166355843857


In [313]:
rmse(GradientBoostingRegressor)

train RMSE: 3.3362020458573487
test RMSE: 10.998659933939354


# Let's put our models into a pipeline, gridsearch the parameters:

In [71]:
lr_pipeline = Pipeline([
    ("lr", LinearRegression())
    ])

In [72]:
lr_parameters = {
    'lr__fit_intercept':[True,False],
    'lr__normalize' :[True,False],
    'lr__copy_X' : [True,False]}

In [73]:
lr_gs = GridSearchCV(lr_pipeline, param_grid=lr_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
lr_gs.fit(X_train_sc, y_train)

print(f"Best score: {lr_gs.best_score_*-1}")
print(f"Best score: {lr_gs.best_params_}")

Best score: 10.486704256308988
Best score: {'lr__copy_X': True, 'lr__fit_intercept': True, 'lr__normalize': True}


----------------------------------------------------------------------------------------------------

In [79]:
knr_pipeline = Pipeline([
    ("knr", KNeighborsRegressor())
    ])

In [80]:
knr_parameters = {
    'knr__n_neighbors':[5, 10 , 15, 20, 25, 27, 30, 35],
    'knr__weights' :['uniform', 'distance']}

In [81]:
knr_gs = GridSearchCV(knr_pipeline, param_grid=knr_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
knr_gs.fit(X_train_sc, y_train)

print(f"Best score: {knr_gs.best_score_*-1}")
print(f"Best score: {knr_gs.best_params_}")

Best score: 10.087238923865195
Best score: {'knr__n_neighbors': 20, 'knr__weights': 'uniform'}


------------------------------

In [82]:
dtr_pipeline = Pipeline([
    ("dtr", DecisionTreeRegressor())
    ])

In [87]:
dtr_parameters = {
    "dtr__criterion": ["mse", "mae"],
"dtr__min_samples_split": [0, 1, 2, 3 , 4],
"dtr__min_samples_leaf": [1, 5, 25, 30]}

In [88]:
dtr_gs = GridSearchCV(dtr_pipeline, param_grid=dtr_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
dtr_gs.fit(X_train_sc, y_train)

print(f"Best score: {dtr_gs.best_score_*-1}")
print(f"Best parameters: {dtr_gs.best_params_}")

Best score: 10.37480302413272
Best parameters: {'dtr__criterion': 'mse', 'dtr__min_samples_leaf': 30, 'dtr__min_samples_split': 2}


----------------------------------------------------------------------------------------------------

In [89]:
br_pipeline = Pipeline([
    ("br", BaggingRegressor())
    ])

In [92]:
br_parameters = {
    "br__max_samples": [150, 160, 180, 100],
    "br__max_features": [11, 13, 15, 17, 19, 21]}

In [93]:
br_gs = GridSearchCV(br_pipeline, param_grid=br_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
br_gs.fit(X_train_sc, y_train)

print(f"Best score: {br_gs.best_score_*-1}")
print(f"Best parameters: {br_gs.best_params_}")

Best score: 9.93221256155348
Best parameters: {'br__max_features': 17, 'br__max_samples': 160}


----------------------------------------------------------------------------------------------------

In [94]:
rfr_pipeline = Pipeline([
    ("rfr", RandomForestRegressor())
    ])

In [95]:
rfr_parameters = {
    "rfr__n_estimators": [150, 175, 200, 225, 250]}

In [96]:
rfr_gs = GridSearchCV(rfr_pipeline, param_grid=rfr_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
rfr_gs.fit(X_train_sc, y_train)

print(f"Best score: {rfr_gs.best_score_*-1}")
print(f"Best parameters: {rfr_gs.best_params_}")

Best score: 10.017869042484898
Best parameters: {'rfr__n_estimators': 175}


----------------------------------------------------------------------------------------------------

In [97]:
abr_pipeline = Pipeline([
    ("abr", AdaBoostRegressor())
    ])

In [98]:
abr_parameters = {
    "abr__n_estimators": [350, 375, 400, 325, 350],
    "abr__learning_rate": [0, .5, 1.5, 2.0]}

In [99]:
abr_gs = GridSearchCV(abr_pipeline, param_grid=abr_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
abr_gs.fit(X_train_sc, y_train)

print(f"Best score: {abr_gs.best_score_*-1}")
print(f"Best parameters: {abr_gs.best_params_}")

Best score: 10.345948298546862
Best parameters: {'abr__learning_rate': 0.5, 'abr__n_estimators': 400}


----------------------------------------------------------------------------------------------------

In [100]:
svr_pipeline = Pipeline([
    ("svr", SVR())
    ])

In [101]:
svr_parameters = {
    "svr__degree": [-2, -1, 0, .5, 1],
    "svr__max_iter": [-3, -2, -1, 0, 1, 2, 3]}

In [102]:
svr_gs = GridSearchCV(svr_pipeline, param_grid=svr_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
svr_gs.fit(X_train_sc, y_train)

print(f"Best score: {svr_gs.best_score_*-1}")
print(f"Best parameters: {svr_gs.best_params_}")

Best score: 10.357041568890194
Best parameters: {'svr__degree': 0, 'svr__max_iter': -1}


----------------------------------------------------------------------------------------------------

In [103]:
rid_pipeline = Pipeline([
    ("rid", Ridge())
    ])

In [104]:
rid_parameters = {
    'rid__alpha': np.logspace(0, 1000, 10),
    'rid__fit_intercept': [True, False],
    'rid__normalize': [True, False],
    'rid__copy_X': [True, False],
    'rid__tol': [.00001, .000001, .0000001, .00000001, .000000001],
    'rid__solver': ['auto', 'svd’, ‘cholesky’, ‘lsqr’, ‘sparse_cg’, ‘sag’, ‘saga']}

In [105]:
rid_gs = GridSearchCV(rid_pipeline, param_grid=rid_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
rid_gs.fit(X_train_sc, y_train)

print(f"Best score: {rid_gs.best_score_*-1}")
print(f"Best parameters: {rid_gs.best_params_}")

Best score: 9.935812564675102
Best parameters: {'rid__alpha': 1.0, 'rid__copy_X': True, 'rid__fit_intercept': True, 'rid__normalize': True, 'rid__solver': 'auto', 'rid__tol': 1e-05}


----------------------------------------------------------------------------------------------------

In [106]:
gbr_pipeline = Pipeline([
    ("gbr", GradientBoostingRegressor())
    ])

In [107]:
gbr_parameters = {
    'gbr__n_estimators': [250, 275, 300, 325, 350],
    'gbr__subsample': [.25, .3, .4, .5, .6]}

In [108]:
gbr_gs = GridSearchCV(gbr_pipeline, param_grid=gbr_parameters, n_jobs = -1, scoring='neg_root_mean_squared_error')
gbr_gs.fit(X_train_sc, y_train)

print(f"Best score: {gbr_gs.best_score_*-1}")
print(f"Best parameters: {gbr_gs.best_params_}")

Best score: 10.787132680072835
Best parameters: {'gbr__n_estimators': 325, 'gbr__subsample': 0.6}


--------------------------------------------------------------------------------------------------------

#### Testing Time

In [314]:
def specific(year): #function that will pull and clean mlb data/ doing this agin b/c we want the correct wins for that year
    Teamdic = {'HOU':'Astros', 'LAD':'Dodgers', 'CHC':'Cubs', 'ARI':'Diamondbacks', 'MIL':'Brewers', 'TBR':'Rays',
        'ATL':'Braves', 'BOS':'Red Sox', 'CLE': 'Indians','NYY':'Yankees','OAK': 'Athletics','STL':'Cardinals', 
        'SFG':'Giants','PIT':'Pirates', 'WSN': 'Nationals','NYM': 'Mets','SEA': 'Mariners','PHI':'Phillies','LAA':'Angels',
        'COL':'Rockies', 'SDP':'Padres', 'MIN':'Twins', 'DET':'Tigers','CIN': 'Reds','MIA':'Marlins','FLA':'Marlins','CHW':'White Sox',
        'TOR': 'Blue Jays', 'TEX':'Rangers', 'KCR' : 'Royals', 'BAL':'Orioles'} 
    hittingdata = pd.read_csv(f'data/{year}.csv') #read in yearly csvs
    hittingdata.Team.replace(Teamdic, inplace=True)
    hittingdata.sort_values(by = 'Team', inplace=True)
    hittingdata.set_index('Team', inplace= True)
    pitchingdata = pd.read_csv(f'data/{year}P.csv')
    pitchingdata.Team.replace(Teamdic, inplace=True)
    pitchingdata.sort_values(by = 'Team', inplace=True)
    pitchingdata.set_index('Team', inplace= True)
    pitchingdata.columns = pitchingdata.columns + './P'
    pitchingdata.rename(columns={"Team./P": "Team"}, inplace = True)  
    df_both = pd.merge(pitchingdata, hittingdata, on='Team', how = 'outer')
    df_both.dropna(axis = 1, inplace=True)# remove nans
    return df_both

In [315]:
dfeighteen = specific(2018) #using 2018 stats to predict 2019 wins
dfnineteen = specific(2019) #Only using wins from this to compare to projected wins

In [316]:
X = dfeighteen[nicelist]
y = dfnineteen['W./P'] 
ssX = ss.transform(X)

In [317]:
#They all performed similarly until testing RMSE using the 2018 and 2019 data.. Then Random Forest was the best
test_pred = rfr_gs.predict(ssX)
print(f'test RMSE: {np.sqrt(mean_squared_error(y, test_pred))}')

test RMSE: 8.014876576130929


In [318]:
predictions = rfr_gs.predict(ssX)

In [319]:
predictions

array([ 77.19428571, 101.37142857,  93.98857143,  71.24      ,
        88.18285714,  88.83428571,  84.51428571,  85.84571429,
        82.52571429, 101.49142857,  74.96      ,  94.05714286,
        71.68      ,  70.41714286,  83.42857143,  91.41714286,
        63.34285714,  71.17142857,  79.64      ,  74.47428571,
        74.21714286,  93.93142857,  92.26285714,  74.25142857,
        73.54857143,  62.78285714,  68.94857143,  73.49714286,
        72.32      ,  98.89714286])

In [320]:
preddf = pd.DataFrame(predictions) #turn predictions into a dataframe

In [321]:
preddf.index = dfeighteen.index  #put teams into new prediction dataframe

In [322]:
preddf.columns = ['Projected_Wins']

In [323]:
preddf

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Angels,77.194286
Astros,101.371429
Athletics,93.988571
Blue Jays,71.24
Braves,88.182857
Brewers,88.834286
Cardinals,84.514286
Cubs,85.845714
Diamondbacks,82.525714
Dodgers,101.491429


In [324]:
finaldf = dfnineteen.merge(preddf, on='Team')

In [325]:
finaldf['Projected_-_Actual'] = finaldf['Projected_Wins'] - finaldf['W./P']

finaldf[['W./P', 'Projected_Wins', 'Projected_-_Actual']] #create a comparison between actual and predictions

Unnamed: 0_level_0,W./P,Projected_Wins,Projected_-_Actual
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Angels,72,77.194286,5.194286
Astros,107,101.371429,-5.628571
Athletics,97,93.988571,-3.011429
Blue Jays,67,71.24,4.24
Braves,97,88.182857,-8.817143
Brewers,89,88.834286,-0.165714
Cardinals,91,84.514286,-6.485714
Cubs,84,85.845714,1.845714
Diamondbacks,85,82.525714,-2.474286
Dodgers,106,101.491429,-4.508571


In [326]:
print(f'The average difference between the Model Projections and the Actual Wins is {(finaldf["Projected_-_Actual"].abs().sum())/len(finaldf["Projected_-_Actual"])}')

The average difference between the Model Projections and the Actual Wins is 5.304571428571429


---------------------------------------------------------------------

### 2021 Projections

In [327]:
dftwenty = specific(2020) #2020s Stats

In [328]:
dftwenty.sort_values(by=['Team'], ascending = True, inplace= True)

*2020 was a covid shortened year. We are going to convert the stats from 60 games to 162 games. Some stats we will use from the 2019 season due to the small sample size.*

In [330]:
dftwenty['SV./P'] = dftwenty['SV./P']*162/ 60 #Convert 60 game season to 162 games

In [331]:
dftwenty['WAR./P'] = dftwenty['WAR./P']*162/ 60  #Convert 60 game season to 162 games

In [332]:
dftwenty['H./P'] = dftwenty['H./P']*162/ 60  #Convert 60 game season to 162 games

In [333]:
dftwenty['R./P'] = dftwenty['R./P']*162/ 60  #Convert 60 game season to 162 games

In [334]:
dftwenty['WPA./P'] = dfnineteen['WPA./P'] #Use 2019's stat 

In [335]:
dftwenty['REW./P'] = dfnineteen['REW./P'] #Use 2019's stat

In [336]:
dftwenty['WPA/LI./P'] = dfnineteen['WPA/LI./P'] #Use 2019's stat

In [337]:
dftwenty['R'] = dfnineteen['R']*162/ 60 #Convert 60 game season to 162 games

In [338]:
dftwenty['RBI'] = dftwenty['RBI']*162/ 60 #Convert 60 game season to 162 games

In [339]:
dftwenty['WAR']= dftwenty['WAR']*162/ 60 #Convert 60 game season to 162 games

In [340]:
dftwenty['WPA'] = dfnineteen['WPA'] #Use 2019's stat
dftwenty['REW'] = dfnineteen['REW'] #Use 2019's stat
dftwenty['WPA/LI'] = dfnineteen['WPA/LI'] #Use 2019's stat

In [341]:
dftwenty['W./P'] = dftwenty['W./P']*162/ 60 #Convert 60 game season to 162 games

In [342]:
X = dftwenty[nicelist] #2020s stats altered a bit
ssX = ss.transform(X)

In [343]:
predictions = rid_gs.predict(ssX)

In [344]:
predictions

array([[80.98588172],
       [86.69352775],
       [88.25432108],
       [76.33792043],
       [88.53434885],
       [84.57023899],
       [83.65907088],
       [84.6589174 ],
       [76.21660703],
       [98.88698208],
       [81.3547757 ],
       [88.15908236],
       [74.54547062],
       [70.79586826],
       [86.26937817],
       [80.25474046],
       [76.70536698],
       [87.74816127],
       [80.92766746],
       [71.42257233],
       [70.32494041],
       [89.12736128],
       [76.5947749 ],
       [84.05010824],
       [68.90541195],
       [74.73958703],
       [67.51605373],
       [90.9575739 ],
       [85.07844733],
       [91.41402539]])

In [345]:
preddf = pd.DataFrame(predictions) #put predictions into dataframe

In [346]:
preddf.index = dftwenty.index #create an index with team names

In [347]:
preddf.columns = ['Projected_Wins'] #create column

In [348]:
preddf

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Angels,80.985882
Astros,86.693528
Athletics,88.254321
Blue Jays,76.33792
Braves,88.534349
Brewers,84.570239
Cardinals,83.659071
Cubs,84.658917
Diamondbacks,76.216607
Dodgers,98.886982


In [349]:
def round_of_rating(number): #round win to the nearest .5/.0

    return round(number * 2) / 2 

In [350]:
proj = round_of_rating(preddf['Projected_Wins'])
proj = pd.DataFrame(proj)

In [352]:
AlEast = proj.iloc[[21, 29, 3, 16, 22]].sort_values(by=['Projected_Wins'], ascending = False) 
AlEast

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Yankees,91.5
Rays,89.0
Blue Jays,76.5
Orioles,76.5
Red Sox,76.5


In [353]:
AlCentral = proj.iloc[[27, 11, 28, 25, 26]].sort_values(by=['Projected_Wins'], ascending = False)  
AlCentral

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Twins,91.0
Indians,88.0
White Sox,85.0
Royals,74.5
Tigers,67.5


In [354]:
AlWest = proj.iloc[[2, 1, 12, 0, 20]].sort_values(by=['Projected_Wins'], ascending = False)  
AlWest

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Athletics,88.5
Astros,86.5
Angels,81.0
Mariners,74.5
Rangers,70.5


In [355]:
NlEast = proj.iloc[[4, 13, 18, 14, 15]].sort_values(by=['Projected_Wins'], ascending = False)  
NlEast

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Braves,88.5
Mets,86.5
Phillies,81.0
Nationals,80.5
Marlins,71.0


In [356]:
NlCentral = proj.iloc[[7, 6, 23, 5, 19]].sort_values(by=['Projected_Wins'], ascending = False)  
NlCentral

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Cubs,84.5
Brewers,84.5
Reds,84.0
Cardinals,83.5
Pirates,71.5


In [357]:
NlWest = proj.iloc[[9, 17, 10, 24, 8]].sort_values(by=['Projected_Wins'], ascending = False)  
NlWest

Unnamed: 0_level_0,Projected_Wins
Team,Unnamed: 1_level_1
Dodgers,99.0
Padres,87.5
Giants,81.5
Diamondbacks,76.0
Rockies,69.0


---

---