# Import libraries

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from pybaseball import batting_stats

In [3]:
# Batting Stats , qual is the number of players appearences 
batting = batting_stats(2007,2022, qual=200)
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246,0.609,404,0.169,0.287,,,,11.2
6,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217,0.500,434,0.220,0.270,,,,10.4
46,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,...,,0,,0,0.200,0.266,,,,10.2
104,10155,2012,Mike Trout,LAA,20,139,559,639,182,117,...,,0,,0,0.221,0.293,,,,10.1
119,9166,2012,Buster Posey,SFG,25,148,530,610,178,114,...,,0,,0,0.190,0.251,,,,10.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5198,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,0,,0,0.166,0.252,,,,-2.4
5368,9362,2010,Adam Moore,SEA,26,60,205,218,40,30,...,,0,,0,0.181,0.325,,,,-2.4
5330,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113,0.401,282,0.174,0.316,,,,-3.1
5027,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0,,0,0.169,0.295,,,,-2.9


In [4]:
# Export DataFrame to csv
batting.to_csv('../Data/Raw_Data/batting.csv', index=False)

OSError: Cannot save file into a non-existent directory: '../Data/Raw_Data'

In [None]:
# Drop players with only 1 season of data 
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0]>1)

In [None]:
batting

In [None]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_BsR"] = player["BsR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [None]:
# Find the next BsR of each year 
bsr_df=batting[['Name','Season','BsR','Next_BsR']]
bsr_df

## Cleaning Data

In [None]:
null_data=batting.isnull().sum()
null_data

In [None]:
complete_cols=list(batting.columns[null_data == 0])
batting = batting[complete_cols + ["Next_BsR"]].copy()

In [None]:
# Clean DataFrame
batting

In [None]:
batting.dtypes

### Dummifying all of the categorical variables 

In [None]:
batting.dtypes[batting.dtypes == 'object']

In [None]:
# Drop 'Dol' (Dollar value of player) and 'Age Rng' (Player's age range during a season) 
batting = batting.drop(['Dol','Age Rng'], axis = 1, inplace = False)

In [None]:
# Assign each team name to number 
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [None]:
# Copy batting data 
batting_full = batting.copy()

# Drop Nan Values from 'Next_Raw'
batting = batting.dropna().copy()

In [None]:
batting

## Machine Learning Model to predict next BsR for batters

In [None]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=20, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [None]:
# Remove some columns 
removed_columns = ["Next_BsR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [None]:
# Scale the data 
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])

In [None]:
# Most values are now between 0 and 1
batting.describe()

In [None]:
# Fit the data 
sfs.fit(batting[selected_columns], batting["Next_BsR"])

In [None]:
# sfs.get_support() returns True to each column we want to select
sfs.get_support()

In [None]:
# Assign sfs to a variable called predictors
predictors = list(selected_columns[sfs.get_support()])
predictors

In [None]:
# All the seasons'years 
years = sorted(batting["Season"].unique())
years

In [None]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        
        # Train data is all years before current_year
        train = data[data["Season"] < current_year]
        
        # Test data is the current_year
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_BsR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_BsR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [None]:
model_pred=backtest(batting, rr, predictors)
model_pred

In [None]:
# Calculate the mean squared error value (To compare the prediction from actual value)
from sklearn.metrics import mean_squared_error

mean_squared_error(model_pred["actual"], model_pred["prediction"])

In [None]:
# Square root of mean_squared_error
5.9255496118791635 ** 0.5

In [None]:
batting['Next_BsR'].describe()

std = 3.30

Square root of mean_squared_error = 2.4342451831890646 < std 
Which is good for the model

### Add player history data to improve accuracy

In [None]:
def player_history(df):
    df = df.sort_values("Season")
        
    df["player_season"] = range(0, df.shape[0])
    # Find the corr between "player_season" and "WAR"
    df["BsR_corr"] = list(df[["player_season", "BsR"]].expanding().corr().loc[(slice(None), "player_season"),"BsR"])
    df["BsR_corr"].fillna(0, inplace=True)
    
    df["BsR_diff"] = df["BsR"] / df["BsR"].shift(1)
    # Replace NaN values with 1 when there's no previous season 
    df["BsR_diff"].fillna(1, inplace=True)
    # Replace infinite value with 1
    df["BsR_diff"][df["BsR_diff"] == np.inf] = 1
    
    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)


In [None]:
# Find averages across seasos; tell us if a player performed better than the average of a season or no 
def group_averages(df):
    return df["BsR"] / df["BsR"].mean()

In [None]:
batting["BsR_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [None]:
new_predictors = predictors + ["player_season", "BsR_corr", "BsR_season", "BsR_diff"]

In [None]:
predictions = backtest(batting, rr, new_predictors)

In [None]:
# Calculate the mean squared error value (To compare the prediction from actual value)
mean_squared_error(predictions["actual"], predictions["prediction"]) 

Value slighly less than before

In [None]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

All small values indicated that the model is not taking these columns into account to make the prediction.

In [None]:
merged = predictions.merge(batting, left_index=True, right_index=True)
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()
next_BsR_df=merged[["IDfg", "Season", "Name", "BsR", "Next_BsR", "diff"]].sort_values(["diff"])
next_BsR_df.head()

In [None]:
import json
next_BsR_batt_js=next_BsR_df.to_json(orient = "records")
next_BsR_batt_js

In [None]:
# Save JSON file in clean data file
save_file=open('../Data/Clean_Data/next_BsR_batt.js','w')
json.dump(next_BsR_batt_js, save_file, indent = 6)  
save_file.close() 