In [110]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

### Read Data

Read data which was parsed from CPBL and CPBLSTAT  
CPBL Data: 2021 all game's Win/Loss, Home/Away, Batting Order and the Player's Name with PAs and SP/RP's Name with PAs  
CPBLSTAT: 2021 all player's advance baseball index before 8/11, include batting: 'BABIP','wOBA','wRC+'  Pitching: 'FIP','WAR'  

In [111]:
#set os
os.chdir(r'D:\100. Issue\24. CPBL')

#read data
Score_board = pd.read_csv('board.csv')
Score_board['Win'] = Score_board['Win'].astype(str)
Score_board['Home'] = Score_board['Home'].astype(str)

Batter_list = pd.read_csv('batter_list.csv')
Pitcher_list = pd.read_csv('pitcher_list.csv')
Batting_stat = pd.read_csv('batting_stat.csv')
Pitching_stat = pd.read_csv('pitching_stat.csv')

### Create function for the weighted average of advance baseball index for all batting order and SP/RP

In [112]:
# weighted average
def weighted_average(df,col, weight_col):
    return np.average(df[col], weights = df[weight_col])

def groupby_cal(df,col):
    return df.groupby(['Win','Date','No','Team','Home','Order'],as_index=False).apply(lambda x:weighted_average(x,col,'BA'))

### Data cleaning

Merge the Score_board, Batter_List, Batting_stat   
and leave 'Win','Date','No','Team','Home','Order','BA','OPS+','BB/K','BABIP','wOBA','wRC+'  

In [113]:
#### Batting
# Merge data
Batting_board = Score_board.merge(Batter_list,on=['Date','No','Team'])
Batting_board = Batting_board.merge(Batting_stat, left_on = 'Name', right_on='NAME', how = 'left')
Batting_board = Batting_board[['Win','Date','No','Team','Home','Order','BA','OPS+','BB/K','BABIP','wOBA','wRC+']]
Batting_board.fillna(0, inplace=True)
Batting_board['BB/K'] = Batting_board['BB/K'].replace('-','0')
Batting_board.iloc[:,7:] = Batting_board.iloc[:,7:].apply(lambda x: x.astype(float))
Batting_board = Batting_board[Batting_board.Win != '0.5']
Batting_board = Batting_board[Batting_board.BA != 0]

Calculate the weighted average for the advance batting index by PAs

In [114]:
df_list = []
for i in ['OPS+','BB/K','BABIP','wOBA','wRC+']:
    df = groupby_cal(Batting_board,i)
    df.columns = ['Win', 'Date', 'No', 'Team', 'Home', 'Order', i]
    df_list.append(df)

for i in range(0,len(df_list)):
    if i == 0:
        df = df_list[0]
    else:
        df = df.merge(df_list[i])

DataFrame reshape  
Long data transfer to wide data by batting order

In [115]:
# Data reshape
df = df.sort_values(by=['Date','No','Team','Order'])
df.reset_index(drop=True, inplace = True)

df_list = []

for item in ['OPS+','BB/K','BABIP','wOBA','wRC+']:
    ls_all = []
    ls = []

    for i in range(len(df)):
        if df.Order[i] == 1:
            ls_all.append(ls)
            ls = []
            ls.extend(df.iloc[i,0:5].tolist())
            ls.append(df[item][i])
        else:
            ls.append(df[item][i])

    df_list.append(ls_all[1:])

items = ['OPS+','BB/K','BABIP','wOBA','wRC+']
for i in range(len(df_list)):
    col = [items[i] + '_' + str(j) for j in range(1,10)]
    column_name = ['Win','Date','No','Team','Home']
    column_name.extend(col)
    if i == 0:
        df_batting = pd.DataFrame(df_list[i], columns= column_name)
    else:
        df_batting = df_batting.merge(pd.DataFrame(df_list[i], columns= column_name))   

del Batting_board, col, column_name, df, df_list, i, item, items, ls, ls_all
df_batting.dropna(inplace=True)
df_batting.to_csv('df_batting',index=False)

Do the same action with Pitcher data  
Data cleaning  
Calculate the advance pitching index  
then do the data reshape by pitcher position  

In [116]:
### Pitcher
# Merge data
Pitching_board = Score_board.merge(Pitcher_list,on=['Date','No','Team'])
Pitching_board = Pitching_board.merge(Pitching_stat, left_on = 'Name', right_on='NAME', how = 'left')
Pitching_board = Pitching_board[['Win','Date','No','Team','Home','Order','INN','BA','FIP','WAR']]
Pitching_board.Order[Pitching_board.Order != 1] = 'RP'
Pitching_board.Order[Pitching_board.Order == 1] = 'SP'
Pitching_board.iloc[:,6:] = Pitching_board.iloc[:,6:].apply(lambda x: x.astype(float))
Pitching_board = Pitching_board[Pitching_board.Win != '0.5']

# weighted average
df_list = []
for i in ['FIP','WAR']:
    df = groupby_cal(Pitching_board,i)
    df.columns = ['Win', 'Date', 'No', 'Team', 'Home', 'Order', i]
    df_list.append(df)

for i in range(0,len(df_list)):
    if i == 0:
        df = df_list[0]
    else:
        df = df.merge(df_list[i])

# Data reshape
df = df.sort_values(by=['Date','No','Team','Order'], ascending=[True, True, True,False])
df.reset_index(drop=True, inplace = True)

df_list = []
for item in ['FIP','WAR']:
    ls_all = []
    ls = []

    for i in range(len(df)):
        if df.Order[i] == 'SP':
            ls_all.append(ls)
            ls = []
            ls.extend(df.iloc[i,0:5].tolist())
            ls.append(df[item][i])
        else:
            ls.append(df[item][i])

    df_list.append(ls_all[1:])

items = ['FIP','WAR']
for i in range(len(df_list)):
    col = [items[i] + '_' + j for j in ['SP','RP']]
    column_name = ['Win','Date','No','Team','Home']
    column_name.extend(col)
    if i == 0:
        df_pitching = pd.DataFrame(df_list[i], columns= column_name)
    else:
        df_pitching = df_pitching.merge(pd.DataFrame(df_list[i], columns= column_name))   

del Pitching_board, col, column_name, df, df_list, i, item, items, ls, ls_all
df_pitching.fillna(0, inplace=True)
df_pitching.to_csv('df_pitching',index=False)

### Prepare to fit a model for Win/Loss prediction

Merge the batting data and pitching data

In [117]:
# merge pitching and batting
df_comp = df_pitching.merge(df_batting)

Split data into X train, X test by date 2021/08/11  
then split data into X and y  
y is our target, and it is 'Win'  
X is independent variable, exclude 'Date','No','Team','Win'

In [119]:
# split data to X and y

X_train = df_comp.iloc[:274,:]
X_test = df_comp.iloc[274:,:]

target = 'Win'
feature = [col for col in df_comp.columns if col not in ['Date','No','Team','Win']]

y_train = X_train[target]
X_train = X_train[feature]
y_test = X_test[target]
X_test = X_test[feature]

Execute the standardscaler

In [120]:
# Scaler
scaler = StandardScaler()
X_train.iloc[:,1:] = scaler.fit_transform(X_train.iloc[:,1:])
X_test.iloc[:,1:] = scaler.transform(X_test.iloc[:,1:])

### Fit model: Logistic Regression
  
Use package statsmodel to fit the Logistic Regression  
Why don't use sklearn  
Because statsmodel has summary function  
This function can help us to find which variable is significantly influences the target  
And check the model's R square score   

In [124]:
log_reg = sm.Logit(y_train.astype(float), X_train.astype(float)).fit()

Optimization terminated successfully.
         Current function value: 0.568059
         Iterations 7


In [125]:
log_reg.summary()

0,1,2,3
Dep. Variable:,Win,No. Observations:,274.0
Model:,Logit,Df Residuals:,224.0
Method:,MLE,Df Model:,49.0
Date:,"Fri, 20 Aug 2021",Pseudo R-squ.:,0.1805
Time:,10:06:06,Log-Likelihood:,-155.65
converged:,True,LL-Null:,-189.92
Covariance Type:,nonrobust,LLR p-value:,0.03396

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Home,-0.1375,0.207,-0.663,0.507,-0.544,0.269
FIP_SP,-0.4355,0.298,-1.462,0.144,-1.019,0.148
FIP_RP,-0.6125,0.261,-2.345,0.019,-1.125,-0.101
WAR_SP,0.1019,0.279,0.365,0.715,-0.446,0.650
WAR_RP,-0.1953,0.209,-0.936,0.349,-0.604,0.213
OPS+_1,0.2028,1.295,0.157,0.876,-2.334,2.740
OPS+_2,1.1703,1.011,1.158,0.247,-0.811,3.151
OPS+_3,3.3496,1.900,1.763,0.078,-0.374,7.073
OPS+_4,-0.9504,1.381,-0.688,0.491,-3.657,1.756


In [132]:
yhat = log_reg.predict(X_test.astype(float))
prediction = list(map(round, yhat))
print(classification_report(y_test.astype(float).astype(int),prediction))
print(confusion_matrix(y_test.astype(float).astype(int),prediction))

              precision    recall  f1-score   support

           0       0.50      0.40      0.44         5
           1       0.57      0.67      0.62         6

    accuracy                           0.55        11
   macro avg       0.54      0.53      0.53        11
weighted avg       0.54      0.55      0.54        11

[[2 3]
 [2 4]]


In [130]:
log_reg_prune = sm.Logit(y_train.astype(float), X_train[['FIP_RP','OPS+_3','BB/K_4']].astype(float)).fit()
log_reg_prune.summary()

Optimization terminated successfully.
         Current function value: 0.657066
         Iterations 5


0,1,2,3
Dep. Variable:,Win,No. Observations:,274.0
Model:,Logit,Df Residuals:,271.0
Method:,MLE,Df Model:,2.0
Date:,"Fri, 20 Aug 2021",Pseudo R-squ.:,0.05205
Time:,10:06:59,Log-Likelihood:,-180.04
converged:,True,LL-Null:,-189.92
Covariance Type:,nonrobust,LLR p-value:,5.087e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
FIP_RP,-0.6276,0.172,-3.655,0.000,-0.964,-0.291
OPS+_3,0.0412,0.127,0.325,0.745,-0.207,0.289
BB/K_4,-0.1318,0.127,-1.039,0.299,-0.381,0.117


In [134]:
yhat = log_reg_prune.predict(X_test[['FIP_RP','OPS+_3','BB/K_4']].astype(float))
prediction = list(map(round, yhat))
print(classification_report(y_test.astype(float).astype(int),prediction))
print(confusion_matrix(y_test.astype(float).astype(int),prediction))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.67      0.67      0.67         6

    accuracy                           0.64        11
   macro avg       0.63      0.63      0.63        11
weighted avg       0.64      0.64      0.64        11

[[3 2]
 [2 4]]


#### 結論

從中華職棒、CPBLSTAT爬下球賽勝負和球員的進階數據發現  
將所有變數丟進羅吉斯迴歸，對於勝負最有顯著影響的是  
後援投手的 FIP  
第三棒打者的 OPS+  
第四棒打者的 BB/K 
  
FIP 是「投手獨立防禦率」  
此數據的計算扣除了野手的守備因素 (例如站位)  
只參考投手三振、保送、觸身球、被全壘打率的表現，試圖用成因大多「僅能由投手控制」的幾個數據，更正確評價投手的實質投球內容  
FIP愈低的話，顯示投手的三振多、保送少、且不容易被打全壘打  
  
OPS+ 「標準化攻擊指數」  
先計算攻擊指數 OPS = 「上壘率」和「長打率」，能代表一名打者綜合的進攻破壞力（上壘能力與長打火力）  
再把攻擊指數「去脈絡化」，都放在同一個比較基準點上，排除原本數據當中的許多雜音，例如球場因素（球員數據會受到所在主場的影響）  
得到 OPS+  

BB/K 為四壞三振比，評判打擊者選球功力，數值越大代表越會選球 


接著我們把 FIP_RP, OPS+_3, BB/K_4 拉出來獨立建模  
發現 OPS+_3, BB/K_4 不顯著了  
在中職，打擊居然不是勝負的主要因素 ???  

現在只剩下 FIP   
觀察 FIP 的係數都是負的  
也就是說  
派出好的後援投手，只是降低球隊輸球的機率，不是增加贏球機率
也就是說當球隊在領先時，後援投手不要砸鍋放火就好
  
此模型對於勝負的解釋能力只有 5%   
中華職棒官網的數據與CPBLSTAT的數據對於預測勝負的幫助並不大  
試著用 8/12之後的 Test Data 去預測勝負  
準確率 64%  
預測能力不高  