In [2]:
from requests.compat import *
from requests import request
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import preprocessor
import scraper
import numpy as np
from pickle import load
import tensorflow as tf

"""
Contains all classes and methods that define the states of a Baseball game as states
in a Markov Chain.
"""

def getID(first, second, third, outs, inning):
    """
    Encode a base/out/inning description as a Markov-chain state ID.

    :param first: 1 if a runner is on first base, else 0.
    :param second: 1 if a runner is on second base, else 0.
    :param third: 1 if a runner is on third base, else 0.
    :param outs: number of outs.
    :param inning: inning number, starting at 1.
    :returns: int. The stateID of the state described by the parameters.
    """
    # The three base flags occupy the low three bits of the ID.
    base_code = first + 2 * second + 4 * third
    # Each out spans 8 base configurations; each inning spans 24 (3 outs * 8).
    return base_code + 8 * outs + 24 * (inning - 1)


class State:
    """
    Represents a state in the Markov Chain.
    Can be seen as the tuple (f, s, t, o, i), where:
        f = 0 if first base is empty, 1 if there is a runner
        s = 0 if second base is empty, 1 if there is a runner
        t = 0 if third base is empty, 1 if there is a runner
        o in {0, 1, 2} the number of outs
        i in {1, 2, ..., 8, 9} the current inning
    There is an extra state, which is the absorbing state, with 3 outs in the 9th inning, (0, 0, 0, 3, 9).
    There are 217 total states, with IDs in [0, 216].
    Each non-absorbing state has a unique number, which is: (f + 2*s + 4*t + 8*o + 24*(i-1)).
    The absorbing state has ID 216.
    """
    def __init__(self, stateID):
        """
        Decode a state ID into its (f, s, t, o, i) components.

        :param stateID: int in [0, 216].
        :raises ValueError: if stateID lies outside [0, 216]; such IDs used to be
            decoded silently into meaningless base/out/inning values.
        """
        if not 0 <= stateID <= 216:
            raise ValueError('stateID must be in [0, 216], got ' + repr(stateID))
        self.id = stateID
        if stateID == 216:
            # Absorbing "game over" state; it does not follow the ID formula.
            self.i = 9
            self.o = 3
            self.t = 0
            self.s = 0
            self.f = 0
        else:
            # Peel the components off from most to least significant.
            inningIndex, remainder = divmod(stateID, 24)
            self.i = inningIndex + 1
            self.o, remainder = divmod(remainder, 8)
            self.t, remainder = divmod(remainder, 4)
            self.s, self.f = divmod(remainder, 2)


    def walk(self):
        """
        :returns: (int, int). The stateID of the new state and number of runs scored when the game is in state self and the
        batter walks, is intentionally walked or is hit by a pitch.
        """
        if self.f != 1:
            # First base is open: the batter takes it and no runner is forced to move.
            return (getID(1, self.s, self.t, self.o, self.i), 0)
        if self.s != 1:
            # The runner on first is forced to second; third base is unchanged.
            return (getID(1, 1, self.t, self.o, self.i), 0)
        # First and second are occupied, so the bases end up loaded;
        # a runner who was already on third is forced home.
        if self.t == 1:
            return (getID(1, 1, 1, self.o, self.i), 1)
        return (getID(1, 1, 1, self.o, self.i), 0)


    def single(self):
        """
        :returns: (int, int). The stateID of the new state and number of runs scored when the game is in state self and the
        batter hits a single. Every runner advances exactly one base.
        """
        return (getID(1, self.f, self.s, self.o, self.i), self.t)


    def double(self):
        """
        :returns: (int, int). The stateID of the new state and number of runs scored when the game is in state self and the
        batter hits a double. Every runner advances exactly two bases.
        """
        return (getID(0, 1, self.f, self.o, self.i), self.s + self.t)


    def triple(self):
        """
        :returns: (int, int). The stateID of the new state and number of runs scored when the game is in state self and the
        batter hits a triple. All runners score and the batter ends on third.
        """
        return (getID(0, 0, 1, self.o, self.i), self.f + self.s + self.t)


    def homeRun(self):
        """
        :returns: (int, int). The stateID of the new state and number of runs scored when the game is in state self and the
        batter hits a home run. The batter and all runners score.
        """
        return (getID(0, 0, 0, self.o, self.i), 1 + self.f + self.s + self.t)


    def out(self):
        """
        :returns: (int, int). The stateID of the new state and number of runs scored when the game is in state self and the
        batter bats into an out.
        """
        if self.o == 2:
            # Transition to the next inning. In the 9th inning this yields
            # 24 * 9 = 216, which is exactly the absorbing state's ID.
            return (getID(0, 0, 0, 0, self.i + 1), 0)
        else:
            return (getID(self.f, self.s, self.t, self.o + 1, self.i), 0)

    def doublePlay(self):
        """
        :returns: (int, int). The stateID of the new state and number of runs scored when the game is in state self and the
        batter bats into a double play.
        """
        if self.o >= 1:
            # Two more outs end the half-inning: transition to the next inning
            # (or to the absorbing state 216 from the 9th).
            return (getID(0, 0, 0, 0, self.i + 1), 0)
        else:
            # NOTE(review): the runners are left where they were, so no runner is
            # removed from the bases by the double play. Confirm this
            # simplification is intended.
            return (getID(self.f, self.s, self.t, self.o + 2, self.i), 0)

class Player:
    """
    A baseball player (batter) described by plate-appearance outcome probabilities.
    """
    def __init__(self, playerID, name, first, second, third, bb, homerun, outs, double):
        """
        :param playerID: int. Unique identifier of the player.
        :param name: string. Name of the player.
        :param first: float. Probability of a single.
        :param second: float. Probability of a double.
        :param third: float. Probability of a triple.
        :param bb: float. Probability of a walk.
        :param homerun: float. Probability of a home run.
        :param outs: float. Probability of an out.
        :param double: float. Probability of a double play.
        """
        self.id, self.name = playerID, name
        # Hit probabilities, by number of bases.
        self.first, self.second, self.third = first, second, third
        self.homerun = homerun
        # Non-hit outcomes. Note that `double` is the double-play probability,
        # not the two-base-hit probability (that one is `second`).
        self.bb = bb
        self.outs = outs
        self.double = double

    def transitionMatrixSimple(self):
        """
        Computes the transition matrices of this player.
        :return: numpy (5, 217, 217) array. Entry [r][a][b] is the probability of
            moving from state a to state b while scoring r runs in one plate appearance.
        """
        # p[r]: transition matrix for plate appearances on which r runs score
        p = np.zeros((5, 217, 217))

        # With 3 outs in the 9th inning the state no longer changes
        p[0][216][216] = 1

        # Fill in the transitions out of every non-absorbing state
        for i in range(216):
            currState = State(i)
            # Each outcome is paired with its probability, in the order:
            # walk, single, double, triple, home run, out, double play.
            outcomes = (
                (currState.walk, self.bb),
                (currState.single, self.first),
                (currState.double, self.second),
                (currState.triple, self.third),
                (currState.homeRun, self.homerun),
                (currState.out, self.outs),
                (currState.doublePlay, self.double),
            )
            for transition, probability in outcomes:
                nextState, runs = transition()
                # Accumulate: different outcomes can lead to the same (runs, nextState).
                p[runs][i][nextState] += probability
        return p
    
def expectedRuns(lineup):
    """
    Computes the run distribution of a given baseball lineup.
    :param lineup: list of batters, in batting order. Each element must provide
        a transitionMatrixSimple() method.
    :return: np.array. An array containing 21 elements. The i-th element is the probability
        that the lineup has scored i runs when the game has ended.
    """
    perBatterMatrices = [batter.transitionMatrixSimple() for batter in lineup]
    finalDistribution = simulateMarkovChain(perBatterMatrices)
    # Column 216 is the absorbing "game over" state.
    return finalDistribution[:, 216]


def simulateMarkovChain(transitionMatrices):
    """
    Finds the near-steady state distribution of the MC representing our baseball game.

    The chain starts in state 0 with 0 runs and is advanced one plate appearance
    at a time, cycling through the batters in order. It stops once at least 0.999
    of the probability mass sits in the absorbing state 216, or after 1000 plate
    appearances.

    :param transitionMatrices: [numpy array]. One (R, 217, 217) array per batter, in batting
        order, where slab r holds the transitions that score r runs. Normally 9 batters with
        R = 5, but any non-empty lineup length is accepted.
    :return: numpy 21x217 array. The i-th row in the array represents the states where i runs
        have been scored. Probability mass that would exceed 20 runs is dropped.
    """
    numBatters = len(transitionMatrices)
    maxRuns = 21
    u = np.zeros((maxRuns, 217))
    u[0][0] = 1
    iterations = 0
    batter = 0
    while u[:, 216].sum() < 0.999 and iterations < 1000:
        p = transitionMatrices[batter]
        next_u = np.zeros_like(u)
        for runsScored in range(min(len(p), maxRuns)):
            # Rows of u with k runs move to row k + runsScored. One matrix product
            # handles every k at once instead of one vector product per row.
            next_u[runsScored:] += u[:maxRuns - runsScored] @ p[runsScored]
        u = next_u
        # Wrap around by the actual lineup length rather than a hard-coded 9.
        batter = (batter + 1) % numBatters
        iterations += 1
    return u

def teamExpectedRuns(teamName, lineup):
    """
    Print a report of a lineup's run distribution and return it.
    :param teamName: string. Team name shown in the report header.
    :param lineup: list of batters, in batting order; each needs a `name`
        attribute and whatever expectedRuns() requires.
    :return: (u, expRuns). u is the array returned by expectedRuns(lineup);
        expRuns is the sum of i * u[i] over i in 0..20.
    """
    print('\nTeam: ' + teamName + '\n')
    batterNames = [batter.name for batter in lineup]
    print('Best lineup found: ' + str(batterNames) + '\n')

    scoreProbabilities = expectedRuns(lineup)
    # The entries sum to the probability that the game reached the absorbing state.
    print('Probability of the game having ended: ' + str(sum(scoreProbabilities)) + '\n')

    print('Probability of each score:')
    expRuns = 0
    for runs in range(21):
        probability = scoreProbabilities[runs]
        expRuns += runs * probability
        print(str(runs) + ': ' + str(probability))

    print('\nExpected number of runs: ' + str(expRuns) + '\n')
    return (scoreProbabilities, expRuns)

def expectedRemainingRuns_RP(lineup_sp, lineup_rp, batterUp, startState, rpStartInning=6):
    """
    Computes the expected number of runs a team will score from a given point in a game,
    using one set of batter probabilities for the innings pitched by the starting pitcher
    and another for the innings pitched by relief pitchers.

    The previous body read an undefined name `lineup` and ignored both lineup
    parameters; it now uses them.

    :param lineup_sp: list of batters, in batting order, used for states in innings
        before rpStartInning. Each element must provide transitionMatrixSimple().
    :param lineup_rp: the same batting order, used for states in inning rpStartInning
        and later. Must have the same length as lineup_sp.
    :param batterUp: An integer index into the lineup, representing whose turn to bat it is.
    :param startState: The state the game is in (only its `id` attribute is read).
    :param rpStartInning: first inning in which the lineup_rp probabilities apply.
        NOTE(review): the default of 6 is an assumption about when the starter is
        replaced - confirm against the intended model. Values <= 1 use lineup_rp for
        the whole game; values >= 10 use lineup_sp for the whole game.
    :return: The expected number of runs the team will score from startState.
    :raises ValueError: if the two lineups differ in length.
    """
    if len(lineup_sp) != len(lineup_rp):
        raise ValueError('lineup_sp and lineup_rp must have the same length')
    numBatters = len(lineup_sp)

    # Non-absorbing states of inning k occupy rows 24*(k-1) .. 24*k - 1, so every
    # row from this index up to 215 belongs to an inning pitched by the relievers.
    firstReliefRow = min(max(24 * (rpStartInning - 1), 0), 216)

    transitionsMatrices = []
    for batterVsStarter, batterVsReliever in zip(lineup_sp, lineup_rp):
        # np.array copies, so the batter's own matrix is never modified.
        combined = np.array(batterVsStarter.transitionMatrixSimple(), dtype=float)
        reliefMatrix = np.asarray(batterVsReliever.transitionMatrixSimple())
        combined[:, firstReliefRow:216, :] = reliefMatrix[:, firstReliefRow:216, :]
        transitionsMatrices.append(combined)

    maxRuns = 21
    u = np.zeros((maxRuns, 217))
    u[0][startState.id] = 1
    iterations = 0
    batter = batterUp
    # Advance one plate appearance at a time until (almost) all probability mass
    # has reached the absorbing state 216, or 1000 plate appearances have passed.
    while u[:, 216].sum() < 0.999 and iterations < 1000:
        p = transitionsMatrices[batter]
        next_u = np.zeros_like(u)
        for runsScored in range(min(len(p), maxRuns)):
            # Rows of u with k runs move to row k + runsScored.
            next_u[runsScored:] += u[:maxRuns - runsScored] @ p[runsScored]
        u = next_u
        batter = (batter + 1) % numBatters
        iterations += 1

    finalDistribution = u[:, 216]
    expRuns = 0
    for runs in range(maxRuns):
        expRuns += runs * finalDistribution[runs]
    return expRuns

In [3]:
# Fresh start distribution: 21 run totals x 217 states, with all probability
# mass on (0 runs, state 0).
u = np.zeros(shape=(21, 217))
u[0, 0] = 1
print(u)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
# Load the 2024 batting and pitching records through the project-local
# `preprocessor` module.
bat_recode_2024 = preprocessor.bat_recode(2024)
pitch_recode_2024 = preprocessor.pitch_recode(2024)

# Temporary workaround for the lack of sinker data: every row of the sinker
# columns is set to one fixed value.
# NOTE(review): the constants look like precomputed averages - confirm where they
# come from and consider naming them in a config cell.
bat_recode_2024['구종가치/100 (싱커)'] = -0.899598784
pitch_recode_2024['Sinker Velocity, 평균구속 (싱커)'] = 126.1545879
pitch_recode_2024['Sinker Pitch Value per 100, 구종가치/100 (싱커)'] = -1.485315299

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = pd.to_numeric(data[col], errors='coerce')
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


15056
12995
13168
16121
15530
10184
10892
14160
10189
11333
14996
15034
15048
10035
10106
10643
15000
13112
11099
12562
10640
16110
14147
11376
12546
10266
14797
12516
14125
12534
10232
10894
16066
10008
14796
10235
10187
10195
12585
12905
10475
10170
10312
12916
10238
10108
10470
10165
15472
11172
14591
15652
13128
14618
16128
14590
10804
16038
11137
12583
14220
14806
12560
14716
10344
14133
14642
10840
13073
10082
14114
11261
14606
13145
15532
15484
15132
11298
10014
10707
14612
14221
10387
10913
15862
11397
15035
14151
14707
13006
13082
13081
15036
12549
14117
10249
11339
12936
11233
14807
13261
11414
15499
10400
14888
10891
11148
11226
12539
12613
10810
10407
11190
11213
11165
12894
16106
12988
14785
13154
13137
12587
16042
14867
12524
10753
13113
15253
10174
15422
10459
10684
12922
11153
10253
10261
11162
10636
11225
10180
14137
11212
14699
14587
10320
14170
10855
13056
16124
10182
13223
12943
11087
14706
12522
11215
14765
10815
10870
14140


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = pd.to_numeric(data[col], errors='coerce')
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


15475
11410
14792
11236
14747
11489
10203
15486
11415
13941
15028
15143
11411
15057
13152
12537
13942
14173
10427
10825
10783
15435
14143
14113
14155
10863
11166
10795
15067
10058
10871
14480
16067
10696
14798
11126
16122
10220
11379
16138
12532
10453
13061
13092
15063
14805
10367
15508
14871
16028
14581
16064
16107
15127
16088
10126
12852
15099
14576
14808
15146
10690
15461
15860
12858
14764
15432
10452
14616
10685
12295
13080
15644
16089
15531
16125
12944
13085
13088
10909
16065
10131
10527
13934
10737
15153
11229
14108
12908
11168
11164
10124
13003
12918
11303
12568
11310
11318
16087
15861
12850
10217
10590
14608
16108
15089
14128
11317
14624
14769
16023
13228
15011
14132
14776
11364
15509
11222
12871
13167
15462
11300
11232
10535
11355
14669
15455
14788
15643
10437
13169
10812
10652
14109
15496
13071
15013
13126
11323
10749
15071
13268
11242
12565
10523
16146
16141
12930
14156
11291
16027
10610
12847


In [10]:
# Home starter lookup: keep the rows whose pitcher name is '임찬규' and whose
# '(P) 투타' value is 0.0, then read the first column of the first matching row.
# NOTE(review): `.iloc[0, 0]` is positional and raises IndexError when nothing matches.
home_start_pitcher_num = pitch_recode_2024[
    (pitch_recode_2024['투수 이름'] == '임찬규')
    & (pitch_recode_2024['(P) 투타'] == 0.0)
].iloc[0, 0]


In [11]:
# Show the pitcher number found by the lookup in the previous cell.
home_start_pitcher_num

'10652'

In [14]:
# List the column labels of the pitching table.
pitch_recode_2024.columns

Index(['Pitcher Number', '투수 이름', '(P) Left ball%, 전체 좌측 타구 비율',
       '(P) Left center ball%, 전체 좌중앙 타구 비율', '(P) Center ball%, 전체 중앙 타구 비율',
       '(P) Right center ball%, 전체 우중앙 타구 비율', '(P) Right ball%, 전체 우측 타구 비율',
       '(P) Pull-side ball%, 전체 당겨친 타구 비율',
       '2-Seamer Fastball Velocity, 평균구속 (투심)',
       '4-Seamer Fastball Velocity, 평균구속 (포심)', 'Cutter Velocity, 평균구속 (커터)',
       'Curve Velocity, 평균구속 (커브)', 'Slider Velocity, 평균구속 (슬라이더)',
       'Changeup Velocity, 평균구속 (체인지업)', 'Sinker Velocity, 평균구속 (싱커)',
       'Forkball Velocity, 평균구속 (포크볼)',
       '2-Seamer Fastball Pitch Value per 100, 구종가치/100 (투심)',
       '4-Seamer Fastball Pitch Value per 100, 구종가치/100 (포심)',
       'Cutter Pitch Value per 100, 구종가치/100 (커터)',
       'Curve Pitch Value per 100, 구종가치/100 (커브)',
       'Slider Pitch Value per 100, 구종가치/100 (슬라이더)',
       'Changeup Pitch Value per 100, 구종가치/100 (체인지업)',
       'Sinker Pitch Value per 100, 구종가치/100 (싱커)',
       'Forkball Pitch Value per 100,

In [15]:
# List the column labels of the batting table.
bat_recode_2024.columns

Index(['Hitter Number', '타자 이름', '구종가치/100 (투심)', '구종가치/100 (포심)',
       '구종가치/100 (커터)', '구종가치/100 (커브)', '구종가치/100 (슬라이더)', '구종가치/100 (체인지업)',
       '구종가치/100 (싱커)', '구종가치/100 (포크볼)', 'Strike%, 전체 투구 대비 스트라이크',
       'Called Strike%, 전체 투구 대비 루킹 스트라이크%', 'Whiff%, 전체 투구 대비 헛스윙 스트라이크%',
       'CSW%, 전체 투구 대비 루킹+헛스윙 스트라이크%', 'Swing%, 스윙 비율', '스윙 대비 콘택트 비율',
       '스윙 대비 헛스윙 비율', '초구 스트라이크 비율', '초구 스윙 비율', '투스트라이크 카운트 투구 대비 삼진 결정 비율',
       'Strike Zone%, 존 안에 들어온 투구 비율',
       'Strike Zone Swing%, 존 안에 들어온 투구 대비 스윙 비율',
       'Strike Zone Contact%, 존 안에 들어온 투구 대비 콘택트 비율',
       'Out Zone%, 존 밖에 들어온 투구 비율', 'Out Zone Swing%, 존 밖에 들어온 투구 대비 스윙 비율',
       'Out Zone Contact%, 존 밖에 들어온 투구 대비 콘택트 비율',
       'Meatball Zone%, 존 한가운데 들어온 투구 비율',
       'Meatball Swing%, 존 한가운데 들어온 투구 대비 스윙 비율',
       'Shadow Zone%, 쉐도우존에 들어온 투구 비율', 'Looking Strike Out%, 루킹 삼진 비율',
       'Batting Average on Balls In Play, 인플레이 타구의 안타 비율', '땅볼%', '내야 뜬볼%',
       '외야 뜬볼%', '뜬볼%', '라인드라이브%', '홈런 / 뜬볼%

In [20]:
# Coerce every column after the first two to a numeric dtype; values that
# cannot be parsed become NaN.
numeric_candidates = pitch_recode_2024.columns[2:]
pitch_recode_2024[numeric_candidates] = (
    pitch_recode_2024[numeric_candidates].apply(pd.to_numeric, errors='coerce')
)

In [21]:
# Render the pitching table after the numeric conversion in the previous cell.
# NOTE(review): this displays the whole frame; `.head()` would keep the output small.
pitch_recode_2024

Unnamed: 0,Pitcher Number,투수 이름,"(P) Left ball%, 전체 좌측 타구 비율","(P) Left center ball%, 전체 좌중앙 타구 비율","(P) Center ball%, 전체 중앙 타구 비율","(P) Right center ball%, 전체 우중앙 타구 비율","(P) Right ball%, 전체 우측 타구 비율","(P) Pull-side ball%, 전체 당겨친 타구 비율","2-Seamer Fastball Velocity, 평균구속 (투심)","4-Seamer Fastball Velocity, 평균구속 (포심)",...,"(P) Strike Zone%, 존 안에 들어온 투구 비율","(P) Strike Zone Swing%, 존 안에 들어온 투구 대비 스윙 비율","(P) Strike Zone Contact%, 존 안에 들어온 투구 대비 콘택트 비율","(P) Out Zone%, 존 밖에 들어온 투구 비율","(P) Out Zone Swing%, 존 밖에 들어온 투구 대비 스윙 비율","(P) Out Zone Contact%, 존 밖에 들어온 투구 대비 콘택트 비율","(P) Meatball Zone%, 존 한가운데 들어온 투구 비율","(P) Meatball Swing%, 존 한가운데 들어온 투구 대비 스윙 비율","(P) Looking Strike Out%, 루킹 삼진 비율",(P) 투타
0,15475,박윤성,33.3,14.3,23.8,16.7,11.9,43.8,133.8,139.3,...,39.9,68.1,59.3,60.1,18.2,13.1,7.5,82.4,45.5,0.00
1,11410,김민우,17.2,17.2,13.8,34.5,17.2,64.0,133.8,141.5,...,41.4,72.7,58.4,58.6,31.2,19.3,4.3,87.5,15.4,0.00
2,14792,김진욱,26.1,28.3,6.5,19.6,19.6,65.1,133.8,141.3,...,43.8,61.6,56.3,56.3,29.2,18.8,6.6,58.8,25.0,1.00
3,11236,박민호,22.6,20.8,24.5,17.0,15.1,45.0,133.8,134.3,...,44.0,67.5,57.9,56.0,31.7,26.9,2.7,85.7,33.3,0.25
4,14747,김재열,20.9,15.1,19.8,26.7,17.4,58.0,133.8,143.5,...,43.0,61.3,55.4,57.0,34.9,17.9,6.3,74.3,18.9,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,14156,전사민,18.2,22.7,13.6,27.3,18.2,60.5,145.0,146.1,...,46.3,67.3,60.4,53.7,20.5,17.9,6.0,46.2,50.0,0.00
149,11291,구승민,21.1,26.3,14.0,21.1,17.5,65.3,133.8,143.8,...,39.7,67.1,55.0,60.3,25.4,13.1,6.2,81.8,25.0,0.00
150,16027,원상현,25.0,16.7,19.9,15.4,23.1,61.6,133.8,143.6,...,45.1,62.7,54.9,54.9,24.3,13.4,8.5,73.2,27.8,0.00
151,10610,장민재,27.6,15.8,27.6,14.5,14.5,72.7,133.8,135.5,...,43.7,65.4,60.4,56.3,27.3,15.6,6.6,75.0,7.1,0.00


In [22]:
# Keep only the numeric columns
numeric_cols = pitch_recode_2024.select_dtypes(include='number')

# Column-wise mean of those columns
numeric_means = numeric_cols.mean(axis=0)

print(numeric_means)

(P) Left ball%, 전체 좌측 타구 비율                              21.970588
(P) Left center ball%, 전체 좌중앙 타구 비율                      20.150327
(P) Center ball%, 전체 중앙 타구 비율                            18.288889
(P) Right center ball%, 전체 우중앙 타구 비율                     19.849673
(P) Right ball%, 전체 우측 타구 비율                             19.743791
(P) Pull-side ball%, 전체 당겨친 타구 비율                        59.475163
2-Seamer Fastball Velocity, 평균구속 (투심)                   135.901961
4-Seamer Fastball Velocity, 평균구속 (포심)                   142.858170
Cutter Velocity, 평균구속 (커터)                              133.318301
Curve Velocity, 평균구속 (커브)                               117.137908
Slider Velocity, 평균구속 (슬라이더)                            130.239869
Changeup Velocity, 평균구속 (체인지업)                          124.558824
Sinker Velocity, 평균구속 (싱커)                              126.154588
Forkball Velocity, 평균구속 (포크볼)                           127.091503
2-Seamer Fastball Pitch Value per 100, 구종가치/100 (투심)     -3.09

In [27]:
# Mean of every numeric column
numeric_means = pitch_recode_2024.select_dtypes(include='number').mean()

# Non-numeric columns cannot be averaged, so take the first row's values instead.
# NOTE(review): the resulting "average" row therefore carries the first pitcher's
# values in the non-numeric columns - confirm nothing downstream treats them as real.
non_numeric_data = pitch_recode_2024.select_dtypes(exclude='number').iloc[0]

# Combine the means and the non-numeric values into a single Series
mean_data = pd.concat([numeric_means, non_numeric_data])

# Turn the Series into a one-row DataFrame. The mixed Series has object dtype, so
# without infer_objects() every column - including the numeric means - would be
# left as object dtype.
away_start_pitcher_data = pd.DataFrame(mean_data).transpose().infer_objects()

In [28]:
# Inspect the one-row frame built in the previous cell.
away_start_pitcher_data

Unnamed: 0,"(P) Left ball%, 전체 좌측 타구 비율","(P) Left center ball%, 전체 좌중앙 타구 비율","(P) Center ball%, 전체 중앙 타구 비율","(P) Right center ball%, 전체 우중앙 타구 비율","(P) Right ball%, 전체 우측 타구 비율","(P) Pull-side ball%, 전체 당겨친 타구 비율","2-Seamer Fastball Velocity, 평균구속 (투심)","4-Seamer Fastball Velocity, 평균구속 (포심)","Cutter Velocity, 평균구속 (커터)","Curve Velocity, 평균구속 (커브)",...,"(P) Strike Zone Contact%, 존 안에 들어온 투구 대비 콘택트 비율","(P) Out Zone%, 존 밖에 들어온 투구 비율","(P) Out Zone Swing%, 존 밖에 들어온 투구 대비 스윙 비율","(P) Out Zone Contact%, 존 밖에 들어온 투구 대비 콘택트 비율","(P) Meatball Zone%, 존 한가운데 들어온 투구 비율","(P) Meatball Swing%, 존 한가운데 들어온 투구 대비 스윙 비율","(P) Looking Strike Out%, 루킹 삼진 비율",(P) 투타,Pitcher Number,투수 이름
0,21.970588,20.150327,18.288889,19.849673,19.743791,59.475163,135.901961,142.85817,133.318301,117.137908,...,57.314379,55.628758,28.536601,18.367974,6.373856,73.150327,24.944444,0.308824,15475,박윤성


In [29]:
# Number of columns in the pitching table.
len(pitch_recode_2024.columns)

44