In [1]:
from bs4 import BeautifulSoup
import requests

url = 'https://www.mlb.com/starting-lineups/2024-06-29'

response = requests.get(url)
dom = BeautifulSoup(response.content, 'html.parser')

In [2]:
match_data = dom.find_all(attrs={'class':'starting-lineups__teams starting-lineups__teams--xs starting-lineups__teams--md starting-lineups__teams--lg'})
start_pitcher_data = dom.find_all(attrs={'class':'starting-lineups__pitcher--link'})
pitcher_split_data = dom.find_all(attrs={'class':'starting-lineups__pitcher-pitch-hand'})

In [3]:
import preprocessor

bat_recode, _ = preprocessor.bat_recode(2024)
pitch_recode, _ = preprocessor.pitch_recode(2024)
home_fielding_recode, _ = preprocessor.fielding_recode(2024,'home')
away_fielding_recode, _ = preprocessor.fielding_recode(2024,'away')

results = []

Gathering player lookup table. This may take a moment.


In [4]:
def depth_num(team_short_name):
    team_long_names = {'COL':'rockies', 'BOS':'redsox', 'KC':'royals', 'CIN':'reds', 'TEX':'rangers', 'WSH':'nationals', 
                        'LAA':'angels', 'STL':'cardinals', 'HOU':'astros', 'ATL':'braves', 'PHI':'phillies', 'MIN':'twins', 
                        'TOR':'bluejays', 'AZ':'dbacks', 'CHC':'cubs', 'PIT':'pirates', 'MIA':'marlins', 'CWS':'whitesox',
                        'LAD':'dodgers', 'MIL':'brewers', 'NYY':'yankees', 'BAL':'orioles', 'DET':'tigers', 'OAK':'athletics', 
                        'TB':'rays', 'CLE':'guardians', 'SF':'giants', 'SD':'padres', 'NYM':'mets', 'SEA':'mariners'}
    
    team_name = team_long_names[team_short_name]
    
    url = f'https://www.mlb.com/{team_name}/roster'

    response = requests.get(url)
    dom = BeautifulSoup(response.content, 'html.parser')
    
    depth_data = dom.find_all(attrs={'class':'info'})
    player_num_list = []
    for i in range(len(depth_data)):
        player_num = list(list(depth_data[i])[1].attrs.values())[0][-6:]
        player_num = int(player_num)
        player_num_list.append(player_num)
    
    return player_num_list

In [5]:
def pitcher_batter_aug(pitcher_data, batter_data, fielding_data, home_team, away_team):
    """
    투수와 타자 데이터를 결합하여 필요한 데이터를 생성합니다.
    :param pitcher_data: 투수 데이터.
    :param batter_data: 타자 데이터.
    :param home_team: 홈팀 경기장 정보.
    :return: 결합된 데이터 리스트.
    """
    batter_pitcher_list = []
    for pitcher_num in list(pitcher_data['pitcher_key_mlbam']):
        batter_pitcher = pd.DataFrame()
        for player_num in list(batter_data['batter_key_mlbam']):
            row = pd.DataFrame({
                'home_team':[home_team],
                'away_team':[away_team],
                'pitcher_key_mlbam': [pitcher_num],
                'batter_key_mlbam': [player_num],
            })
            batter_pitcher = pd.concat([batter_pitcher, row], ignore_index=True)
        batter_pitcher = pd.merge(batter_pitcher,pitcher_data)
        batter_pitcher = pd.merge(batter_pitcher,batter_data)
        batter_pitcher = pd.merge(batter_pitcher,fielding_data)
        print('파크팩터 추가')
        park_factors = {'COL':112, 'BOS':107, 'KC':105, 'CIN':104, 'TEX':102, 'WSH':102, 'LAA':101, 'STL':101, 'HOU':101,
                    'ATL':101, 'PHI':101, 'MIN':101, 'TOR':100, 'AZ':100, 'CHC':100, 'PIT':100, 'MIA':100, 'CWS':99, 
                    'LAD':99, 'MIL':99, 'NYY':99, 'BAL':98, 'DET':98, 'OAK':97, 'TB':97, 'CLE':96, 'SF':96, 'SD':96, 
                    'NYM':95, 'SEA':92}
        batter_pitcher['park_factor'] = park_factors[home_team]
        
        # 필요한 열 선택
        columns_to_select = [
            'home_team', 'away_team', 'batter_key_mlbam', 'pitcher_key_mlbam', 'IDfg', '(P) IDfg', 'key_bbref', '(P) key_bbref',
            'Name', '(P) Name', 'Team', '(P) Team', 'bat_split', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'Spd', 'BsR',
            'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
            'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'Pull%', 'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'wFA/C (sc)', 'wFC/C (sc)',
            'wFS/C (sc)', 'wFO/C (sc)', 'wSI/C (sc)', 'wSL/C (sc)', 'wCU/C (sc)', 'wKC/C (sc)', 'wCH/C (sc)', 'wKN/C (sc)', 'O-Swing% (sc)',
            'Z-Swing% (sc)', 'Swing% (sc)', 'O-Contact% (sc)', 'Z-Contact% (sc)', 'Contact% (sc)', 'Zone% (sc)', 'LD+%', 'GB%+', 'FB%+',
            'HR/FB%+', 'Pull%+', 'Cent%+', 'Oppo%+', 'Soft%+', 'Med%+', 'Hard%+', 'EV', 'LA', 'Barrel%', 'maxEV', 'HardHit%', 'CStr%', 'CSW%',
            'wCH/C (pi)', 'wCU/C (pi)', 'wFA/C (pi)', 'wFC/C (pi)', 'wFS/C (pi)', 'wKN/C (pi)', 'wSI/C (pi)', 'wSL/C (pi)', 'O-Swing% (pi)',
            'Z-Swing% (pi)', 'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)', 'Zone% (pi)', 'Pace', 'UBR', 'anglesweetspotpercent',
            'ev50', 'fbld', 'max_distance', 'avg_distance', 'avg_hr_distance', 'ev95percent', '(P) pitch_split', '(P) GB/FB', '(P) LD%',
            '(P) GB%', '(P) FB%', '(P) IFFB%', '(P) HR/FB', '(P) FB% 2', '(P) FBv', '(P) SL%', '(P) SLv', '(P) CT%', '(P) CTv', '(P) CB%',
            '(P) CBv', '(P) CH%', '(P) CHv', '(P) SF%', '(P) SFv', '(P) KN%', '(P) KNv', '(P) wFB/C', '(P) wSL/C', '(P) wCT/C', '(P) wCB/C',
            '(P) wCH/C', '(P) wSF/C', '(P) wKN/C', '(P) O-Swing%', '(P) Z-Swing%', '(P) Swing%', '(P) O-Contact%', '(P) Z-Contact%', '(P) Contact%',
            '(P) Zone%', '(P) F-Strike%', '(P) SwStr%', '(P) FA% (sc)', '(P) FC% (sc)', '(P) FS% (sc)', '(P) FO% (sc)', '(P) SI% (sc)', '(P) SL% (sc)',
            '(P) CU% (sc)', '(P) KC% (sc)', '(P) CH% (sc)', '(P) KN% (sc)', '(P) vFA (sc)', '(P) vFC (sc)', '(P) vFS (sc)', '(P) vFO (sc)', '(P) vSI (sc)',
            '(P) vSL (sc)', '(P) vCU (sc)', '(P) vKC (sc)', '(P) vCH (sc)', '(P) vKN (sc)', '(P) FA-X (sc)', '(P) FC-X (sc)', '(P) FS-X (sc)', '(P) FO-X (sc)',
            '(P) SI-X (sc)', '(P) SL-X (sc)', '(P) CU-X (sc)', '(P) KC-X (sc)', '(P) CH-X (sc)', '(P) KN-X (sc)', '(P) FA-Z (sc)', '(P) FC-Z (sc)', '(P) FS-Z (sc)',
            '(P) FO-Z (sc)', '(P) SI-Z (sc)', '(P) SL-Z (sc)', '(P) CU-Z (sc)', '(P) KC-Z (sc)', '(P) CH-Z (sc)', '(P) KN-Z (sc)', '(P) wFA/C (sc)', '(P) wFC/C (sc)',
            '(P) wFS/C (sc)', '(P) wFO/C (sc)', '(P) wSI/C (sc)', '(P) wSL/C (sc)', '(P) wCU/C (sc)', '(P) wKC/C (sc)', '(P) wCH/C (sc)', '(P) wKN/C (sc)', '(P) O-Swing% (sc)',
            '(P) Z-Swing% (sc)', '(P) Swing% (sc)', '(P) O-Contact% (sc)', '(P) Z-Contact% (sc)', '(P) Contact% (sc)', '(P) Zone% (sc)', '(P) LD%+', '(P) GB%+', '(P) FB%+',
            '(P) HR/FB%+', '(P) Pull%+', '(P) Cent%+', '(P) Oppo%+', '(P) Soft%+', '(P) Med%+', '(P) Hard%+', '(P) EV', '(P) LA', '(P) Barrel%', '(P) maxEV', '(P) HardHit%',
            '(P) CStr%', '(P) CSW%', '(P) botOvr CH', '(P) botStf CH', '(P) botCmd CH', '(P) botOvr CU', '(P) botStf CU', '(P) botCmd CU', '(P) botOvr FA', '(P) botStf FA',
            '(P) botCmd FA', '(P) botOvr SI', '(P) botStf SI', '(P) botCmd SI', '(P) botOvr SL', '(P) botStf SL', '(P) botCmd SL', '(P) botOvr KC', '(P) botStf KC',
            '(P) botCmd KC', '(P) botOvr FC', '(P) botStf FC', '(P) botCmd FC', '(P) botOvr FS', '(P) botStf FS', '(P) botCmd FS', '(P) botOvr', '(P) botStf', '(P) botCmd',
            '(P) botxRV100', '(P) Stf+ CH', '(P) Loc+ CH', '(P) Pit+ CH', '(P) Stf+ CU', '(P) Loc+ CU', '(P) Pit+ CU', '(P) Stf+ FA', '(P) Loc+ FA', '(P) Pit+ FA', '(P) Stf+ SI',
            '(P) Loc+ SI', '(P) Pit+ SI', '(P) Stf+ SL', '(P) Loc+ SL', '(P) Pit+ SL', '(P) Stf+ KC', '(P) Loc+ KC', '(P) Pit+ KC', '(P) Stf+ FC', '(P) Loc+ FC', '(P) Pit+ FC',
            '(P) Stf+ FS', '(P) Loc+ FS', '(P) Pit+ FS', '(P) Stf+ FO', '(P) Loc+ FO', '(P) Pit+ FO', '(P) Stuff+', '(P) Location+', '(P) Pitching+', '(P) CH% (pi)', '(P) CU% (pi)',
            '(P) FA% (pi)', '(P) FC% (pi)', '(P) FS% (pi)', '(P) KN% (pi)', '(P) SI% (pi)', '(P) SL% (pi)', '(P) vCH (pi)', '(P) vCU (pi)', '(P) vFA (pi)', '(P) vFC (pi)', '(P) vFS (pi)',
            '(P) vKN (pi)', '(P) vSI (pi)', '(P) vSL (pi)', '(P) CH-X (pi)', '(P) CU-X (pi)', '(P) FA-X (pi)', '(P) FC-X (pi)', '(P) FS-X (pi)', '(P) KN-X (pi)', '(P) SI-X (pi)',
            '(P) SL-X (pi)', '(P) CH-Z (pi)', '(P) CU-Z (pi)', '(P) FA-Z (pi)', '(P) FC-Z (pi)', '(P) FS-Z (pi)', '(P) KN-Z (pi)', '(P) SI-Z (pi)', '(P) SL-Z (pi)', '(P) wCH/C (pi)',
            '(P) wCU/C (pi)', '(P) wFA/C (pi)', '(P) wFC/C (pi)', '(P) wFS/C (pi)', '(P) wKN/C (pi)', '(P) wSI/C (pi)', '(P) wSL/C (pi)', '(P) O-Swing% (pi)', '(P) Z-Swing% (pi)',
            '(P) Swing% (pi)', '(P) O-Contact% (pi)', '(P) Z-Contact% (pi)', '(P) Contact% (pi)', '(P) Zone% (pi)', '(P) Pace', '(P) anglesweetspotpercent', '(P) ev50', '(P) fbld',
            '(P) max_distance', '(P) avg_distance', '(P) avg_hr_distance', '(P) ev95percent', 'rSZ/G', 'rCERA/G', 'rTS/G', 'rSB/G', 'rGDP/G', 'rARM/G', 'rGFP/G', 'rPM/G', 'DRS/G',
            'ARM/G', 'DPR/G', 'RngR/G', 'ErrR/G', 'UZR/G', 'Def/G', 'FRM/G', 'OAA/G', 'Range/G', 'park_factor'
        ]

        batter_pitcher = batter_pitcher[columns_to_select]
        batter_pitcher_list.append(batter_pitcher)
        
    return batter_pitcher_list

In [6]:
import tensorflow as tf
from pickle import load

def make_prob_lineup(data_list, load_scaler, model):
    """
    타자와 투수 데이터를 사용하여 확률적 라인업을 생성합니다.
    :param data_list: 결합된 데이터 리스트.
    :param load_scaler: 스케일러 객체.
    :param model: 예측 모델.
    :return: 확률적 라인업 리스트.
    """
    lineup_list = []
    for data in data_list:
        x = data.iloc[:,12:].values
        x = load_scaler.transform(x)
        y_predict = model.predict(x)
        
        lineup = []
        playerIDs = list(data.iloc[:,2])
        playerNames = list(data.iloc[:,8])
        
        for i in range(9):
            print(f'{i}번 타자')
            playerID = playerIDs[i]
            name = playerNames[i]
            first = y_predict[i][0]
            second = y_predict[i][1]
            third = y_predict[i][2]
            double = y_predict[i][3]
            bb = y_predict[i][4]
            outs = y_predict[i][5]
            homerun = y_predict[i][6]
            lineup.append(Player(playerID, name, first, second, third, double, bb, outs, homerun))
            
        lineup_list.append(lineup)
        
    return lineup_list

In [7]:
import pandas as pd
import time
import numpy as np
import preprocessor
from pickle import load
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.offsetbox import AnchoredText
import math
import pybaseball
from bs4 import BeautifulSoup
import requests

# 마르코프 체인에서 상태 ID를 반환하는 함수
def getID(first, second, third, outs, inning):
    """
    주어진 파라미터로 상태 ID를 반환합니다.
    :param first: 1루에 주자가 있는지 여부 (0 또는 1)
    :param second: 2루에 주자가 있는지 여부 (0 또는 1)
    :param third: 3루에 주자가 있는지 여부 (0 또는 1)
    :param outs: 아웃 수 (0, 1, 2)
    :param inning: 이닝 수 (1-9)
    :returns: int. 주어진 파라미터로 계산된 상태 ID
    """
    return first + 2 * second + 4 * third + 8 * outs + 24 * (inning - 1)

# 마르코프 체인의 상태를 나타내는 클래스
class State:
    """
    마르코프 체인에서 상태를 나타내는 클래스입니다.
    """
    def __init__(self, stateID):
        self.id = stateID
        if stateID == 216:
            self.i = 9
            self.o = 3
            self.t = 0
            self.s = 0
            self.f = 0
        else:  
            self.i = (stateID // 24) + 1
            stateID -= (self.i - 1) * 24
            self.o = stateID // 8
            stateID -= self.o * 8
            self.t = stateID // 4
            stateID -= self.t * 4
            self.s = stateID // 2
            stateID -= self.s * 2
            self.f = stateID

    # 주자가 진루하는 상황들에 대한 함수들
    def walk(self):
        """
        타자가 걸어 나가는 상황
        :returns: (int, int). 새로운 상태 ID와 득점 수
        """
        if self.f == 1:
            if self.s == 1:
                if self.t == 1:
                    return (getID(1, 1, 1, self.o, self.i), 1)
                else:
                    return (getID(1, 1, 1, self.o, self.i), 0)
            else:
                return (getID(1, 1, self.t, self.o, self.i), 0)
        else:
            return (getID(1, self.s, self.t, self.o, self.i), 0)

    def single(self):
        """
        타자가 단타를 치는 상황
        :returns: (int, int). 새로운 상태 ID와 득점 수
        """
        return (getID(1, self.f, self.s, self.o, self.i), self.t)

    def double(self):
        """
        타자가 2루타를 치는 상황
        :returns: (int, int). 새로운 상태 ID와 득점 수
        """
        return (getID(0, 1, self.f, self.o, self.i), self.s + self.t)

    def triple(self):
        """
        타자가 3루타를 치는 상황
        :returns: (int, int). 새로운 상태 ID와 득점 수
        """
        return (getID(0, 0, 1, self.o, self.i), self.f + self.s + self.t)

    def homeRun(self):
        """
        타자가 홈런을 치는 상황
        :returns: (int, int). 새로운 상태 ID와 득점 수
        """
        return (getID(0, 0, 0, self.o, self.i), 1 + self.f + self.s + self.t)

    def out(self):
        """
        타자가 아웃되는 상황
        :returns: (int, int). 새로운 상태 ID와 득점 수
        """
        if self.o == 2:
            return (getID(0, 0, 0, 0, self.i + 1), 0)
        else:
            return (getID(self.f, self.s, self.t, self.o + 1, self.i), 0)
    
    def doublePlay(self):
        """
        타자가 병살타를 치는 상황
        :returns: (int, int). 새로운 상태 ID와 득점 수
        """
        if self.o >= 1:
            return (getID(0, 0, 0, 0, self.i + 1), 0)
        else:
            return (getID(self.f, self.s, self.t, self.o + 2, self.i), 0)

# 야구 선수 정보를 나타내는 클래스
class Player:
    """
    야구 선수를 나타내는 클래스입니다.
    """
    def __init__(self, playerID, name, first, second, third, bb, homerun, outs, double):
        """
        :param playerID: int. 선수의 고유 ID.
        :param name: string. 선수의 이름.
        :param first: float. 단타 확률.
        :param second: float. 2루타 확률.
        :param third: float. 3루타 확률.
        :param bb: float. 볼넷 확률.
        :param homerun: float. 홈런 확률.
        :param outs: float. 아웃 확률.
        :param double: float. 병살 확률.
        """
        self.id = playerID
        self.name = name
        self.first = first
        self.second = second
        self.third = third
        self.double = double
        self.bb = bb
        self.outs = outs
        self.homerun = homerun

    def transitionMatrixSimple(self):
        """
        이 선수에 대한 전이 행렬을 계산합니다.
        :return: numpy (217, 217) 배열. 이 선수의 전이 행렬.
        """
        p = np.zeros((5, 217, 217))
        p[0][216][216] = 1

        for i in range(216):
            currState = State(i)
            nextState, runs = currState.walk()
            p[runs][i][nextState] += self.bb
            nextState, runs = currState.single()
            p[runs][i][nextState] += self.first
            nextState, runs = currState.double()
            p[runs][i][nextState] += self.second
            nextState, runs = currState.triple()
            p[runs][i][nextState] += self.third
            nextState, runs = currState.homeRun()
            p[runs][i][nextState] += self.homerun
            nextState, runs = currState.out()
            p[runs][i][nextState] += self.outs
            nextState, runs = currState.doublePlay()
            p[runs][i][nextState] += self.double
        return p
    
def expectedRuns(lineup):
    """
    주어진 야구 라인업의 예상 득점 분포를 계산합니다.
    :param lineup: [Batter]. 라인업에 포함된 9명의 타자 리스트.
    :return: np.array. 21개의 요소를 포함하는 배열. i번째 요소는 라인업이 i 득점할 확률을 나타냅니다.
    """
    transitionsMatrices = list(map(lambda Batter: Batter.transitionMatrixSimple(), lineup))
    return simulateMarkovChain(transitionsMatrices)[:, 216]

def simulateMarkovChain(transitionMatrices):
    """
    야구 게임을 나타내는 마르코프 체인의 거의 정적 상태 분포를 찾습니다.
    :param transitionMatrices: [numpy array]. 라인업에 포함된 타자에 대한 9개의 (217x217) 전이 행렬 리스트.
    :return: numpy 21x217 배열. 배열의 i번째 행은 i 득점이 된 상태를 나타냅니다.
    """
    u = np.zeros((21, 217))
    u[0][0] = 1
    iterations = 0
    batter = 0
    while sum(u)[216] < 0.999 and iterations < 2000:
        p = transitionMatrices[batter]
        next_u = np.zeros((21, 217))
        for i in range(21):
            for j in range(5):
                if i - j >= 0:
                    next_u[i] += u[i-j] @ p[j]
        u = next_u
        batter = (batter + 1) % 9 
        iterations += 1
    return u

def teamExpectedRuns(teamName, opponent_team_name, starter_lineup_list, relief_lineup_list, starter_data, starter_name, starter_num):
    """
    주어진 팀의 예상 득점을 계산하고 결과를 출력합니다.
    :param teamName: 팀 이름.
    :param starter_lineup_list: 선발 투수 라인업 리스트.
    :param relief_lineup_list: 구원 투수 라인업 리스트.
    :param starter_data: 선발 투수 데이터.
    :param starter_num: 선발 투수 번호.
    :param opponent_name: 상대팀 이름
    """
    print('\n팀: ' + teamName + '\n')
    print('상대팀: ' + opponent_team_name + '\n')
    print('상대 선발 투수: ' + starter_name + '\n')
    print('라인업: ' + str(list(map(lambda Batter: Batter.name, starter_lineup_list[0]))) + '\n')
    
    starter_num = list(pybaseball.playerid_reverse_lookup([starter_num], key_type='mlbam')['key_fangraphs'])[0]
    try:
        inning = starter_data[starter_data['IDfg'] == starter_num].loc[:, 'Start-IP']
        game_started = starter_data[starter_data['IDfg'] == starter_num].loc[:, 'GS']
        avg_inning = float(inning/game_started)
        if avg_inning < 5.0:
            avg_inning = 5.0
    except:
        avg_inning = 5.0
    
    # 선발 투수 득점 계산
    u = expectedRuns(starter_lineup_list[0])
    starter_expRuns = 0
    if sum(u) < 0.7:
        print('게임 종료 확률이 낮아 예상 실점을 계산할 수 없습니다.')
        u = (1/sum(u))*u
        
        for i in range(21):
            starter_expRuns += i * u[i]
        
        avg_inning = 9 * (4/starter_expRuns)
    else:
        for i in range(21):
            starter_expRuns += i * u[i]
        if (avg_inning / 9) * starter_expRuns > 4:
            avg_inning = 9 * (4/starter_expRuns)
        
    # 불펜 투수 득점 계산
    

    relief_exp_runs_list = []
    for relief_lineup in relief_lineup_list:
        exp = expectedRemainingRuns(relief_lineup, 0, State(getID(0, 0, 0, 0, 1)))
        relief_exp_runs_list.append(exp)
        
    relief_expRuns = sum(relief_exp_runs_list) / len(relief_exp_runs_list)
    total_expRuns = (avg_inning / 9) * starter_expRuns + ((9 - avg_inning) / 9) * relief_expRuns

    # 그래프 생성
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(range(len(u)), u, color='blue')
    ax.set_xlabel('Runs Scored')
    ax.set_ylabel('Probability')
    ax.set_title(f'{teamName}의 선발 상대 예상 득점 분포')
    ax.legend()

    # 예상 득점 텍스트 추가
    anchored_text = AnchoredText(
        f'상대 팀: {opponent_team_name}\nStarter Expected Runs: {(avg_inning / 9) * starter_expRuns:.2f}\nRelief Expected Runs: {((9 - avg_inning) / 9) * relief_expRuns:.2f}\nTotal Expected Runs: {total_expRuns:.2f}',
        loc='upper right', prop=dict(size=10)
    )
    ax.add_artist(anchored_text)
    
    plt.savefig(f'img/{teamName}.png')

    print('게임 종료 확률: ' + str(sum(u)) + '\n')
    print('\n선발 투수 평균 이닝 수: ' + str(avg_inning) + '\n')
    print('선발 투수에 의한 예상 실점: ' + str((avg_inning / 9) * starter_expRuns) + '\n')
    print('선발 투수에 의한 예상 실점(9이닝 당): ' + str(starter_expRuns) + '\n')

    print('각 점수에 대한 확률:')
    for i in range(21):
        print(str(i) + ': ' + str(u[i]))
    
    print('\n구원 투수 예상 실점: ' + str(((9 - avg_inning) / 9) * relief_expRuns) + '\n')

    print('\n총 예상 실점:' + str(total_expRuns) + '\n')
    
    return total_expRuns


def expectedRemainingRuns(lineup, batterUp, startState):
    """
    게임의 특정 지점에서 팀이 득점할 예상 점수를 계산합니다.
    :param lineup: 9명의 타자 리스트.
    :param batterUp: 타석에 있는 타자의 인덱스 (0-8).
    :param startState: 현재 게임의 상태.
    :return: 주어진 상태에서 팀이 득점할 예상 점수.
    """
    transitionsMatrices = list(map(lambda Batter: Batter.transitionMatrixSimple(), lineup))
    u = np.zeros((21, 217))
    u[0][startState.id] = 1
    iterations = 0
    batter = batterUp
    while sum(u)[216] < 0.999 and iterations < 2000:
        p = transitionsMatrices[batter]
        next_u = np.zeros((21, 217))
        for i in range(21):
            for j in range(5):
                if i - j >= 0:
                    next_u[i] += u[i-j] @ p[j]
        u = next_u
        batter = (batter + 1) % 9 
        iterations += 1
    u = u[:, 216]
    expRuns = 0
    if sum(u) < 0.7:
        # u = (1/sum(u))*u
        expRuns = 9
    else:
        for i in range(21):
            expRuns += i * u[i]
        if expRuns > 9:
            expRuns = 9
    return expRuns

In [21]:
import pandas as pd
import pybaseball

year = 2024
j = 0
for match in match_data:
    print('매치 분류 시작')
    lineup_data = match.text.split('\n')
    
    home_team = lineup_data[6].split(' ')[-2]
    away_team = lineup_data[3].split(' ')[-2]
    
    home_start_pitcher = start_pitcher_data[4*j + 1].text
    away_start_pitcher = start_pitcher_data[4*j + 3].text
    
    home_start_pitcher_num = int(list(start_pitcher_data[4*j + 3].attrs.values())[1][-6:])
    away_start_pitcher_num = int(list(start_pitcher_data[4*j + 1].attrs.values())[1][-6:])
    
    home_start_pitcher_split = preprocessor.split_to_num(pitcher_split_data[2*j + 1].text.replace(' ','')[1])
    away_start_pitcher_split = preprocessor.split_to_num(pitcher_split_data[2*j].text.replace(' ','')[1])
    
    if home_start_pitcher_num in list(pitch_recode['pitcher_key_mlbam']):
        home_start_pitcher_data = pitch_recode[pitch_recode['pitcher_key_mlbam'] == home_start_pitcher_num]
    else:
        print('해당 투수 정보 없음')
        # 모든 열의 평균 계산
        numeric_means = pitch_recode.select_dtypes(include='number').mean()

        # 비숫자형 열의 첫 번째 값 가져오기
        non_numeric_data = pitch_recode.select_dtypes(exclude='number').iloc[0]

        # 평균값과 비숫자형 데이터를 결합
        mean_data = pd.concat([numeric_means, non_numeric_data])

        # 평균값을 데이터프레임으로 변환
        home_start_pitcher_data = pd.DataFrame(mean_data).transpose()
        home_start_pitcher_data['pitcher_key_mlbam'] = home_start_pitcher_num
        home_start_pitcher_data['(P) Name'] = home_start_pitcher
        home_start_pitcher_data['(P) pitch_split'] = home_start_pitcher_split
    
    if away_start_pitcher_num in list(pitch_recode['pitcher_key_mlbam']):
        away_start_pitcher_data = pitch_recode[pitch_recode['pitcher_key_mlbam'] == away_start_pitcher_num]
    else:
        print('해당 투수 정보 없음')
        # 모든 열의 평균 계산
        numeric_means = pitch_recode.select_dtypes(include='number').mean()

        # 비숫자형 열의 첫 번째 값 가져오기
        non_numeric_data = pitch_recode.select_dtypes(exclude='number').iloc[0]

        # 평균값과 비숫자형 데이터를 결합
        mean_data = pd.concat([numeric_means, non_numeric_data])

        # 평균값을 데이터프레임으로 변환
        away_start_pitcher_data = pd.DataFrame(mean_data).transpose()
        away_start_pitcher_data['pitcher_key_mlbam'] = away_start_pitcher_num
        away_start_pitcher_data['(P) Name'] = away_start_pitcher
        away_start_pitcher_data['(P) pitch_split'] = away_start_pitcher_split
    
    
    home_batters_data = pd.DataFrame()
    away_batters_data = pd.DataFrame()
    
    for i in range(9):
        home_batter = lineup_data[21 + i].split(' (')[0]
        home_position = lineup_data[21 + i].split(' (')[1].split(' ')[1]
        home_split = preprocessor.split_to_num(lineup_data[21 + i].split(' (')[1].split(' ')[0][0])
        print(home_split)
        home_batter_num = int(list(match.find_all(attrs={'class':'starting-lineups__player--link'})[i+9].attrs.values())[1][-6:])
        
        if home_batter_num in list(bat_recode['batter_key_mlbam']):
            batter_data = bat_recode[bat_recode['batter_key_mlbam'] == home_batter_num]
            home_batters_data = pd.concat([home_batters_data, batter_data], ignore_index=True)
                        
        else:
            # 모든 열의 평균 계산
            numeric_means = bat_recode.select_dtypes(include='number').mean()

            # 비숫자형 열의 첫 번째 값 가져오기
            non_numeric_data = bat_recode.select_dtypes(exclude='number').iloc[0]

            # 평균값과 비숫자형 데이터를 결합
            mean_data = pd.concat([numeric_means, non_numeric_data])

            # 평균값을 데이터프레임으로 변환
            batter_data = pd.DataFrame(mean_data).transpose()
            batter_data['batter_key_mlbam'] = home_batter_num
            batter_data['Name'] = home_batter
            batter_data['bat_split'] = home_split
            home_batters_data = pd.concat([home_batters_data, batter_data], ignore_index=True)

        away_batter = lineup_data[10 + i].split(' (')[0]
        away_position = lineup_data[10 + i].split(' (')[1].split(' ')[1]
        away_split = preprocessor.split_to_num(lineup_data[10 + i].split(' (')[1].split(' ')[0][0])
        print(away_split)
        away_batter_num = (list(match.find_all(attrs={'class':'starting-lineups__player--link'})[i].attrs.values())[1][-6:])
        
        if away_batter_num in list(bat_recode['batter_key_mlbam']):
            batter_data = bat_recode[bat_recode['batter_key_mlbam'] == away_batter_num]
            away_batters_data = pd.concat([away_batters_data, batter_data], ignore_index=True)
                        
        else:
            # 모든 열의 평균 계산
            numeric_means = bat_recode.select_dtypes(include='number').mean()

            # 비숫자형 열의 첫 번째 값 가져오기
            non_numeric_data = bat_recode.select_dtypes(exclude='number').iloc[0]

            # 평균값과 비숫자형 데이터를 결합
            mean_data = pd.concat([numeric_means, non_numeric_data])

            # 평균값을 데이터프레임으로 변환
            batter_data = pd.DataFrame(mean_data).transpose()
            batter_data['batter_key_mlbam'] = away_batter_num
            batter_data['Name'] = away_batter
            batter_data['bat_split'] = away_split  
            away_batters_data = pd.concat([away_batters_data, batter_data], ignore_index=True)
            
    # 불펜 투수 계산
            
    home_relief_data = pd.DataFrame()
    away_relief_data = pd.DataFrame()
            
    starter_data = pybaseball.pitching_stats(year, qual=30)
    starter_data = starter_data[starter_data['GS'] / starter_data['G'] > 0.5]
    start_pitcher_mlb_nums = list(starter_data['IDfg'])
    start_pitcher_mlb_nums = list(pybaseball.playerid_reverse_lookup(start_pitcher_mlb_nums, key_type='fangraphs')['key_mlbam'])
            
    home_depth_num = depth_num(home_team)
    away_depth_num = depth_num(away_team)
            
    for pitcher_num in home_depth_num:
        try:
            if (pitcher_num in list(pitch_recode['pitcher_key_mlbam'])) and (pitcher_num not in start_pitcher_mlb_nums):
                pitcher_data = pitch_recode[pitch_recode['pitcher_key_mlbam'] == pitcher_num]
                home_relief_data = pd.concat([home_relief_data, pitcher_data], ignore_index=True)
        except:
            continue
                    
    for pitcher_num in away_depth_num:
        try:
            if (pitcher_num in list(pitch_recode['pitcher_key_mlbam'])) and (pitcher_num not in start_pitcher_mlb_nums):
                pitcher_data = pitch_recode[pitch_recode['pitcher_key_mlbam'] == pitcher_num]
                away_relief_data = pd.concat([away_relief_data, pitcher_data], ignore_index=True)
        except:
            continue
            
    
            
    print('원정 선발 - 홈 타자')            
    home_batter_away_starter_list = pitcher_batter_aug(away_start_pitcher_data, home_batters_data, away_fielding_recode, home_team, away_team)
    print('원정 구원 - 홈 타자')  
    home_batter_away_relief_list = pitcher_batter_aug(away_relief_data, home_batters_data, away_fielding_recode, home_team, away_team)
    print('홈 선발 - 원정 타자')  
    away_batter_home_starter_list = pitcher_batter_aug(home_start_pitcher_data, away_batters_data, home_fielding_recode, home_team, away_team)
    print('홈 구원 - 원정 타자')  
    away_batter_home_relief_list = pitcher_batter_aug(home_relief_data, away_batters_data, home_fielding_recode, home_team, away_team)
    
    a = home_batter_away_starter_list
    
    b = away_batter_home_starter_list
    
    j += 1

매치 분류 시작
해당 투수 정보 없음
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.5
1.0
1.0
0.0
0.0
0.0
1.0
원정 선발 - 홈 타자
파크팩터 추가
원정 구원 - 홈 타자
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
홈 선발 - 원정 타자
파크팩터 추가
홈 구원 - 원정 타자
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
매치 분류 시작
0.0
0.0
1.0
0.5
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
원정 선발 - 홈 타자
파크팩터 추가
원정 구원 - 홈 타자
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
홈 선발 - 원정 타자
파크팩터 추가
홈 구원 - 원정 타자
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
매치 분류 시작
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.5
1.0
1.0
원정 선발 - 홈 타자
파크팩터 추가
원정 구원 - 홈 타자
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
홈 선발 - 원정 타자
파크팩터 추가
홈 구원 - 원정 타자
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
매치 분류 시작
해당 투수 정보 없음
1.0
1.0
0.0
0.0
0.0
0.5
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
원정 선발 - 홈 타자
파크팩터 추가
원정 구원 - 홈 타자
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추가
파크팩터 추

In [22]:
b[0]

Unnamed: 0,home_team,away_team,batter_key_mlbam,pitcher_key_mlbam,IDfg,(P) IDfg,key_bbref,(P) key_bbref,Name,(P) Name,...,ARM/G,DPR/G,RngR/G,ErrR/G,UZR/G,Def/G,FRM/G,OAA/G,Range/G,park_factor
0,SEA,MIN,650489,682243,18695.072687,29837,hendegu01,millebr04,W Castro,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
1,SEA,MIN,663616,682243,18695.072687,29837,hendegu01,millebr04,T Larnach,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
2,SEA,MIN,621043,682243,18695.072687,29837,hendegu01,millebr04,C Correa,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
3,SEA,MIN,467793,682243,18695.072687,29837,hendegu01,millebr04,C Santana,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
4,SEA,MIN,596146,682243,18695.072687,29837,hendegu01,millebr04,M Kepler,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
5,SEA,MIN,669304,682243,18695.072687,29837,hendegu01,millebr04,J Miranda,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
6,SEA,MIN,621439,682243,18695.072687,29837,hendegu01,millebr04,B Buxton,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
7,SEA,MIN,680777,682243,18695.072687,29837,hendegu01,millebr04,R Jeffers,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92
8,SEA,MIN,668885,682243,18695.072687,29837,hendegu01,millebr04,A Martin,Bryce Miller,...,-0.001592,0.0,0.002946,-0.005494,-0.00414,-0.003025,0.002787,-0.001592,-0.001592,92


In [24]:
x = b[0].iloc[:,12:].values

In [25]:
load_scaler = load(open('scaler.pkl', 'rb'))
x = load_scaler.transform(x)

In [26]:
x

array([[ 0.32568718,  0.12134682, -0.13253529, ..., -0.12689579,
        -0.16945801, -2.38239077],
       [ 1.59547682,  0.12134682, -0.13253529, ..., -0.12689579,
        -0.16945801, -2.38239077],
       [-0.94410246,  0.12134682, -0.13253529, ..., -0.12689579,
        -0.16945801, -2.38239077],
       ...,
       [-0.94410246,  0.12134682, -0.13253529, ..., -0.12689579,
        -0.16945801, -2.38239077],
       [-0.94410246,  0.12134682, -0.13253529, ..., -0.12689579,
        -0.16945801, -2.38239077],
       [-0.94410246,  0.12134682, -0.13253529, ..., -0.12689579,
        -0.16945801, -2.38239077]])

In [52]:
model_path = 'models/mlb_model.93.hdf5'
model = tf.keras.models.load_model(model_path) 

y_predict = model.predict(x)



In [53]:
y_predict

array([[0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0.]], dtype=float32)

In [20]:
bat_recode[bat_recode['batter_key_mlbam'] == 650489]

Unnamed: 0,IDfg,batter_key_mlbam,key_bbref,Name,Team,bat_split,GB/FB,LD%,GB%,FB%,...,Zone% (pi),Pace,UBR,anglesweetspotpercent,ev50,fbld,max_distance,avg_distance,avg_hr_distance,ev95percent
21,17338,650489,castrwi01,Willi Castro,MIN,1.0,0.0126,0.231,0.428,0.341,...,0.493,17.7,0.6,37.3,99.4,91.0,432,170,404.0,39.9


In [52]:
lineup_data = match_data[1].text.split('\n')

In [53]:
lineup_data

['',
 '',
 '',
 '                                                            CIN Lineup',
 '                                                        ',
 '',
 '                                                            STL Lineup',
 '                                                        ',
 '',
 '',
 'J India (R) 2B',
 'E De La Cruz (S) SS',
 'S Steer (R) 1B',
 'J Fraley (L) DH',
 'N Marte (R) 3B',
 'N Martini (L) LF',
 'S Fairchild (R) CF',
 'W Benson (L) RF',
 'A Wynns (R) C',
 '',
 '',
 'M Winn (R) SS',
 'A Burleson (L) RF',
 'W Contreras (R) C',
 'P Goldschmidt (R) 1B',
 'B Donovan (L) LF',
 'N Arenado (R) 3B',
 'N Gorman (L) 2B',
 'M Carpenter (L) DH',
 'M Siani (L) CF',
 '',
 '']

In [54]:
list(match_data[1].find_all(attrs={'class':'starting-lineups__player--link'})[0].attrs.values())[1]

'/player/jonathan-india-663697'

In [55]:
# away team 1번 타자 mlb num

list(match_data[1].find_all(attrs={'class':'starting-lineups__player--link'})[0].attrs.values())[1][-6:]

'663697'

In [56]:
# away team name

lineup_data[3].split(' ')[-2]

'CIN'

In [57]:
# home team name

lineup_data[6].split(' ')[-2]

'STL'

In [58]:
# away team 2번 타자 mlb num

list(match_data[1].find_all(attrs={'class':'starting-lineups__player--link'})[1].attrs.values())[1][-6:]

'682829'

In [59]:
# home team 2번 타자 mlb num

list(match_data[1].find_all(attrs={'class':'starting-lineups__player--link'})[10].attrs.values())[1][-6:]

'676475'

In [60]:
# away team 1번 타자 name

lineup_data[12].split(' (')[0]

'S Steer'

In [61]:
# away team 2번 타자 name

lineup_data[13].split(' (')[0]

'J Fraley'

In [62]:
# away team 1번 타자 position

lineup_data[12].split(' (')[1].split(' ')[1]

'1B'

In [63]:
# away team 2번 타자 position

lineup_data[13].split(' (')[1].split(' ')[1]

'DH'

In [83]:
pitcher_data = dom.find_all(attrs={'class':'starting-lineups__pitcher--link'})

In [84]:
pitcher_data

[<a class="starting-lineups__pitcher--link" href="/player/cal-quantrill-615698">
 <img onerror="this.onerror=null;this.src='https://content.mlb.com/images/headshots/current/60x60/generic@3x.png';" src="https://content.mlb.com/images/headshots/current/60x60/615698@3x.png">
 </img></a>,
 <a class="starting-lineups__pitcher--link" href="/player/cal-quantrill-615698">Cal Quantrill</a>,
 <a class="starting-lineups__pitcher--link" href="/player/jonathan-cannon-686563">
 <img onerror="this.onerror=null;this.src='https://content.mlb.com/images/headshots/current/60x60/generic@3x.png';" src="https://content.mlb.com/images/headshots/current/60x60/686563@3x.png">
 </img></a>,
 <a class="starting-lineups__pitcher--link" href="/player/jonathan-cannon-686563">Jonathan Cannon</a>,
 <a class="starting-lineups__pitcher--link" href="/player/carson-spiers-686730">
 <img onerror="this.onerror=null;this.src='https://content.mlb.com/images/headshots/current/60x60/generic@3x.png';" src="https://content.mlb.co

In [92]:
pitcher_data[1].text

'Cal Quantrill'

In [67]:
list(pitcher_data[1].attrs.values())[1][-6:]

'615698'

In [68]:
pitcher_data[3].text

'Jonathan Cannon'

In [69]:
list(pitcher_data[3].attrs.values())[1][-6:]

'686563'

In [70]:
pitcher_split_data = dom.find_all(attrs={'class':'starting-lineups__pitcher-pitch-hand'})

In [90]:
pitcher_split_data[0].text.replace(' ','')[1]

'R'

In [13]:
url = 'https://www.mlb.com/whitesox/roster'

response = requests.get(url)
dom = BeautifulSoup(response.content, 'html.parser')

In [14]:
dom

<!DOCTYPE html>

<html lang="en">
<head>
<title>Active Roster | Chicago White Sox</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="The Official Site of Major League Baseball" name="description"/>
<meta content="The Official Site of Major League Baseball" property="og:description"/>
<meta content="Active Roster" property="og:title"/>
<meta content="https://www.mlbstatic.com/team-logos/share/145.jpg" property="og:image"/>
<meta content="MLB.com" property="og:site_name"/>
<meta content="website" property="og:type"/>
<meta content="https://www.mlb.com/whitesox/roster" property="og:url"/>
<script src="/cdn-cgi/apps/head/Bvi3NCUQlon5FbcqTnDxNcVe5Fs.js"></script><link href="https://www.mlb.com/whitesox/roster" rel="canonical"/>
<link href="https://www.mlbstatic.com/" rel="preconnect"/>
<link href="https://builds.mlbstatic.com/" rel="

In [38]:
depth_data = dom.find_all(attrs={'class':'info'})

In [39]:
depth_data

[<td class="info">
 <a href="/player/605121">Justin Anderson</a>
 <span class="jersey">60</span>
 <div class="mobile-info">
 <span class="mobile-info__bat-throw">B/T: L/R</span>
 <span class="mobile-info__height">Ht: 6' 3"</span>
 <span class="mobile-info__weight">Wt: 230</span>
 <span class="mobile-info__birthday">DOB: 09/28/1992</span>
 </div>
 </td>,
 <td class="info">
 <a href="/player/621383">Tanner Banks</a>
 <span class="jersey">57</span>
 <div class="mobile-info">
 <span class="mobile-info__bat-throw">B/T: R/L</span>
 <span class="mobile-info__height">Ht: 6' 1"</span>
 <span class="mobile-info__weight">Wt: 205</span>
 <span class="mobile-info__birthday">DOB: 10/24/1991</span>
 </div>
 </td>,
 <td class="info">
 <a href="/player/605154">John Brebbia</a>
 <span class="jersey">59</span>
 <div class="mobile-info">
 <span class="mobile-info__bat-throw">B/T: L/R</span>
 <span class="mobile-info__height">Ht: 6' 1"</span>
 <span class="mobile-info__weight">Wt: 205</span>
 <span class="

In [41]:
len(depth_data)

26

In [40]:
list(list(depth_data[0])[1].attrs.values())[0][-6:]

'605121'

In [43]:
player_num_list = []
for i in range(len(depth_data)):
    player_num = list(list(depth_data[i])[1].attrs.values())[0][-6:]
    player_num_list.append(player_num)
player_num_list

['605121',
 '621383',
 '605154',
 '686563',
 '676979',
 '607200',
 '623167',
 '656629',
 '641771',
 '694363',
 '647336',
 '689672',
 '621051',
 '686676',
 '455117',
 '657557',
 '670032',
 '664901',
 '672820',
 '683734',
 '643217',
 '667452',
 '502054',
 '673357',
 '657757',
 '650391']

In [95]:
def depth_num(team_short_name):
    team_long_names = {'COL':'rockies', 'BOS':'redsox', 'KC':'royals', 'CIN':'reds', 'TEX':'rangers', 'WSH':'nationals', 
                        'LAA':'angels', 'STL':'cardinals', 'HOU':'astros', 'ATL':'braves', 'PHI':'phillies', 'MIN':'twins', 
                        'TOR':'bluejays', 'AZ':'dbacks', 'CHC':'cubs', 'PIT':'pirates', 'MIA':'marlins', 'CWS':'whitesox',
                        'LAD':'dodgers', 'MIL':'brewers', 'NYY':'yankees', 'BAL':'orioles', 'DET':'tigers', 'OAK':'athletics', 
                        'TB':'rays', 'CLE':'guardians', 'SF':'giants', 'SD':'padres', 'NYM':'mets', 'SEA':'mariners'}
    
    team_name = team_long_names[team_short_name]
    
    url = f'https://www.mlb.com/{team_name}/roster'

    response = requests.get(url)
    dom = BeautifulSoup(response.content, 'html.parser')
    
    depth_data = dom.find_all(attrs={'class':'info'})
    player_num_list = []
    for i in range(len(depth_data)):
        player_num = list(list(depth_data[i])[1].attrs.values())[0][-6:]
        player_num = int(player_num)
        player_num_list.append(player_num)
    
    return player_num_list

In [96]:
depth_num('CIN')

[671096,
 668933,
 518585,
 664747,
 571656,
 668881,
 607259,
 594580,
 593423,
 608371,
 686730,
 608718,
 458677,
 571912,
 642851,
 600869,
 682829,
 669289,
 663697,
 682119,
 682622,
 666181,
 656413,
 641584,
 605361,
 668715]

In [105]:
import pybaseball

starter_data = pybaseball.pitching_stats(2024, qual=30)
starter_data = starter_data[starter_data['GS'] / starter_data['G'] > 0.5]
starter_data['IDfg']

51     19879
78     27463
111    21846
16     17277
62     10603
       ...  
281    19374
275    27589
291    26285
245    18000
270    25311
Name: IDfg, Length: 154, dtype: int64

In [None]:
start_pitcher_mlb_nums = [21846,19879]
list(pybaseball.playerid_reverse_lookup(start_pitcher_mlb_nums, key_type='fangraphs')['key_mlbam'])

In [None]:
def pitcher_batter_aug(pitcher_data, batter_data, home_team):
    """
    투수와 타자 데이터를 결합하여 필요한 데이터를 생성합니다.
    :param pitcher_data: 투수 데이터.
    :param batter_data: 타자 데이터.
    :param home_team: 홈팀 경기장 정보.
    :return: 결합된 데이터 리스트.
    """
    batter_pitcher_list = []
    for pitcher_num in list(pitcher_data['pitcher_key_mlbam']):
        batter_pitcher = pd.DataFrame()
        for player_num in list(batter_data['batter_key_mlbam']):
            row = pd.DataFrame({
                'pitcher_key_mlbam': [pitcher_num],
                'batter_key_mlbam': [player_num],
            })
            batter_pitcher = pd.concat([batter_pitcher, row], ignore_index=True)
        batter_pitcher = pd.merge(batter_pitcher,pitcher_data)
        batter_pitcher = pd.merge(batter_pitcher,batter_data)
        park_factors = {'COL':112, 'BOS':107, 'KC':105, 'CIN':104, 'TEX':102, 'WSH':102, 'LAA':101, 'STL':101, 'HOU':101,
                    'ATL':101, 'PHI':101, 'MIN':101, 'TOR':100, 'AZ':100, 'CHC':100, 'PIT':100, 'MIA':100, 'CWS':99, 
                    'LAD':99, 'MIL':99, 'NYY':99, 'BAL':98, 'DET':98, 'OAK':97, 'TB':97, 'CLE':96, 'SF':96, 'SD':96, 
                    'NYM':95, 'SEA':92}
        batter_pitcher['park_factor'] = park_factors[home_team]
        
            # 필요한 열 선택
        columns_to_select = [
            'game_date', 'home_team', 'away_team', 'batter_key_mlbam', 'pitcher_key_mlbam', 'IDfg', '(P) IDfg', 'key_bbref',
            '(P) key_bbref', 'Name', '(P) Name', 'Team', '(P) Team', 'bat_split', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%',
            'HR/FB', 'Spd', 'BsR', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%',
            'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'Pull%', 'Cent%', 'Oppo%',
            'Soft%', 'Med%', 'Hard%', 'wFA/C (sc)', 'wFC/C (sc)', 'wFS/C (sc)', 'wFO/C (sc)', 'wSI/C (sc)', 'wSL/C (sc)',
            'wCU/C (sc)', 'wKC/C (sc)', 'wEP/C (sc)', 'wCH/C (sc)', 'wSC/C (sc)', 'wKN/C (sc)', 'O-Swing% (sc)',
            'Z-Swing% (sc)', 'Swing% (sc)', 'O-Contact% (sc)', 'Z-Contact% (sc)', 'Contact% (sc)', 'Zone% (sc)', 'LD+%',
            'GB%+', 'FB%+', 'HR/FB%+', 'Pull%+', 'Cent%+', 'Oppo%+', 'Soft%+', 'Med%+', 'Hard%+', 'EV', 'LA', 'Barrel%',
            'maxEV', 'HardHit%', 'CStr%', 'CSW%', 'wCH/C (pi)', 'wCS/C (pi)', 'wCU/C (pi)', 'wFA/C (pi)', 'wFC/C (pi)',
            'wFS/C (pi)', 'wKN/C (pi)', 'wSB/C (pi)', 'wSI/C (pi)', 'wSL/C (pi)', 'O-Swing% (pi)', 'Z-Swing% (pi)',
            'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)', 'Zone% (pi)', 'Pace', 'UBR',
            'anglesweetspotpercent', 'ev50', 'fbld', 'max_distance', 'avg_distance', 'avg_hr_distance', 'ev95percent',
            '(P) pitch_split', '(P) GB/FB', '(P) LD%', '(P) GB%', '(P) FB%', '(P) FB%.1', '(P) IFFB%', '(P) HR/FB',
            '(P) FB%.2', '(P) FB%.3', '(P) FBv', '(P) SL%', '(P) SLv', '(P) CT%', '(P) CTv', '(P) CB%', '(P) CBv',
            '(P) CH%', '(P) CHv', '(P) SF%', '(P) SFv', '(P) KN%', '(P) KNv', '(P) wFB/C', '(P) wSL/C', '(P) wCT/C',
            '(P) wCB/C', '(P) wCH/C', '(P) wSF/C', '(P) wKN/C', '(P) O-Swing%', '(P) Z-Swing%', '(P) Swing%',
            '(P) O-Contact%', '(P) Z-Contact%', '(P) Contact%', '(P) Zone%', '(P) F-Strike%', '(P) SwStr%', '(P) FA% (sc)',
            '(P) FC% (sc)', '(P) FS% (sc)', '(P) FO% (sc)', '(P) SI% (sc)', '(P) SL% (sc)', '(P) CU% (sc)', '(P) KC% (sc)',
            '(P) EP% (sc)', '(P) CH% (sc)', '(P) SC% (sc)', '(P) KN% (sc)', '(P) vFA (sc)', '(P) vFC (sc)', '(P) vFS (sc)',
            '(P) vFO (sc)', '(P) vSI (sc)', '(P) vSL (sc)', '(P) vCU (sc)', '(P) vKC (sc)', '(P) vEP (sc)', '(P) vCH (sc)',
            '(P) vSC (sc)', '(P) vKN (sc)', '(P) FA-X (sc)', '(P) FC-X (sc)', '(P) FS-X (sc)', '(P) FO-X (sc)',
            '(P) SI-X (sc)', '(P) SL-X (sc)', '(P) CU-X (sc)', '(P) KC-X (sc)', '(P) EP-X (sc)', '(P) CH-X (sc)',
            '(P) SC-X (sc)', '(P) KN-X (sc)', '(P) FA-Z (sc)', '(P) FC-Z (sc)', '(P) FS-Z (sc)', '(P) FO-Z (sc)',
            '(P) SI-Z (sc)', '(P) SL-Z (sc)', '(P) CU-Z (sc)', '(P) KC-Z (sc)', '(P) EP-Z (sc)', '(P) CH-Z (sc)',
            '(P) SC-Z (sc)', '(P) KN-Z (sc)', '(P) wFA/C (sc)', '(P) wFC/C (sc)', '(P) wFS/C (sc)', '(P) wFO/C (sc)',
            '(P) wSI/C (sc)', '(P) wSL/C (sc)', '(P) wCU/C (sc)', '(P) wKC/C (sc)', '(P) wEP/C (sc)', '(P) wCH/C (sc)',
            '(P) wSC/C (sc)', '(P) wKN/C (sc)', '(P) O-Swing% (sc)', '(P) Z-Swing% (sc)', '(P) Swing% (sc)',
            '(P) O-Contact% (sc)', '(P) Z-Contact% (sc)', '(P) Contact% (sc)', '(P) Zone% (sc)', '(P) LD%+',
            '(P) GB%+', '(P) FB%+', '(P) HR/FB%+', '(P) Pull%+', '(P) Cent%+', '(P) Oppo%+', '(P) Soft%+',
            '(P) Med%+', '(P) Hard%+', '(P) EV', '(P) LA', '(P) Barrel%', '(P) maxEV', '(P) HardHit%', '(P) CStr%',
            '(P) CSW%', '(P) botOvr CH', '(P) botStf CH', '(P) botCmd CH', '(P) botOvr CU', '(P) botStf CU',
            '(P) botCmd CU', '(P) botOvr FA', '(P) botStf FA', '(P) botCmd FA', '(P) botOvr SI', '(P) botStf SI',
            '(P) botCmd SI', '(P) botOvr SL', '(P) botStf SL', '(P) botCmd SL', '(P) botOvr KC', '(P) botStf KC',
            '(P) botCmd KC', '(P) botOvr FC', '(P) botStf FC', '(P) botCmd FC', '(P) botOvr FS', '(P) botStf FS',
            '(P) botCmd FS', '(P) botOvr', '(P) botStf', '(P) botCmd', '(P) botxRV100', '(P) Stf+ CH', '(P) Loc+ CH',
            '(P) Pit+ CH', '(P) Stf+ CU', '(P) Loc+ CU', '(P) Pit+ CU', '(P) Stf+ FA', '(P) Loc+ FA', '(P) Pit+ FA',
            '(P) Stf+ SI', '(P) Loc+ SI', '(P) Pit+ SI', '(P) Stf+ SL', '(P) Loc+ SL', '(P) Pit+ SL', '(P) Stf+ KC',
            '(P) Loc+ KC', '(P) Pit+ KC', '(P) Stf+ FC', '(P) Loc+ FC', '(P) Pit+ FC', '(P) Stf+ FS', '(P) Loc+ FS',
            '(P) Pit+ FS', '(P) Stf+ FO', '(P) Loc+ FO', '(P) Pit+ FO', '(P) Stuff+', '(P) Location+', '(P) Pitching+',
            '(P) CH% (pi)', '(P) CS% (pi)', '(P) CU% (pi)', '(P) FA% (pi)', '(P) FC% (pi)', '(P) FS% (pi)', '(P) KN% (pi)',
            '(P) SB% (pi)', '(P) SI% (pi)', '(P) SL% (pi)', '(P) vCH (pi)', '(P) vCS (pi)', '(P) vCU (pi)', '(P) vFA (pi)',
            '(P) vFC (pi)', '(P) vFS (pi)', '(P) vKN (pi)', '(P) vSB (pi)', '(P) vSI (pi)', '(P) vSL (pi)', '(P) vXX (pi)',
            '(P) CH-X (pi)', '(P) CS-X (pi)', '(P) CU-X (pi)', '(P) FA-X (pi)', '(P) FC-X (pi)', '(P) FS-X (pi)',
            '(P) KN-X (pi)', '(P) SB-X (pi)', '(P) SI-X (pi)', '(P) SL-X (pi)', '(P) CH-Z (pi)', '(P) CS-Z (pi)',
            '(P) CU-Z (pi)', '(P) FA-Z (pi)', '(P) FC-Z (pi)', '(P) FS-Z (pi)', '(P) KN-Z (pi)', '(P) SB-Z (pi)',
            '(P) SI-Z (pi)', '(P) SL-Z (pi)', '(P) wCH/C (pi)', '(P) wCS/C (pi)', '(P) wCU/C (pi)', '(P) wFA/C (pi)',
            '(P) wFC/C (pi)', '(P) wFS/C (pi)', '(P) wKN/C (pi)', '(P) wSB/C (pi)', '(P) wSI/C (pi)', '(P) wSL/C (pi)',
            '(P) O-Swing% (pi)', '(P) Z-Swing% (pi)', '(P) Swing% (pi)', '(P) O-Contact% (pi)', '(P) Z-Contact% (pi)',
            '(P) Contact% (pi)', '(P) Zone% (pi)', '(P) Pace', '(P) anglesweetspotpercent', '(P) ev50', '(P) fbld',
            '(P) max_distance', '(P) avg_distance', '(P) avg_hr_distance', '(P) ev95percent', 'rSZ/G', 'rCERA/G', 'rTS/G',
            'rSB/G', 'rGDP/G', 'rARM/G', 'rGFP/G', 'rPM/G', 'DRS/G', 'ARM/G', 'DPR/G', 'RngR/G', 'ErrR/G', 'UZR/G',
            'Def/G', 'FRM/G', 'OAA/G', 'Range/G', '(Away) rSZ/G', '(Away) rCERA/G', '(Away) rTS/G', '(Away) rSB/G',
            '(Away) rGDP/G', '(Away) rARM/G', '(Away) rGFP/G', '(Away) rPM/G', '(Away) DRS/G', '(Away) ARM/G', '(Away) DPR/G',
            '(Away) RngR/G', '(Away) ErrR/G', '(Away) UZR/G', '(Away) Def/G', '(Away) FRM/G', '(Away) OAA/G', '(Away) Range/G',
            'park_factor'
            ]
        batter_pitcher = batter_pitcher[columns_to_select]
        batter_pitcher_list.append(batter_pitcher)
        
    return batter_pitcher_list

In [None]:
import pandas as pd
import time
import numpy as np
import preprocessor
from pickle import load
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.offsetbox import AnchoredText
import math
from bs4 import BeautifulSoup
import requests

In [None]:
def today_lineup(bat_recode, pitch_recode, year, date):
    """
    오늘의 경기 라인업을 가져오고 예상 득점을 계산합니다.
    :param bat_recode: 타자 기록 데이터.
    :param pitch_recode: 투수 기록 데이터.
    :param year: 년도.
    :param date: 날짜 (YYYY-MM-DD).
    """

    url = f'https://www.mlb.com/starting-lineups/{date}'

    response = requests.get(url)
    dom = BeautifulSoup(response.content, 'html.parser')

    match_data = dom.find_all(attrs={'class':'starting-lineups__teams starting-lineups__teams--xs starting-lineups__teams--md starting-lineups__teams--lg'})
    start_pitcher_data = dom.find_all(attrs={'class':'starting-lineups__pitcher--link'})
    
    results = {}
    i = 1
    for match in match_data:
        try:
            lineup_data = match.text.split('\n')
            
            home_team = lineup_data[6].split(' ')[-2]
            away_team = lineup_data[3].split(' ')[-2]
            
            home_start_pitcher = pitcher_data[i].text
            home_start_pitcher_num = list(start_pitcher_data[i].attrs.values())[1][-6:]
            away_start_pitcher = pitcher_data[i+1].text
            away_start_pitcher_num = list(start_pitcher_data[i+2].attrs.values())[1][-6:]
            
            try:
                home_start_pitcher_data = pitch_recode[pitch_recode['pitcher_key_mlbam'] == home_start_pitcher_num]
            except:
                # 모든 열의 평균 계산
                numeric_means = pitch_recode.select_dtypes(include='number').mean()

                # 비숫자형 열의 첫 번째 값 가져오기
                non_numeric_data = pitch_recode.select_dtypes(exclude='number').iloc[0]

                # 평균값과 비숫자형 데이터를 결합
                mean_data = pd.concat([numeric_means, non_numeric_data])

                # 평균값을 데이터프레임으로 변환
                home_start_pitcher_num = 123456
                home_start_pitcher_data = pd.DataFrame(mean_data).transpose()
                home_start_pitcher_data['pitcher_key_mlbam'] = 123456
                home_start_pitcher_data['(P) Name'] = home_start_pitcher
                home_start_pitcher_data['(P) pitch_split'] = home_start_pitcher_split
                
            try:
                away_start_pitcher_data = pitch_recode[pitch_recode['pitcher_key_mlbam'] == away_start_pitcher_num]
            except:
                # 모든 열의 평균 계산
                numeric_means = pitch_recode.select_dtypes(include='number').mean()

                # 비숫자형 열의 첫 번째 값 가져오기
                non_numeric_data = pitch_recode.select_dtypes(exclude='number').iloc[0]

                # 평균값과 비숫자형 데이터를 결합
                mean_data = pd.concat([numeric_means, non_numeric_data])

                # 평균값을 데이터프레임으로 변환
                away_start_pitcher_num = 654321
                away_start_pitcher_data = pd.DataFrame(mean_data).transpose()
                away_start_pitcher_data['pitcher_key_mlbam'] = 654321
                away_start_pitcher_data['(P) Name'] = away_start_pitcher
                away_start_pitcher_data['(P) pitch_split'] = away_start_pitcher_split
            
            home_batters_data = pd.DataFrame()
            away_batters_data = pd.DataFrame()
            
            for i in range(9):
                home_batter = lineup_data[21 + i].split(' (')[0]
                home_position = lineup_data[21 + i].split(' (')[1].split(' ')[1]
                home_batter_num = list(match_data[1].find_all(attrs={'class':'starting-lineups__player--link'})[i].attrs.values())[1][-6:]
                        
                try:
                    batter_data = bat_recode[bat_recode['batter_key_mlbam'] == home_batter_num]
                    home_batters_data = pd.concat([home_batters_data, batter_data], ignore_index=True)
                        
                except:
                    # 모든 열의 평균 계산
                    numeric_means = bat_recode.select_dtypes(include='number').mean()

                    # 비숫자형 열의 첫 번째 값 가져오기
                    non_numeric_data = bat_recode.select_dtypes(exclude='number').iloc[0]

                    # 평균값과 비숫자형 데이터를 결합
                    mean_data = pd.concat([numeric_means, non_numeric_data])

                    # 평균값을 데이터프레임으로 변환
                    batter_data = pd.DataFrame(mean_data).transpose()
                    batter_data['batter_key_mlbam'] = 99999 - i
                    batter_data['Name'] = home_batter
                    batter_data['bat_split'] = home_split  
                    home_batters_data = pd.concat([home_batters_data, batter_data], ignore_index=True)

                away_batter = lineup_data[11 + i].split(' (')[0]
                away_position = lineup_data[11 + i].split(' (')[1].split(' ')[1]
                away_batter_num = list(match_data[1].find_all(attrs={'class':'starting-lineups__player--link'})[i+9].attrs.values())[1][-6:]
                        
                try:
                    batter_data = bat_recode[bat_recode['batter_key_mlbam'] == away_batter_num]
                    away_batters_data = pd.concat([away_batters_data, batter_data], ignore_index=True)
                        
                except:
                    # 모든 열의 평균 계산
                    numeric_means = bat_recode.select_dtypes(include='number').mean()

                    # 비숫자형 열의 첫 번째 값 가져오기
                    non_numeric_data = bat_recode.select_dtypes(exclude='number').iloc[0]

                    # 평균값과 비숫자형 데이터를 결합
                    mean_data = pd.concat([numeric_means, non_numeric_data])

                    # 평균값을 데이터프레임으로 변환
                    batter_data = pd.DataFrame(mean_data).transpose()
                    batter_data['batter_key_mlbam'] = 99999 - i
                    batter_data['Name'] = home_batter
                    batter_data['bat_split'] = home_split
                    away_batters_data = pd.concat([away_batters_data, batter_data], ignore_index=True)
            
            
            # 불펜 투수 계산
            
            home_relief_data = pd.DataFrame()
            away_relief_data = pd.DataFrame()
            
            start_pitcher_mlb_nums = pybaseball.pitching_stats(year, qual=30)
            start_pitcher_mlb_nums = start_pitcher_mlb_nums[start_pitcher_mlb_nums['GS'] / starter_data['G'] > 0.5]
            start_pitcher_mlb_nums = list(start_pitcher_mlb_nums[start_pitcher_mlb_nums['IDfg']])
            start_pitcher_mlb_nums = list(pybaseball.playerid_reverse_lookup(start_pitcher_mlb_nums, key_type='fangraphs')['key_mlbam'])
            
            home_depth_num = depth_num(home_team)
            away_depth_num = depth_num(away_team)
            
            for pitcher_num in home_depth_num:
                try:
                    if (pitcher_num in list(pitch_recode['pitcher_key_mlbam'])) and (pitcher_num not in start_pitcher_mlb_nums):
                        pitcher_data = pitch_recode[pitch_recode['Pitcher Number'] == pitcher_num]
                        home_relief_data = pd.concat([home_relief_data, pitcher_data], ignore_index=True)
                except:
                    continue
                    
            for pitcher_num in away_depth_num:
                try:
                    if (pitcher_num in list(pitch_recode['pitcher_key_mlbam'])) and (pitcher_num not in start_pitcher_mlb_nums):
                        pitcher_data = pitch_recode[pitch_recode['Pitcher Number'] == pitcher_num]
                        away_relief_data = pd.concat([away_relief_data, pitcher_data], ignore_index=True)
                except:
                    continue

            home_batter_away_starter_list = pitcher_batter_aug(away_start_pitcher_data, home_batters_data, home_team)
            home_batter_away_relief_list = pitcher_batter_aug(away_relief_data, home_batters_data, home_team)
            away_batter_home_starter_list = pitcher_batter_aug(home_start_pitcher_data, away_batters_data, home_team)
            away_batter_home_relief_list = pitcher_batter_aug(home_relief_data, away_batters_data, home_team)
            
            load_scaler = load(open('scaler.pkl', 'rb'))

            model_path = 'model/mlb_model_dnn.hdf5'
            model = tf.keras.models.load_model(model_path) 

            home_batter_away_starter_lineup_list = make_prob_lineup(home_batter_away_starter_list, load_scaler, model)
            home_batter_away_relief_lineup_list = make_prob_lineup(home_batter_away_relief_list, load_scaler, model)
            away_batter_home_starter_lineup_list = make_prob_lineup(away_batter_home_starter_list, load_scaler, model)
            away_batter_home_relief_lineup_list = make_prob_lineup(away_batter_home_relief_list, load_scaler, model)
            
            print('예상 득점 계산 중')
            
            home_expRuns = teamExpectedRuns(home_team_name, away_team_name, home_batter_away_starter_lineup_list, home_batter_away_relief_lineup_list, starter_data, away_start_pitcher, away_start_pitcher_num)
            away_expRuns = teamExpectedRuns(away_team_name, home_team_name, away_batter_home_starter_lineup_list, away_batter_home_relief_lineup_list, starter_data, home_start_pitcher, home_start_pitcher_num)

            # 승률 계산
            exp_runs_diff = home_expRuns - away_expRuns
            home_win_prob = logistic_win_prob(exp_runs_diff)
            away_win_prob = 1 - home_win_prob

            print(f'\n{home_team_name}의 예상 승률: {home_win_prob:.2%}')
            print(f'{away_team_name}의 예상 승률: {away_win_prob:.2%}')

            results[link] = {
                'home_team': home_team_name,
                'away_team': away_team_name,
                'home_expRuns': home_expRuns,
                'away_expRuns': away_expRuns,
                'home_win_prob': home_win_prob,
                'away_win_prob': away_win_prob
            }
        except:
            print('이 경기는 라인업이 뜨지 않았거나 취소되었습니다.')

    return results