In [10]:
from selenium.webdriver import Chrome, ChromeOptions
import chromedriver_binary

# Page loaded by the exploratory cells below (a single race's shutuba.html page).
url = 'https://race.netkeiba.com/race/shutuba.html?race_id=202105030609&rf=race_list'
options = ChromeOptions()
# Start Chrome with default options and navigate to the page.
# NOTE(review): no visible cell closes `sample_driver`; call sample_driver.quit() when done exploring.
sample_driver = Chrome(options=options)
sample_driver.get(url)

In [14]:
# First element carrying the 'HorseList' class (a single element, despite the plural name).
sample_elements = sample_driver.find_elements_by_class_name('HorseList')[0]

In [16]:
# Every <td> element found inside that element.
sample_tds = sample_elements.find_elements_by_tag_name('td')

In [17]:
# Print the text of each cell, one per line, to see which position holds which field.
for td in sample_tds:
    print(td.text)



--
アスティ
セ5
54.0
吉田豊
美浦堀井

31.5
11




In [69]:
import pandas as pd
import re

class Shutubahyo:
    """Table scraped from netkeiba ``shutuba.html`` pages, plus helpers to shape it.

    Typical use: ``scraping_table`` -> ``preprocessing`` -> ``merge_horse_results``.
    """

    # Positional column of the scraped row -> column name given by ``preprocessing``.
    # Positions missing from this mapping (2 and 9) are dropped.
    _COLUMN_NAMES = {
        0: '枠', 1: '馬番', 3: '馬名', 4: 'horse_id', 5: '性齢', 6: '斤量',
        7: '騎手', 8: 'jockey_id', 10: '馬体重', 11: '予想オッズ', 12: '人気',
    }

    def __init__(self):
        # One row per scraped 'HorseList' element, labelled by race id.
        self.shutubahyo = pd.DataFrame()

    def scraping_table(self, race_id_list):
        """Scrape every ``HorseList`` row of each race page into ``self.shutubahyo``.

        For each ``<td>`` the cell text is recorded; for cells whose class is
        exactly ``HorseInfo`` or ``Jockey`` the first run of digits in the
        cell's link ``href`` is recorded right after the text.

        Args:
            race_id_list: iterable of race-id strings. Each one is appended to
                the shutuba.html URL and used as the label of the rows it yields.

        Raises:
            Whatever the driver raises while loading or querying a page, and
            IndexError if a link ``href`` contains no digits. Rows gathered
            before the failure are still stored and the browser is still shut down.
        """
        rows = []
        labels = []
        driver = Chrome(options=ChromeOptions())
        try:
            for race_id in race_id_list:
                driver.get('https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id)
                # 'class name' / 'tag name' are the string values of selenium's
                # By.CLASS_NAME / By.TAG_NAME; find_element(s)(by, value) exists in
                # Selenium 3 and 4, unlike the removed find_elements_by_* helpers.
                for element in driver.find_elements('class name', 'HorseList'):
                    row = []
                    for td in element.find_elements('tag name', 'td'):
                        row.append(td.text)
                        if td.get_attribute('class') in ('HorseInfo', 'Jockey'):
                            href = td.find_element('tag name', 'a').get_attribute('href')
                            row.append(re.findall(r'\d+', href)[0])
                    rows.append(row)
                    labels.append(race_id)
        finally:
            try:
                # quit() ends the whole session (browser and driver process);
                # running it in ``finally`` avoids leaking Chrome on errors.
                driver.quit()
            finally:
                # Keep partial results on failure, as the old per-row append did.
                self._append_rows(rows, labels)

    def _append_rows(self, rows, labels):
        """Add scraped rows (lists of cell strings) labelled by ``labels`` to the table."""
        if not rows:
            return
        # Build the frame once instead of growing it row by row (quadratic, and
        # DataFrame.append no longer exists in pandas 2).
        scraped = pd.DataFrame(rows, index=labels)
        if self.shutubahyo.empty:
            self.shutubahyo = scraped
        else:
            self.shutubahyo = pd.concat([self.shutubahyo, scraped])

    def preprocessing(self):
        """Keep the columns listed in ``_COLUMN_NAMES`` and rename them.

        Replaces ``self.shutubahyo`` with the reduced, renamed frame.

        Raises:
            KeyError: if the positional columns are missing, e.g. when the table
                is empty or this method has already been applied.
        """
        selected = self.shutubahyo[list(self._COLUMN_NAMES)]
        self.shutubahyo = selected.set_axis(list(self._COLUMN_NAMES.values()), axis=1)

    def merge_horse_results(self, horse_results, columns, n_race=5):
        """Attach each horse's first ``n_race`` values of the given columns.

        For every name in ``columns`` this adds ``<column>_0`` ...
        ``<column>_{k-1}`` (values as strings) to ``self.shutubahyo``, matched
        on ``horse_id`` against the index of ``horse_results``. Horses with
        fewer rows get missing values in the trailing columns.

        Args:
            horse_results: DataFrame whose index (level 0) holds the ids that
                ``horse_id`` is matched against; rows are taken in their
                existing order.
            columns: iterable of column names of ``horse_results`` to spread out.
            n_race: number of leading rows per horse to use.
        """
        # The per-horse selection does not depend on the column, so do it once.
        recent = horse_results.groupby(level=0).head(n_race)
        for column in columns:
            # Join then split to spread each horse's values over numbered columns.
            # NOTE(review): a value that itself contains ',' would be split into
            # extra columns.
            joined = recent[column].astype(str).groupby(level=0).apply(','.join)
            wide = joined.str.split(',', expand=True).add_prefix('{}_'.format(column))
            self.shutubahyo = self.shutubahyo.merge(
                wide, left_on='horse_id', right_index=True, how='left')

In [70]:
# Fresh instance; its table starts as an empty DataFrame.
st = Shutubahyo()

In [58]:
# Scrape a single race page (this launches a Chrome session).
st.scraping_table(['202105030609'])

In [59]:
# Select and rename the scraped columns, then display the result.
st.preprocessing()
st.shutubahyo

Unnamed: 0,枠,馬番,馬名,horse_id,性齢,斤量,騎手,jockey_id,馬体重,予想オッズ,人気
202105030609,,,アスティ,2016104998,セ5,54.0,吉田豊,733,,32.0,11
202105030609,,,エカテリンブルク,2017105500,牡4,54.0,松田,1030,,29.2,9
202105030609,,,エドノフェリーチェ,2017104060,牝4,54.0,大野,1096,,7.2,4
202105030609,,,ディスカバー,2014105545,セ7,53.0,原,1184,,272.1,12
202105030609,,,ドリームスピリット,2015102790,牡6,54.0,田辺,1075,,14.1,7
202105030609,,,ナミブ,2016100742,セ5,51.0,杉原,1135,,32.0,10
202105030609,,,ヒルノダカール,2016100550,牡5,56.0,池添,1032,,7.9,5
202105030609,,,ベスビアナイト,2017105590,セ4,54.0,菅原明,1179,,14.1,8
202105030609,,,ペレ,2016104614,牝5,54.0,ルメール,5339,,2.5,1
202105030609,,,ライル,2016104868,セ5,55.0,福永,1014,,4.9,2


In [60]:
# Keep a module-level reference to the preprocessed table so it can be re-attached later.
shutubahyo = st.shutubahyo

In [62]:
# Load the stored per-horse results from a path relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load a pickle file you produced yourself.
horse_data = pd.read_pickle('horse_data.pickle')

In [63]:
# First 5 rows for each value of index level 0.
# Presumably the index is the horse id and rows are ordered newest first — TODO confirm.
target = horse_data.groupby(level=0).head(5)

In [65]:
# Number of rows kept per index value (at most 5 after head(5)).
target.index.value_counts()

2015100828    5
2017103381    5
2015100188    5
2017105538    5
2018101525    5
             ..
2018105098    1
2017104241    1
2018105057    1
2017105629    1
2017101134    1
Length: 11389, dtype: int64

In [66]:
target.astype(str).groupby(level=0)['着順'].apply(lambda x: ','.join(x)) 
# Cast to str, then join each group's (up to 5 races') '着順' values with ',' into a single string.

2009100502       9,9,10,10,3
2009102606     12,12,11,5,14
2009103405     10,12,10,12,6
2010100035    16,16,12,14,16
2010100690       8,8,9,13,14
                   ...      
2018110133       4,5,10,2,12
2018110135        6,18,15,18
2018110138        12,9,1,1,2
2018110139             2,8,7
2018110145        10,2,3,1,2
Name: 着順, Length: 11389, dtype: object

In [72]:
# Preview the first rows to check the available columns.
horse_data.head()

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,...,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金,受賞歴
2018101626,2021/04/11,3中山6,晴,7.0,3歳1勝クラス,,10.0,1.0,1.0,52.9,...,**,5-6-6-5,38.0-34.7,34.6,450(+2),,,ミヤビハイディ,,
2018101626,2021/03/27,3中山1,晴,6.0,3歳1勝クラス,,8.0,8.0,8.0,72.1,...,**,8-8-8-8,36.3-36.6,36.3,448(+4),,,スペシャルドラマ,,
2018101626,2021/03/13,2中山5,雨,7.0,3歳1勝クラス,,11.0,6.0,7.0,99.2,...,**,9-9-7-8,37.5-38.2,40.5,444(0),,,ヴァイスメテオール,,
2018101626,2020/10/03,4中山8,曇,9.0,芙蓉S(OP),,9.0,1.0,1.0,35.0,...,**,7-7-9-9,36.2-35.5,36.2,444(+8),,,ランドオブリバティ,,
2018101626,2020/09/05,2札幌7,晴,11.0,札幌2歳S(G3),,14.0,6.0,9.0,10.7,...,**,6-6-5-10,35.0-36.9,37.8,436(-2),,,ソダシ,,


In [71]:
# Re-attach the saved table to `st`. The execution counts show `st` was re-created (In [70])
# after the scrape and save (In [58]-[60]), so this avoids scraping again.
st.shutubahyo = shutubahyo

In [73]:
# Add '着順_0..' and '賞金_0..' columns (default n_race=5) matched on horse_id.
# Re-running this cell merges the same columns again, so it is not idempotent.
st.merge_horse_results(horse_data,['着順','賞金'])

In [74]:
# Display the table with the merged past-result columns.
st.shutubahyo

Unnamed: 0,枠,馬番,馬名,horse_id,性齢,斤量,騎手,jockey_id,馬体重,予想オッズ,...,着順_0,着順_1,着順_2,着順_3,着順_4,賞金_0,賞金_1,賞金_2,賞金_3,賞金_4
202105030609,,,アスティ,2016104998,セ5,54.0,吉田豊,733,,32.0,...,取,5,3,1,6,,150.0,384.1,760.0,
202105030609,,,エカテリンブルク,2017105500,牡4,54.0,松田,1030,,29.2,...,14,1,11,4,4,,760.0,,110.0,160.0
202105030609,,,エドノフェリーチェ,2017104060,牝4,54.0,大野,1096,,7.2,...,3,4,2,7,1,382.7,230.0,608.2,,530.0
202105030609,,,ディスカバー,2014105545,セ7,53.0,原,1184,,272.1,...,7,11,10,6,4,,,,,230.0
202105030609,,,ドリームスピリット,2015102790,牡6,54.0,田辺,1075,,14.1,...,4,3,10,10,11,230.0,383.5,,,
202105030609,,,ナミブ,2016100742,セ5,51.0,杉原,1135,,32.0,...,6,5,9,12,5,,76.0,,,150.0
202105030609,,,ヒルノダカール,2016100550,牡5,56.0,池添,1032,,7.9,...,9,2,2,8,2,,606.0,604.8,,605.6
202105030609,,,ベスビアナイト,2017105590,セ4,54.0,菅原明,1179,,14.1,...,1,1,4,7,3,1097.8,760.0,160.0,,130.0
202105030609,,,ペレ,2016104614,牝5,54.0,ルメール,5339,,2.5,...,3,6,2,1,2,385.0,,608.8,760.0,300.0
202105030609,,,ライル,2016104868,セ5,55.0,福永,1014,,4.9,...,16,1,3,2,2,,760.0,190.0,300.0,300.0
