In [21]:
import random
import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

class RaceResult:
    """Scraper for race-result pages served under https://db.netkeiba.com/race/."""

    # Vocabulary used to classify the tokens of the race header text.
    _WEATHER = ('曇', '晴', '雨', '小雨', '小雪', '雪')
    _SURFACES = ('芝', 'ダート', '障')
    _GROUND_STATES = ('良', '稍重', '重', '不良')

    # href prefixes identifying horse / jockey / trainer links in the result table.
    _HORSE_HREF = re.compile(r'^/horse/')
    _JOCKEY_HREF = re.compile(r'^/jockey/result/recent/')
    _TRAINER_HREF = re.compile(r'^/trainer/result/recent/')

    @staticmethod
    def _parse_race_info(text):
        """Turn the race header text into a ``{column name: value}`` mapping."""
        info = {}
        for token in re.findall(r'\w+', text):
            if 'm' in token:
                # Only take the digits directly in front of "m"; joining every
                # digit run in the token would glue any other number in the
                # same token (e.g. a lap count) onto the distance.
                distance = re.search(r'(\d+)m', token)
                if distance:
                    info['コース長'] = int(distance.group(1))
            if token in RaceResult._WEATHER:
                info['天気'] = token
            if token in RaceResult._SURFACES:
                info['レース場'] = token
            if token in RaceResult._GROUND_STATES:
                info['場の状態'] = token
        return info

    @staticmethod
    def _collect_ids(table, href_pattern):
        """Return the numeric IDs embedded in every matching link of ``table``."""
        links = table.find_all('a', attrs={'href': href_pattern})
        return [int(''.join(re.findall(r'\d+', link['href']))) for link in links]

    @staticmethod
    def scrape(race_ids, timeout=30):
        """
        Scrape race results for the given race IDs.

        For every race the first HTML table of the page is read, spaces are
        stripped from its column names, and the columns 'コース長', '天気',
        'レース場', '場の状態' (whichever can be found in the header text) plus
        '馬id', '騎手id' and '調教師id' are added. Every row is indexed by its
        race ID.

        Parameters
        ----------
        race_ids : iterable of str
            Race IDs; each one is appended to the race URL.
        timeout : float, optional
            Seconds to wait for each HTTP response before giving up.

        Returns
        -------
        pd.DataFrame
            All scraped races concatenated; an empty DataFrame when
            ``race_ids`` is empty or no race could be scraped.

        Notes
        -----
        Races whose page raises IndexError or AttributeError while being
        parsed are skipped. Any other exception is printed and stops the
        loop; a keyboard interrupt stops it silently. In both cases the
        races collected so far are still returned.
        """
        race_ids = list(race_ids)
        if not race_ids:
            return pd.DataFrame()

        race_results = {}
        for race_id in tqdm(race_ids, leave=False):
            try:
                url = "https://db.netkeiba.com/race/" + race_id + "/"

                response = requests.get(url, timeout=timeout)
                response.encoding = "EUC-JP"
                # Throttle right after the request so that skipped races are
                # rate-limited too, not only the successfully parsed ones.
                time.sleep(random.uniform(1, 3))

                # Wrap the markup in StringIO: passing literal HTML to
                # read_html is deprecated.
                df = pd.read_html(StringIO(response.text))[0]
                # Strip half-width spaces from the column names.
                df = df.rename(columns=lambda x: x.replace(' ', ''))

                # Weather / course information from the race header.
                soup = BeautifulSoup(response.text, "html.parser")
                intro_text = soup.select("div.data_intro p")[0].text
                for column, value in RaceResult._parse_race_info(intro_text).items():
                    df[column] = value

                # Horse, jockey and trainer IDs from the links of the result table.
                result_table = soup.find('table', attrs={'summary': 'レース結果'})
                horse_id_list = RaceResult._collect_ids(result_table, RaceResult._HORSE_HREF)
                jockey_id_list = RaceResult._collect_ids(result_table, RaceResult._JOCKEY_HREF)
                trainer_id_list = RaceResult._collect_ids(result_table, RaceResult._TRAINER_HREF)

                df['馬id'] = horse_id_list
                df['騎手id'] = jockey_id_list
                df['調教師id'] = trainer_id_list

                df.index = [race_id] * len(df)
                race_results[race_id] = df

            except IndexError:
                continue
            except AttributeError:
                continue
            except KeyboardInterrupt:
                # Manual stop: keep whatever has been collected so far.
                break
            except Exception as e:
                print(e)
                break

        if not race_results:
            return pd.DataFrame()

        return pd.concat(list(race_results.values()))

# Season to scrape; it prefixes every race ID and names the output pickle.
year = 2020

# A race ID is the year followed by four zero-padded two-digit fields:
# place (1-10), kai (1-6), day (1-12) and race number (1-12).
# Every combination is generated here, whether or not such a race exists.
race_id_list = [
    f"{year}{place:02d}{kai:02d}{day:02d}{r:02d}"
    for place in range(1, 11)
    for kai in range(1, 7)
    for day in range(1, 13)
    for r in range(1, 13)
]

race_results = RaceResult.scrape(race_id_list)
race_results.to_pickle(f'../DATA/{year}_race_result.pkl')


                                                                                

In [None]:
# Scratch check: one copy of the race ID per character of the race ID.
["202408040402" for _char in "202408040402"]

In [29]:
from tqdm import tqdm
import time
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

# This frame is the data set *after* feature engineering; the notebook cells
# are out of order at this point.
df = pd.read_pickle('../DATA/recent_5_race_df_for_learning.pkl')

# Look up the date of every race and attach it to the existing frame.
date_data = {}
# Raw string so that "\d" reaches the regex engine instead of being treated
# as an (invalid) string escape.
pattern = r'\d+年\d+月\d+日'
for race_id, each_df in tqdm(df.groupby(df.index)):
    url = "https://db.netkeiba.com/race/" + race_id + "/"
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    response.encoding = 'EUC-JP'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.select('p.smalltxt')[0].text
    str_date = re.search(pattern, content).group()
    # The cell imports `datetime` (there is no `dt` alias), so call it by
    # that name; otherwise this line is a NameError on a fresh kernel.
    race_date = datetime.strptime(str_date, '%Y年%m月%d日')
    # One row per row of the race, indexed by the race ID.
    date_data[race_id] = pd.DataFrame(([race_date] * len(each_df)), index=([race_id] * len(each_df)))

    time.sleep(1)

date_df = pd.concat([date_data[key] for key in date_data])
# NOTE(review): this column-wise concat presumably relies on df's rows already
# being ordered like the groupby keys above (sorted race ID) -- confirm before
# re-running the multi-hour scrape.
df_with_date = pd.concat([df, date_df], axis=1)
df_with_date = df_with_date.rename(columns={0: '開催年月日'})
df_with_date.to_pickle('../DATA/df_for_learning_with_date.pkl')

100%|███████████████████████████████████| 15546/15546 [6:23:03<00:00,  1.48s/it]


In [32]:
import pandas as pd

# Sanity check on the saved frame: compare the race dates at row positions
# 3 and 597.
df = pd.read_pickle('../DATA/df_for_learning_with_date.pkl')
# The frame is indexed by race-ID strings, so integer keys are positions, not
# labels; .iloc says so explicitly instead of relying on the deprecated
# positional fallback of Series.__getitem__.
if df['開催年月日'].iloc[3] < df['開催年月日'].iloc[597]:
    print('おばか')
else:
    print('痴れ者')

おばか
