# Pythonを用いた競馬予測

## インポート

In [1]:
import re  # 正規表現
import time
from urllib.request import Request, urlopen

import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

## データ収集
- カレンダーのページから開催日一覧を取得（スクレイピング）
    1. 2023年1月：https://race.netkeiba.com/top/calendar.html?year=2023&month=1
- 開催ページからレースid一覧を取得
    1. 2023年1月5日開催：https://race.netkeiba.com/top/race_list.html?kaisai_date=20230105
- レース結果ページからレース結果テーブル一覧を取得

In [None]:
from urllib.request import Request, urlopen

# Sample race-result page used to explore what the HTML contains.
url = "https://db.netkeiba.com/race/202306050811/"
# The request is sent with a browser-like User-Agent header.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}

req = Request(url, headers=headers)
# Read the body inside a with-block so the HTTP response is closed afterwards
# instead of being left open.
with urlopen(req) as response:
    html = response.read()
print(html)

In [None]:
import pandas as pd

pd.read_html(html)

In [None]:
pd.read_html(html)[0]

In [None]:
pd.read_html(html)[1]

In [None]:
pd.read_html(html)[2]

### 開催日一覧を取得する

In [None]:
# Calendar page for January 2023.
url = "https://race.netkeiba.com/top/calendar.html?year=2023&month=1"
# The request is sent with a browser-like User-Agent header.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}

req = Request(url, headers=headers)
# Read the body inside a with-block so the HTTP response is closed afterwards
# instead of being left open.
with urlopen(req) as response:
    html = response.read()
html

In [None]:
from bs4 import BeautifulSoup

# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns about guessing), so results can differ per machine.
# "html.parser" matches the parser used by the other cells in this notebook.
soup = BeautifulSoup(html, "html.parser")
soup

In [None]:
a = soup.find("table", class_="Calendar_Table").find("a")
a

In [None]:
a["href"]

In [None]:
import re  # regular expressions

# \d{8} matches eight consecutive digits (\d is a single digit); wrapping that
# part in () makes findall return only the captured text.
re.findall(r"kaisai_date=(\d{8})", a["href"])[0]

In [12]:
# All links inside the calendar table. find_all is the current bs4 name;
# findAll is only kept as a deprecated alias.
a_list = soup.find("table", class_="Calendar_Table").find_all("a")

In [None]:
# Pull the 8-digit kaisai_date value out of every calendar link's href.
# A comprehension keeps its loop variable local, so the notebook-level `a`
# from the earlier cell is not overwritten.
kaisai_date_list = [
    re.findall(r"kaisai_date=(\d{8})", a["href"])[0] for a in a_list
]

kaisai_date_list

In [14]:
import time
from tqdm.notebook import tqdm

def scrape_kaisai_date(from_: str, to_: str) -> list[str]:
    """Collect the race dates linked from netkeiba's monthly calendar pages.

    One calendar page is fetched for every month start produced by
    ``pd.date_range(from_, to_, freq="MS")``, with a one-second pause after
    each request.

    Args:
        from_: Start of the period, in a form ``pd.date_range`` accepts
            (e.g. ``"2023-01"``).
        to_: End of the period, in the same form as ``from_``.

    Returns:
        The 8-digit ``kaisai_date`` values (e.g. ``"20230105"``) found in the
        links of each calendar table, in the order they appear. Months whose
        page has no calendar table, and links without a ``kaisai_date=``
        parameter, contribute nothing.
    """
    # Loop-invariant values are built once instead of once per month.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
    kaisai_date_pattern = re.compile(r"kaisai_date=(\d{8})")

    kaisai_date_list = []

    for date in tqdm(pd.date_range(from_, to_, freq="MS")):
        url = f"https://race.netkeiba.com/top/calendar.html?year={date.year}&month={date.month}"

        req = Request(url, headers=headers)
        # The with-block closes the HTTP response once the body has been read.
        with urlopen(req) as response:
            html = response.read()

        # Pause between requests.
        time.sleep(1)

        soup = BeautifulSoup(html, 'html.parser')

        calendar_table = soup.find("table", class_="Calendar_Table")
        if calendar_table is None:
            # Report and move on rather than failing with AttributeError and
            # losing the dates already collected.
            print(f"no calendar table found at {url}")
            continue

        # href=True skips anchors that carry no href attribute at all.
        for a in calendar_table.find_all("a", href=True):
            match = kaisai_date_pattern.search(a["href"])
            if match:
                kaisai_date_list.append(match.group(1))

    return kaisai_date_list

In [None]:
scrape_kaisai_date("2023-01", "2023-12")

### 開催ページからrace_idを取得

In [16]:
url = "https://race.netkeiba.com/top/race_list.html?kaisai_date=20230105"

In [17]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [18]:
# The request is sent with a browser-like User-Agent header.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
req = Request(url, headers=headers)
# Read the body inside a with-block so the HTTP response is closed afterwards
# instead of being left open.
with urlopen(req) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')


In [19]:
soup.find("div", class_="RaceList_Box")  # Elements built dynamically by JavaScript cannot be retrieved with BeautifulSoup.

レース一覧はJavaScriptで動的に描画されるため、BeautifulSoupでは要素を取得できない。そこで、Selenium（ChromeDriver）でブラウザを操作して取得する。

In [None]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

driver_path = ChromeDriverManager().install()
driver_path


In [21]:
driver = webdriver.Chrome(service=Service(driver_path))

In [22]:
driver.get(url)

In [None]:
from selenium.webdriver.common.by import By
li_list = driver.find_elements(By.CLASS_NAME, "RaceList_DataItem")
li_list

In [None]:
li = li_list[0]
li

In [None]:
href = li.find_element(By.TAG_NAME, "a").get_attribute("href")
href

In [None]:
import re

re.findall(r"race_id=(\d{12})", href)[0]

In [27]:
# Take the 12-digit race_id from the first link of every race-list item.
# A comprehension keeps its loop variable local, so the notebook-level `li`
# and `href` from the earlier cells are not overwritten.
race_id_list = [
    re.findall(
        r"race_id=(\d{12})",
        li.find_element(By.TAG_NAME, "a").get_attribute("href"),
    )[0]
    for li in li_list
]

In [None]:
len(race_id_list)

In [29]:
driver.quit()

## 関数化

In [30]:
import scraping

In [None]:
kaisai_date_list = scraping.scrape_kaisai_date(from_="2023-01", to_="2023-12")

In [32]:
import time
from selenium.webdriver.chrome.options import Options
from tqdm.notebook import tqdm
import traceback

def scrape_race_id_list(kaisai_date_list: list[str]) -> list[str]:
    """Collect race ids from the race-list page of each given race date.

    A headless Chrome session opens
    ``https://race.netkeiba.com/top/race_list.html?kaisai_date=<date>`` for
    every entry and reads the first link of each ``RaceList_DataItem``
    element.

    Args:
        kaisai_date_list: Race dates as 8-digit strings (e.g. ``"20230105"``),
            as returned by ``scrape_kaisai_date``.

    Returns:
        The 12-digit ``race_id`` values found, in the order encountered.
        Links without an href or without a ``race_id=`` parameter are
        skipped. If a page raises an error, the URL and traceback are printed,
        scraping stops, and the ids collected so far are returned.
        ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted.
    """
    options = Options()
    # Run in the background (headless) to keep the scrape lightweight.
    options.add_argument("--headless")
    driver_path = ChromeDriverManager().install()
    # Compiled once; the same pattern is applied to every link.
    race_id_pattern = re.compile(r"race_id=(\d{12})")
    race_id_list = []

    # The with-statement quits the driver automatically once the loop is done.
    with webdriver.Chrome(service=Service(driver_path), options=options) as driver:
        for kaisai_date in tqdm(kaisai_date_list):
            url = f"https://race.netkeiba.com/top/race_list.html?kaisai_date={kaisai_date}"
            try:
                driver.get(url)
                # Fixed pause after each page load (presumably to let the
                # JavaScript-rendered list appear and to space out requests).
                time.sleep(1)
                li_list = driver.find_elements(By.CLASS_NAME, "RaceList_DataItem")
                for li in li_list:
                    href = li.find_element(By.TAG_NAME, "a").get_attribute("href")
                    # get_attribute may return None; a link that does not carry
                    # a race_id should be skipped, not abort the whole scrape.
                    match = race_id_pattern.search(href or "")
                    if match:
                        race_id_list.append(match.group(1))
            except Exception:
                # Only ordinary errors are handled here, so Ctrl-C and
                # interpreter exit still propagate.
                print(f"stopped at {url}")
                print(traceback.format_exc())  # show what went wrong
                break
    return race_id_list

In [None]:
race_id_list = scrape_race_id_list(kaisai_date_list)

In [None]:
race_id_list

### スクリプトのチェック

In [35]:
import scraping
%load_ext autoreload

In [36]:
%autoreload

In [None]:
race_id_list = scraping.scrape_race_id_list(kaisai_date_list[:10])

※ 上の `%load_ext autoreload` と `%autoreload` は、notebook上で編集済みのモジュール（`scraping`）を再読み込みするためのもの。

In [None]:
race_id_list