In [4]:
import requests
import pandas as pd
import time
import re
import csv
import os
from bs4 import BeautifulSoup
import random
import pickle
from tqdm import tqdm
import numpy as np 

In [5]:
# what: parse a saved horse page and turn its race-result table into a DataFrame
# for:  feature extraction
# in:   path of the saved HTML (.bin)
# out:  race-result table (DataFrame)
def parse_horse_html(bin_path):
    """Read a saved horse-result page and return its race-result table.

    Args:
        bin_path: Path of a file holding the raw page bytes. The bytes are
            decoded as EUC-JP and undecodable bytes are dropped.

    Returns:
        pandas.DataFrame with one row per table row after the header row and
        the 22 columns named in ``layout`` below. ``race_date``, ``race_id``
        and ``jockey_id`` are taken from link URLs (None when the row has no
        matching link); every other column is the stripped text of a cell.

    Raises:
        ValueError: The page has no ``<table class="db_h_race_results">``.
        IndexError: A row has fewer cells than the highest index read (28).
    """
    with open(bin_path, "rb") as f:
        raw_bytes = f.read()
    soup = BeautifulSoup(raw_bytes.decode("EUC-JP", errors="ignore"), "html.parser")

    # --- locate the race-result table ---
    result_table = soup.find("table", class_="db_h_race_results")
    if not result_table:
        raise ValueError("馬結果テーブルが見つかりません。")

    date_href = re.compile(r"/race/list/(\d+)")
    race_href = re.compile(r"/race/(\d+)")
    jockey_href = re.compile(r"/jockey/result/recent/(\d+)")

    # (output column, source): an int is a cell index whose text is used,
    # a compiled pattern means "first captured group of the first matching link".
    # Cell 26 (winning horse) is intentionally not extracted.
    layout = [
        ("race_date", date_href),      # race date
        ("place", 1),                  # meeting
        ("weather", 2),                # weather
        ("race_no", 3),                # race number
        ("race_id", race_href),        # race id
        ("num_of_horses", 6),          # field size
        ("frame_no", 7),               # frame number
        ("horse_no", 8),               # horse number
        ("odds", 9),                   # odds
        ("popularity", 10),            # popularity rank
        ("finish_position", 11),       # finishing position
        ("jockey_id", jockey_href),    # jockey
        ("weight", 13),                # carried weight
        ("distance", 14),              # distance
        ("course_condition", 16),      # course condition
        ("time", 18),                  # time
        ("margin", 19),                # margin
        ("passing", 21),               # passing order
        ("pace", 22),                  # pace
        ("final_3f", 23),              # final 3 furlongs
        ("horse_weight", 24),          # horse weight
        ("prize_money", 28),           # prize money
    ]

    def linked_id(tr, href_pattern):
        """Return the id captured from the first matching link in the row, or None."""
        anchor = tr.find("a", href=href_pattern)
        return href_pattern.search(anchor["href"]).group(1) if anchor else None

    records = []
    for tr in result_table.find_all("tr")[1:]:  # skip the header row
        tds = tr.find_all("td")
        records.append([
            tds[source].get_text(strip=True) if isinstance(source, int) else linked_id(tr, source)
            for _, source in layout
        ])

    return pd.DataFrame(records, columns=[name for name, _ in layout])

In [None]:
# what: 馬結果テーブルの前処理をする関数
# for:  AIモデルがうけつけられるようにする
# in:   レース結果テーブルの列(.pkl)
# out:  レース結果テーブルの列(.pkl)

def encoding_weather(weather):
    """Encode a weather label as a numeric code.

    Mapping: 晴 -> 0, 曇 -> 1, 小雨 -> 2, 雨 -> 3, 雪 -> 4.
    Missing values and any other label give np.nan.
    """
    if pd.isna(weather):
        return np.nan
    codes = {"晴": 0, "曇": 1, "小雨": 2, "雨": 3, "雪": 4}
    return codes.get(str(weather), np.nan)

def parse_encoding_distance(distance):
    """Split a distance label such as "芝1800" into (surface code, length).

    The first character selects the surface code (芝 -> 0, ダ -> 1, 障 -> 2,
    anything else -> np.nan); the remaining characters are parsed as an int.

    Args:
        distance: Label string, or a missing value.

    Returns:
        Tuple ``(state, length)``. Either element is np.nan when it cannot be
        determined. Missing, empty and non-string inputs give
        ``(np.nan, np.nan)``.
    """
    if pd.isna(distance):
        return (np.nan, np.nan)

    # Indexing an empty string or a non-string used to raise and abort the
    # whole column conversion; treat those like any other unparsable value.
    if not isinstance(distance, str):
        return (np.nan, np.nan)
    label = distance.strip()
    if not label:
        return (np.nan, np.nan)

    # --- surface (state) ---
    surface_codes = {"芝": 0, "ダ": 1, "障": 2}
    state = surface_codes.get(label[0], np.nan)

    # --- length ---
    try:
        length = int(label[1:])
    except ValueError:
        length = np.nan

    return (state, length)

def encoding_course_condition(condition):
    """Encode a course-condition label as a numeric code.

    Mapping: 良 -> 0, 稍重 -> 1, 重 -> 2, 不良 -> 3.
    Missing values and any other label give np.nan.
    """
    # NOTE(review): only these full labels are recognised; confirm the scraped
    # pages never use abbreviated forms, which would end up as NaN here.
    if pd.isna(condition):
        return np.nan
    codes = {"良": 0, "稍重": 1, "重": 2, "不良": 3}
    return codes.get(str(condition), np.nan)

def time_to_seconds(tstr):
    """Convert a time such as "1:51.3" into seconds as a float.

    Args:
        tstr: "m:ss.s" string, a plain number (string or numeric), or a
            missing value.

    Returns:
        ``int(minutes) * 60 + float(seconds)`` when the text contains ":",
        otherwise ``float(tstr)``. np.nan for missing or unparsable input.
    """
    try:
        if pd.isna(tstr):
            return np.nan
        text = str(tstr)
        if ":" in text:
            # More than one ":" fails the unpacking and falls through to NaN.
            minutes, seconds = text.split(":")
            return int(minutes) * 60 + float(seconds)
        return float(tstr)
    except (TypeError, ValueError):
        # Only parse failures map to NaN; a bare ``except`` here also swallowed
        # KeyboardInterrupt, so a long ``.apply`` could not be interrupted.
        return np.nan
    
def parse_margin(margin):
    """Convert a margin label into a float.

    Named margins map to fixed values (アタマ -> 0.2, クビ -> 0.1,
    ハナ -> 0.05); any other text is parsed with ``float``.

    Args:
        margin: Margin text such as "1.1" or "クビ", or a missing value.

    Returns:
        The numeric margin, or np.nan for missing or unparsable input.
    """
    named_margins = {"アタマ": 0.2, "クビ": 0.1, "ハナ": 0.05}
    try:
        if pd.isna(margin):
            return np.nan
        # Strip so a padded label still matches a named margin.
        label = str(margin).strip()
        if label in named_margins:
            return named_margins[label]
        return float(label)
    except (TypeError, ValueError):
        # Only parse failures become NaN; a bare ``except`` also hid
        # KeyboardInterrupt and SystemExit.
        return np.nan
    
# def parse_margin(margin):
#     if pd.isna(margin):
#         return np.nan

#     s = str(margin).strip()

#     # 特殊表記
#     if s in ["ハナ"]:
#         return 0.05
#     if s in ["クビ"]:
#         return 0.1
#     if s in ["アタマ"]:
#         return 0.2
#     if s in ["大差"]:
#         return 10.0
#     if s in ["中止", "失格", "取消"]:
#         return np.nan

#     # 分数（例: "1/2", "3/4"）
#     if "/" in s:
#         try:
#             return eval(s)  # "1/2" → 0.5
#         except:
#             pass

#     # 複合分数（例: "1 1/4"）
#     if " " in s:
#         try:
#             whole, frac = s.split()
#             return float(whole) + eval(frac)
#         except:
#             pass

#     # 通常の数字（例: "1.1"）
#     try:
#         return float(s)
#     except:
#         return np.nan

def parse_passing(passing):
    """Split a passing-order string such as "3-3-2-2" into a list of ints.

    Args:
        passing: "-"-separated positions, or a missing value.

    Returns:
        List of ints, NaN-padded on the right to four entries so that every
        result has the same width as the all-NaN fallback ("5-4" gives
        ``[5, 4, nan, nan]``). A string with more than four parts is returned
        unpadded and untruncated. Missing or unparsable input gives four NaNs.
    """
    width = 4
    try:
        if pd.isna(passing):
            return [np.nan] * width
        positions = [int(part) for part in str(passing).split("-")]
    except (TypeError, ValueError):
        # Only parse failures become NaN; a bare ``except`` also hid
        # KeyboardInterrupt and SystemExit.
        return [np.nan] * width

    # Without padding, the result width depended on the input, which breaks a
    # fixed four-column expansion when no row has four parts.
    return positions + [np.nan] * max(0, width - len(positions))

def parse_pace(pace):
    """Split a pace string such as "34.1-44.8" into a list of floats.

    Args:
        pace: "-"-separated numbers, or a missing value.

    Returns:
        List of floats, NaN-padded on the right to two entries so that every
        result has the same width as the all-NaN fallback ("34.1" gives
        ``[34.1, nan]``). A string with more than two parts is returned
        unpadded and untruncated. Missing or unparsable input gives two NaNs.
    """
    width = 2
    try:
        if pd.isna(pace):
            return [np.nan] * width
        values = [float(part) for part in str(pace).split("-")]
    except (TypeError, ValueError):
        # Only parse failures become NaN; a bare ``except`` also hid
        # KeyboardInterrupt and SystemExit.
        return [np.nan] * width

    # Keep the width fixed so a two-column expansion never comes up short.
    return values + [np.nan] * max(0, width - len(values))

def parse_bodyweight(bw):
    """Split a body-weight string such as "494(-4)" into (weight, difference).

    Args:
        bw: Text of the form "<weight>(<difference>)" or just "<weight>".

    Returns:
        Tuple ``(weight, diff)``. ``diff`` is np.nan when there is no
        parenthesised part or when it is not an integer; the weight is kept
        in that case. ``(np.nan, np.nan)`` when the weight itself is not an
        integer (this includes missing values, whose text is not numeric).
    """
    parts = str(bw).strip().split("(")

    try:
        weight = int(parts[0])
    except ValueError:
        return (np.nan, np.nan)

    diff = np.nan
    if len(parts) > 1:
        try:
            diff = int(parts[1].rstrip(")"))
        except ValueError:
            # An unreadable difference must not discard a valid weight.
            diff = np.nan

    return (weight, diff)

## 実行関数

In [None]:
# Collect the distinct horse_id values of the race-result table and save them.
# horse_ids_df: the distinct horse_id values (the array returned by unique(), not a DataFrame)
result_table_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\race_result_table.pkl"
df = pd.read_pickle(result_table_path)
horse_ids_df = df["horse_id"].unique()
# Written as a single column with no index and no header row, so readers must use header=None.
pd.Series(horse_ids_df).to_csv(r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_id_list.csv", index=False, header=False)
# # also build the lists of trainer_id and jockey_id
# df_trainer_id = df["trainer_id"].unique().tolist()
# df_jockey_id = df["jockey_id"].unique().tolist()


### horse_idを使って各馬の成績ページ(HTML)をbinファイルとして保存

In [None]:
# Path of the CSV with one horse_id per line (no header row)
csv_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_id_list.csv"
# Folder that receives one <horse_id>.bin file per horse
save_dir = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_html"

# Create the destination up front; without it every open() below fails.
os.makedirs(save_dir, exist_ok=True)

# Load the id list
df = pd.read_csv(csv_path, header=None)
horse_ids = df[0].astype(str).tolist()
user_agents = [ # netkeiba.com changes its page layout depending on the client -> stick to PC browsers
    # Windows Chrome family
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edg/121.0.2277.83 Safari/537.36",
    # macOS Chrome family
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
]

for horse_id in tqdm(horse_ids, total=len(horse_ids)):
    url = f"https://db.netkeiba.com/horse/result/{horse_id}"

    # Destination file; an existing file means this horse_id was already fetched.
    save_path = os.path.join(save_dir, f"{horse_id}.bin")
    if os.path.exists(save_path):
        continue

    try:
        res = requests.get(url, headers={"User-Agent": random.choice(user_agents)}, timeout=10)
        res.raise_for_status()  # turn HTTP error statuses into exceptions

        # Save the raw bytes. Write to a temporary name and rename afterwards so
        # an interrupted write never leaves a truncated file that the skip
        # check above would treat as complete.
        tmp_path = save_path + ".part"
        with open(tmp_path, "wb") as f:
            f.write(res.content)
        os.replace(tmp_path, save_path)

    except Exception as e:
        print(f"Error fetching {horse_id}: {e}")

    finally:
        # Pause after every request, failed ones included, so that a run of
        # errors does not turn into back-to-back requests against the server.
        time.sleep(random.uniform(0.8, 2.0))

100%|██████████| 8714/8714 [00:00<00:00, 123150.73it/s]


### 馬の過去成績テーブルの作成

In [28]:
# what: parse each horse's saved page, merge the per-horse result tables into one and save it as a pickle
# for:  feature extraction
# in:   horse_id_list (.csv) and the saved pages (.bin)
# out:  merged result table (.pkl)

csv_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_id_list.csv"
result_table_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_table.pkl"
bin_dir = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_html"
df = pd.read_csv(csv_path, header=None)

# horse_ids already present in the pickle; on the very first run there is no pickle yet.
if os.path.exists(result_table_path):
    existing_df = pd.read_pickle(result_table_path)
    existing_ids = set(existing_df["horse_id"].astype(str))
else:
    existing_df = None
    existing_ids = set()

# Only parse horse_ids that are not in the pickle yet.
# The CSV has no header row, so the ids are in column 0 (there is no "horse_id" column).
target_ids = [hid for hid in df[0].astype(str) if hid not in existing_ids]

new_dfs = []
for horse_id in tqdm(target_ids, total=len(target_ids)):
    bin_path = os.path.join(bin_dir, f"{horse_id}.bin")
    if not os.path.exists(bin_path):
        print(f"Missing bin file: {horse_id}")
        continue
    try:
        # --- parse the HTML ---
        df_horse = parse_horse_html(bin_path)
        df_horse.insert(0, "horse_id", horse_id)  # horse_id becomes the first column
        new_dfs.append(df_horse)
    except Exception as e:
        print(f"Error parsing {horse_id}: {e}")

if not new_dfs:
    # pd.concat([]) raises, so a re-run with nothing new must stop here.
    print("No new horse results to add; the pickle is left unchanged.")
else:
    new_result_df = pd.concat(new_dfs, ignore_index=True)

    # Preprocess the new rows before appending them.
    new_result_df = new_result_df.drop(columns=["place"])
    new_result_df["weather"] = new_result_df["weather"].apply(encoding_weather)
    new_result_df[["course_state", "distance_length"]] = new_result_df["distance"].apply(lambda x: pd.Series(parse_encoding_distance(x)))
    new_result_df = new_result_df.drop(columns=["distance"])
    new_result_df["course_condition"] = new_result_df["course_condition"].apply(encoding_course_condition)
    new_result_df["time"] = new_result_df["time"].apply(time_to_seconds)
    new_result_df["margin"] = new_result_df["margin"].apply(parse_margin)
    # reindex pins the expansion to exactly four / two columns; otherwise the
    # assignment fails when no row in this batch has that many parts.
    passing_parts = new_result_df["passing"].apply(lambda x: pd.Series(parse_passing(x))).reindex(columns=range(4))
    new_result_df[["passing_1st", "passing_2nd", "passing_3rd", "passing_4th"]] = passing_parts
    new_result_df = new_result_df.drop(columns=["passing"])
    pace_parts = new_result_df["pace"].apply(lambda x: pd.Series(parse_pace(x))).reindex(columns=range(2))
    new_result_df[["pace_1st", "pace_2nd"]] = pace_parts
    new_result_df = new_result_df.drop(columns=["pace"])
    new_result_df[["body_weight","body_diff"]] = new_result_df["horse_weight"].apply(lambda x: pd.Series(parse_bodyweight(x)))
    new_result_df = new_result_df.drop(columns=["horse_weight"])

    if existing_df is None:
        result_df = new_result_df
    else:
        result_df = pd.concat([existing_df, new_result_df], ignore_index=True)
    print(f"✅ 新規{len(new_result_df)}件を追加しました（合計 {len(result_df)} 件）")
    # to_pickle returns None, so its result is not bound to a name.
    result_df.to_pickle(result_table_path)

100%|██████████| 8712/8712 [27:54<00:00,  5.20it/s]   


✅ 新規178919件を追加しました（合計 178965 件）


In [31]:
# Reload the saved table from disk and print its first ten rows as a sanity check.
horse_table_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_table.pkl"
horse_table_df = pd.read_pickle(horse_table_path)

print(horse_table_df.head(10))

     horse_id race_date  weather race_no       race_id num_of_horses frame_no  \
0  2018104746  20240323      2.0      11  202407010511            16        4   
1  2018104746  20240114      0.0      10  202408010510            13        8   
2  2018104746  20231223      0.0      10  202309050710            16        7   
3  2018104746  20231028      0.0      10  202308020810            15        7   
4  2018104746  20231001      1.0      10  202309040910            12        8   
5  2018104746  20230708      1.0      11  202307030311            16        6   
6  2018104746  20230513      2.0      10  202308010710            16        1   
7  2018104746  20230325      2.0      11  202307020511            16        1   
8  2018104746  20230212      0.0      10  202309010210            13        8   
9  2018104746  20230109      0.0      10  202307010410            16        6   

  horse_no  odds popularity  ... course_state distance_length passing_1st  \
0        8   9.8          5  ..