In [None]:
import requests
import pandas as pd
import time
import re
import csv
import os
from bs4 import BeautifulSoup
import random
import pickle
from tqdm import tqdm
import numpy as np
import os
pd.set_option("display.max_columns", None) # 表示する列数の制限を解除

import sys
sys.path.append("..") # 親ディレクトリを追加
from module.path_reader import PathReader
from module.get_html import GetHTML

In [17]:
# what: HTMLを解析してレース結果テーブルをDataFrame化する関数
# for:  AIモデルの入力形式に合わせる
# in:   取得したhtml(.bin)
# out:  レース結果テーブル(DataFrame)
def parse_race_html(bin_path):
    with open(bin_path, "rb") as f:
            html_text = f.read().decode("EUC-JP", errors="ignore")
    soup = BeautifulSoup(html_text, "html.parser")
    result_table = soup.find("table", class_="RaceTable01")
    
    if not result_table:
        raise ValueError("レース結果テーブルが見つかりません。")

    rows = result_table.find_all("tr")[1:]  # ヘッダを除外
    race_data = []

    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 15:
            continue

        # --- 馬IDを取得 ---
        horse_tag = row.find("a", href=re.compile(r"/horse/(\d+)"))
        horse_id = re.search(r"/horse/(\d+)", horse_tag["href"]).group(1) if horse_tag else None

        # --- 騎手IDを取得 ---
        jockey_tag = row.find("a", href=re.compile(r"/jockey/result/recent/(\d+)/"))
        jockey_id = re.search(r"/jockey/result/recent/(\d+)/", jockey_tag["href"]).group(1) if jockey_tag else None

        # --- 調教師IDを取得 ---
        trainer_tag = row.find("a", href=re.compile(r"/trainer/result/recent/(\d+)/"))
        trainer_id = re.search(r"/trainer/result/recent/(\d+)/", trainer_tag["href"]).group(1) if trainer_tag else None

        race_data.append([
            cols[0].get_text(strip=True),  # 着順
            cols[1].get_text(strip=True),  # 枠番
            cols[2].get_text(strip=True),  # 馬番
            horse_id,                      # 馬ID
            cols[4].get_text(strip=True),  # 性齢
            cols[5].get_text(strip=True),  # 斤量
            jockey_id,                     # 騎手ID
            cols[7].get_text(strip=True),  # タイム
            cols[8].get_text(strip=True),  # 着差
            cols[9].get_text(strip=True),  # 人気
            cols[10].get_text(strip=True), # オッズ
            cols[11].get_text(strip=True), # 後3F
            cols[12].get_text(strip=True), # 通過
            trainer_id,                    # 厩舎ID
            cols[14].get_text(strip=True), # 馬体重
        ])

    df = pd.DataFrame(race_data, columns=[
        "rank", "wakuban", "umaban", "horse_id", "sex&age", "weight_carried",
        "jockey_id", "time", "margin", "popularity", "odds", "last3f", "passing",
        "trainer_id", "weight"])
    # df = pd.DataFrame(race_data, columns=["着順", "馬名", "性齢", "斤量", "騎手", "タイム", "人気", "オッズ", "後3F", "厩舎", "馬体重"])
    return df

In [18]:
# what: HTMLを解析して払い戻しテーブルをDataFrame化する関数
# for:  AIモデルの入力形式に合わせる
# in:   取得したhtml(.bin)
# out:  払い戻しテーブル(DataFrame)
from bs4 import BeautifulSoup
import pandas as pd

def parse_return_html(html_text):
    soup = BeautifulSoup(html_text, "html.parser")

    pay_tables = soup.find_all("table", class_="Payout_Detail_Table")
    pay_data = []

    for tbl in pay_tables:
        for row in tbl.find_all("tr"):
            bet_type = row.find("th").get_text(strip=True) if row.find("th") else None
            result = " / ".join(span.get_text(strip=True) for span in row.select("td.Result span") if span.get_text(strip=True))
            payout = " / ".join(span.get_text(strip=True) for span in row.select("td.Payout span") if span.get_text(strip=True))
            popularity = " / ".join(span.get_text(strip=True) for span in row.select("td.Ninki span") if span.get_text(strip=True))

            pay_data.append([bet_type, result, payout, popularity])

    pay_df = pd.DataFrame(pay_data, columns=["券種", "馬番", "払戻金", "人気"])
    return pay_df

In [None]:
# what: レース結果テーブルの前処理をする関数
# for:  AIモデルがうけつけられるようにする
# in:   レース結果テーブルの列(.pkl)
# out:  レース結果テーブルの列(.pkl)

def parse_sex_age(sexage):
    # 例: "牡4" -> ("牡", 4)
    # {牡:0, 牝:1, セ: 2, その他:np.nan}
    if pd.isna(sexage): return (np.nan, np.nan)

    # --- 性別(sex) ---
    sex_char = str(sexage[0])
    if sex_char == "牡":
        sex = 0
    elif sex_char == "牝":
        sex = 1
    elif sex_char == "セ":   # 騙馬（せん馬）
        sex = 2
    else:
        sex = np.nan

    # --- 年齢(age) ---
    try:
        age = int(sexage[1:])
    except:
        age = np.nan
    return sex, age
    
def time_to_seconds(tstr):
    # "1:51.3" -> seconds float
    try:
        if pd.isna(tstr): return np.nan
        if ":" in str(tstr):
            mm, ss = str(tstr).split(":")
            return int(mm) * 60 + float(ss)
        else:
            return float(tstr)
    except:
        return np.nan
    
def parse_margin(margin):
    # 着差の文字列を馬身(float)に変換する関数
    if pd.isna(margin):
        return np.nan
    s = str(margin).strip()
    if s == "":
        return np.nan
    # 正規化（全角スペースや「馬身」などの語を除去）
    s = s.replace("　", " ").replace("馬身", "").replace("馬", "").strip()

    # 特殊語のマッピング（単位は「馬身」）
    special = {"大差": 10.0, "着差": np.nan, "鼻": 0.05, "ハナ": 0.05, "アタマ": 0.1, "クビ": 0.25, "短": 0.05}
    if s in special:
        return special[s]

    # 「1 1/2」や「1-1/2」等の分数表現
    try:
        # 全体と分数（例: "1 1/2" -> whole=1, frac="1/2"）
        if " " in s and "/" in s:
            whole, frac = s.split()
            num, den = frac.split("/")
            return float(whole) + float(num) / float(den)
        if "-" in s and "/" in s:
            whole, frac = s.split("-")
            num, den = frac.split("/")
            return float(whole) + float(num) / float(den)
        if "/" in s and not any(c.isalpha() for c in s):
            num, den = s.split("/")
            return float(num) / float(den)
    except Exception:
        pass

    # 小数や整数に直接変換できればそのまま馬身として返す
    try:
        return float(s)
    except Exception:
        pass

    # その他は欠損とする
    return np.nan

def parse_passing(passing):
    # "3-4-4-3" -> [3,4,4,3]
    passing_str = str(passing)
    if not isinstance(passing_str, str) or passing_str.strip() == "":
        return [np.nan, np.nan, np.nan, np.nan]

    parts = passing_str.split("-")

    result = []
    for i in range(4):
        if i < len(parts) and parts[i].isdigit():
            result.append(int(parts[i]))
        else:
            result.append(np.nan)
    return result

def parse_bodyweight(bw):
    # "494(-4)" -> weight=494, diff=-4
    try:
        s = str(bw)
        if "(" in s:
            w = int(s.split("(")[0])
            diff = int(s.split("(")[1].rstrip(")"))
        else:
            w = int(s)
            diff = np.nan
        return w, diff
    except:
        return (np.nan, np.nan)

## 実行関数：HTMLの取得とレース結果・払い戻しテーブルの作成

In [None]:
# 入力パラメータ

# 実行環境(NotePC/Desktop)の選択
# reader = PathReader("../file_path_NotePC.json") # NotePC用
reader = PathReader("../file_path_Desktop.json") # Desktop用

race_id_list_name = "race_id_list_2301_2404.csv" # スクレイピングするrace_id_list


In [None]:
# データフォルダのインスタンス実体化
data_folder = reader.get_path("data_folder")

# race_idが保存されているCSVファイルのパス
race_id_list_path = os.path.join(data_folder, race_id_list_name)

### HTMLの取得

In [None]:
# CSVの読み込み
df = pd.read_csv(race_id_list_path)
race_ids = df["race_id"].astype(str).tolist()
get_html = GetHTML()

for race_id in tqdm(race_ids, total=len(race_ids)):
    url = f"https://race.netkeiba.com/race/result.html?race_id={race_id}" # 取得先URL
    save_path = os.path.join(data_folder, "race_result_html", f"{race_id}.bin") # ファイル保存パス
    get_html.get_and_save(url, save_path) # HTMLの取得と保存

100%|██████████| 4656/4656 [00:00<00:00, 75108.86it/s]

Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bin file
Exsisting bi




### レース結果テーブルの作成
pickleファイルを展開
→race_idを取得
→race_idが重複する場合はスキップ

In [None]:
# what: 各レースのbinファイルからレース結果を抽出し、1つのテーブルに結合しpickleで保存する関数
# for:  特徴量の抽出用
# in:   取得したrace_id_list(.csv)とhtml(.bin)
# out:  結合されたresult_table(.pickle)

# 既存のrace_result_tableに追加する場合はTrue、新規追加はFalse
add_table_bool = False

df = pd.read_csv(race_id_list_path, dtype={"race_id": str, "date": str})
race_to_date = dict(zip(df["race_id"], df["date"])) # race_id -> data の辞書を作成

result_table_path = os.path.join(data_folder, "race_result_table.pkl")
if add_table_bool:
    # 既存pickleのrace_idを確認
    existing_df = pd.read_pickle(result_table_path)
    existing_ids = set(existing_df["race_id"].astype(str))
else:
    existing_df = pd.DataFrame()
    existing_ids = set()

# 新しく解析するrace_idだけを抽出
target_ids = [str(rid) for rid in df["race_id"] if str(rid) not in existing_ids]

new_dfs = []
for race_id in tqdm(target_ids, total=len(target_ids)):
    bin_path = os.path.join(data_folder, "race_result_html", f"{race_id}.bin")
    if not os.path.exists(bin_path):
        print(f"Missing bin file: {race_id}")
        continue
    try:
        # --- HTML解析 ---
        df_race = parse_race_html(bin_path)
        df_race.insert(0, "race_id", race_id)
        df_race.insert(1, "date", race_to_date[race_id]) # race_idと対応する日付を挿入
        new_dfs.append(df_race)
    except Exception as e:
        print(f"Error fetching {race_id}: {e}")
new_result_df = pd.concat(new_dfs, ignore_index=True)

# 追加するテーブルに前処理をしておく
new_result_df["rank"] = pd.to_numeric(new_result_df["rank"], errors="coerce", downcast='float')
new_result_df.insert(3, "is_win", (new_result_df["rank"] == 1).astype(float)) # ターゲットエンコーディング(1着のみ1)
new_result_df.insert(4, "is_place", (new_result_df["rank"] <= 3).astype(float)) # ターゲットエンコーディング(3着以内)
new_result_df["wakuban"] = pd.to_numeric(new_result_df["wakuban"], errors="coerce", downcast='float')
new_result_df["umaban"] = pd.to_numeric(new_result_df["umaban"], errors="coerce", downcast='float')
new_result_df[["sex","age"]] = new_result_df["sex&age"].apply(lambda x: pd.Series(parse_sex_age(x)))
new_result_df["sex"] = pd.to_numeric(new_result_df["sex"], errors="coerce", downcast='float')
new_result_df["age"] = pd.to_numeric(new_result_df["age"], errors="coerce", downcast='float')
new_result_df["weight_carried"] = pd.to_numeric(new_result_df["weight_carried"], errors="coerce", downcast='float')
new_result_df["time_sec"] = new_result_df["time"].apply(time_to_seconds)
new_result_df["margin"] = new_result_df["margin"].apply(lambda x: pd.Series(parse_margin(x)))
new_result_df["popularity"] = pd.to_numeric(new_result_df["popularity"], errors="coerce", downcast='float')
new_result_df["odds"] = pd.to_numeric(new_result_df["odds"], errors="coerce", downcast='float')
new_result_df["last3f"] = pd.to_numeric(new_result_df["last3f"], errors="coerce", downcast='float')
new_result_df[["passing1", "passing2", "passing3", "passing4"]] = new_result_df["passing"].apply(lambda x: pd.Series(parse_passing(x)))
new_result_df[["weight_horse","weight_diff"]] = new_result_df["weight"].apply(lambda x: pd.Series(parse_bodyweight(x)))
new_result_df = new_result_df.drop(columns=["sex&age", "time", "passing","weight"])

assert isinstance(existing_df, pd.DataFrame)
assert isinstance(new_result_df, pd.DataFrame)

result_df = pd.concat([existing_df, new_result_df], ignore_index=True)
print(f"✅ 新規{len(new_result_df)}件を追加しました（合計 {len(result_df)} 件）")
# print(result_df)
result_table = result_df.to_pickle(result_table_path)

100%|██████████| 4656/4656 [19:21<00:00,  4.01it/s]


✅ 新規64329件を追加しました（合計 64329 件）


NameError: name 'result_table_path' is not defined

In [10]:
result_table_path = os.path.join(data_folder, "race_result_table.pkl")
result_table = result_df.to_pickle(result_table_path)

In [11]:
result_df = pd.read_pickle(result_table_path)
result_df

Unnamed: 0,race_id,date,rank,is_win,is_place,wakuban,umaban,horse_id,weight_carried,jockey_id,margin,popularity,odds,last3f,trainer_id,sex,age,time_sec,passing1,passing2,passing3,passing4,weight_horse,weight_diff
0,202307010107,20230105,1.0,1.0,1.0,4.0,7.0,2018100377,52.0,01190,,14.0,89.900002,37.200001,01075,1.0,5.0,72.4,5.0,4.0,,,460.0,-4.0
1,202307010107,20230105,2.0,0.0,1.0,7.0,14.0,2018102969,58.0,01174,0.05,2.0,4.800000,36.500000,01113,0.0,5.0,72.4,10.0,10.0,,,478.0,0.0
2,202307010107,20230105,3.0,0.0,1.0,8.0,15.0,2017106394,58.0,01163,1.00,8.0,18.200001,36.000000,01042,0.0,6.0,72.6,15.0,15.0,,,496.0,4.0
3,202307010107,20230105,4.0,0.0,0.0,1.0,2.0,2017101726,55.0,01195,0.75,10.0,25.500000,36.799999,01018,0.0,6.0,72.7,10.0,10.0,,,536.0,-6.0
4,202307010107,20230105,5.0,0.0,0.0,2.0,4.0,2018105151,58.0,01180,0.25,12.0,36.200001,36.400002,01185,0.0,5.0,72.8,14.0,14.0,,,498.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64324,202405020403,20240428,12.0,0.0,0.0,2.0,3.0,2021103111,57.0,01009,0.55,14.0,242.600006,39.700001,01096,0.0,3.0,101.4,7.0,7.0,,,492.0,6.0
64325,202405020403,20240428,13.0,0.0,0.0,8.0,16.0,2021103705,57.0,01096,0.10,15.0,367.600006,37.200001,01100,0.0,3.0,101.4,16.0,16.0,,,482.0,0.0
64326,202405020403,20240428,14.0,0.0,0.0,1.0,1.0,2021106574,57.0,01075,0.50,3.0,4.800000,40.799999,01020,0.0,3.0,101.5,1.0,1.0,,,476.0,0.0
64327,202405020403,20240428,15.0,0.0,0.0,6.0,11.0,2021100425,56.0,01184,4.00,9.0,59.500000,40.500000,01171,0.0,3.0,102.2,7.0,7.0,,,484.0,6.0


In [12]:
# 各列の型の確認
type_counts = {
    col: result_df[col].map(lambda x: str(type(x))).value_counts(sort=False)
    for col in result_df.columns
}

pd.DataFrame(type_counts)

Unnamed: 0,race_id,date,rank,is_win,is_place,wakuban,umaban,horse_id,weight_carried,jockey_id,margin,popularity,odds,last3f,trainer_id,sex,age,time_sec,passing1,passing2,passing3,passing4,weight_horse,weight_diff
<class 'NoneType'>,,,,,,,,,,,,,,,1.0,,,,,,,,,
<class 'float'>,,,64329.0,64329.0,64329.0,64329.0,64329.0,,64329.0,,64329.0,64329.0,64329.0,64329.0,,64329.0,64329.0,64329.0,64329.0,64329.0,64329.0,64329.0,64329.0,64329.0
<class 'str'>,64329.0,64329.0,,,,,,64329.0,,64329.0,,,,,64328.0,,,,,,,,,


### pickleファイルの確認

In [None]:
# ファイルパス
result_table_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\race_result_table.pkl"

# pickleファイルを読み込む
df = pd.read_pickle(result_table_path)
# df
# 下10件を表示
print(df.tail(30))


           date       race_id  is_win rank    horse_id  weight_carried  \
28393  20240428  202405020403       0   13  2021103705            57.0   
28394  20240428  202405020403       0   14  2021106574            57.0   
28395  20240428  202405020403       0   15  2021100425            56.0   
28396  20240428  202405020403       0   16  2021103492            57.0   
28397  20240428  202408030404       1    1  2021104427            55.0   
28398  20240428  202408030404       0    2  2021105002            57.0   
28399  20240428  202408030404       0    3  2021104924            57.0   
28400  20240428  202408030404       0    4  2021110022            57.0   
28401  20240428  202408030404       0    5  2021102701            54.0   
28402  20240428  202408030404       0    6  2021103570            57.0   
28403  20240428  202408030404       0    7  2021106937            57.0   
28404  20240428  202408030404       0    8  2021101771            54.0   
28405  20240428  202408030404       0 

<p>回収率シミュレーションのときにparse_return_html関数は使う</p>

In [None]:
with open(r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\result.bin", "rb") as f:
    html_text = f.read().decode("EUC-JP", errors="ignore")

# new_result_df_result = parse_race_html(html_text)
# print(df_result)

# print("----------------------------------------------------------------")
df_pay = parse_return_html(html_text)
print(df_pay)

    着順         馬名  性齢    斤量   騎手     タイム  人気    オッズ   後3F     厩舎      馬体重
0    1  ホウオウルーレット  牡4  58.0  岩田康  1:51.3   5    7.2  35.9   美浦栗田  494(-4)
1    2  キュールエフウジン  牡4  58.0  藤岡佑  1:51.4   8   22.2  36.3   栗東中尾  494(-6)
2    3  セイクリッドゲイズ  セ5  58.0  岩田望  1:51.5   4    7.0  36.8  栗東佐々木   494(0)
3    4  プリモスペランツァ  牡4  58.0  鮫島駿  1:51.6  11   33.9  37.1   栗東中竹   494(0)
4    5   マルブツプライド  牡4  58.0   川須  1:51.6  10   30.1  36.8   栗東加用  532(+8)
5    6     カズプレスト  牡4  58.0   亀田  1:51.7   6   10.4  37.6  栗東高柳大  522(-6)
6    7   ホウオウフウジン  牡4  58.0    幸  1:52.0   9   25.3  38.0   栗東矢作  524(-6)
7    8      クロニクル  牡4  58.0  吉田隼  1:52.1   7   13.1  37.8  栗東田中克  522(+6)
8    9   ルイナールカズマ  牡4  58.0  藤岡康  1:52.1  12  124.1  36.5  栗東奥村豊  496(-6)
9   10   ラインオブソウル  牡4  58.0   松若  1:52.3   3    6.9  37.5   栗東音無  520(+4)
10  11   タガノエスコート  牡4  58.0  和田竜  1:52.7   1    3.1  38.2   栗東小林  500(-6)
11  12     ロコポルティ  牡5  58.0  Ｍデム  1:52.7   2    5.6  38.0  栗東西園正  526(-2)
--------------------------------------