In [1]:
import requests
import pandas as pd
import time
import re
import csv
import os
from bs4 import BeautifulSoup
import random
import pickle
from tqdm import tqdm
import numpy as np
import os
pd.set_option("display.max_columns", None) # 表示する列数の制限を解除

import sys
sys.path.append("..") # 親ディレクトリを追加
from module.random_agent import RandomUserAgent
from module.path_reader import PathReader

In [2]:
# 入力パラメータ

# 実行環境(NotePC/Desktop)の選択
# reader = PathReader("../file_path_NotePC.json") # NotePC用
reader = PathReader("../file_path_Desktop.json") # Desktop用

# スクレイピングするrace_id_liseの選択
race_id_list_name = "race_id_list_test.csv"

In [3]:
# 入力データと出力データの絶対パス生成

# データフォルダのインスタンス実体化
data_folder = reader.get_path("data_folder")
# race_idが保存されているCSVファイルのパス
race_id_list_path = os.path.join(reader.get_path("data_folder"), race_id_list_name)
# 保存フォルダのパス
save_dir = os.path.join(reader.get_path("data_folder"), "race_result_html")

In [4]:
# what: HTMLを解析してレース結果テーブルをDataFrame化する関数
# for:  AIモデルの入力形式に合わせる
# in:   取得したhtml(.bin)
# out:  レース結果テーブル(DataFrame)
def parse_race_html(bin_path):
    with open(bin_path, "rb") as f:
            html_text = f.read().decode("EUC-JP", errors="ignore")
    soup = BeautifulSoup(html_text, "html.parser")
    result_table = soup.find("table", class_="RaceTable01")
    
    if not result_table:
        raise ValueError("レース結果テーブルが見つかりません。")

    rows = result_table.find_all("tr")[1:]  # ヘッダを除外
    race_data = []

    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 15:
            continue

        # --- 馬IDを取得 ---
        horse_tag = row.find("a", href=re.compile(r"/horse/(\d+)"))
        horse_id = re.search(r"/horse/(\d+)", horse_tag["href"]).group(1) if horse_tag else None

        # --- 騎手IDを取得 ---
        jockey_tag = row.find("a", href=re.compile(r"/jockey/result/recent/(\d+)/"))
        jockey_id = re.search(r"/jockey/result/recent/(\d+)/", jockey_tag["href"]).group(1) if jockey_tag else None

        # --- 調教師IDを取得 ---
        trainer_tag = row.find("a", href=re.compile(r"/trainer/result/recent/(\d+)/"))
        trainer_id = re.search(r"/trainer/result/recent/(\d+)/", trainer_tag["href"]).group(1) if trainer_tag else None

        race_data.append([
            cols[0].get_text(strip=True),  # 着順
            cols[1].get_text(strip=True),  # 枠番
            cols[2].get_text(strip=True),  # 馬番
            horse_id,                      # 馬ID
            cols[4].get_text(strip=True),  # 性齢
            cols[5].get_text(strip=True),  # 斤量
            jockey_id,                     # 騎手ID
            cols[7].get_text(strip=True),  # タイム
            cols[8].get_text(strip=True),  # 着差
            cols[9].get_text(strip=True),  # 人気
            cols[10].get_text(strip=True), # オッズ
            cols[11].get_text(strip=True), # 後3F
            cols[12].get_text(strip=True), # 通過
            trainer_id,                    # 厩舎ID
            cols[14].get_text(strip=True), # 馬体重
        ])

    df = pd.DataFrame(race_data, columns=[
        "rank", "wakuban", "umaban", "horse_id", "sex&age", "weight_carried",
        "jockey_id", "time", "margin", "popularity", "odds", "last3f", "passing",
        "trainer_id", "weight"])
    # df = pd.DataFrame(race_data, columns=["着順", "馬名", "性齢", "斤量", "騎手", "タイム", "人気", "オッズ", "後3F", "厩舎", "馬体重"])
    return df

In [8]:
# what: レース結果テーブルの前処理をする関数
# for:  AIモデルがうけつけられるようにする
# in:   レース結果テーブルの列(.pkl)
# out:  レース結果テーブルの列(.pkl)

def parse_sex_age(sexage):
    # 例: "牡4" -> ("牡", 4)
    # {牡:0, 牝:1, セ: 2, その他:np.nan}
    if pd.isna(sexage): return (np.nan, np.nan)

    # --- 性別(sex) ---
    sex_char = str(sexage[0])
    if sex_char == "牡":
        sex = 0
    elif sex_char == "牝":
        sex = 1
    elif sex_char == "セ":   # 騙馬（せん馬）
        sex = 2
    else:
        sex = np.nan

    # --- 年齢(age) ---
    try:
        age = int(sexage[1:])
    except:
        age = np.nan
    return sex, age
    
def time_to_seconds(tstr):
    # "1:51.3" -> seconds float
    try:
        if pd.isna(tstr): return np.nan
        if ":" in str(tstr):
            mm, ss = str(tstr).split(":")
            return int(mm) * 60 + float(ss)
        else:
            return float(tstr)
    except:
        return np.nan
    
def parse_margin(margin):
    # 着差の文字列を馬身(float)に変換する関数
    if pd.isna(margin):
        return np.nan
    s = str(margin).strip()
    if s == "":
        return np.nan
    # 正規化（全角スペースや「馬身」などの語を除去）
    s = s.replace("　", " ").replace("馬身", "").replace("馬", "").strip()

    # 特殊語のマッピング（単位は「馬身」）
    special = {
        "大差": 10.0,
        "着差": np.nan,
        "鼻": 0.05, "ハナ": 0.05,
        "アタマ": 0.1,
        "クビ": 0.25,
        "短": 0.05
    }
    if s in special:
        return special[s]

    # 「1 1/2」や「1-1/2」等の分数表現
    try:
        # 全体と分数（例: "1 1/2" -> whole=1, frac="1/2"）
        if " " in s and "/" in s:
            whole, frac = s.split()
            num, den = frac.split("/")
            return float(whole) + float(num) / float(den)
        if "-" in s and "/" in s:
            whole, frac = s.split("-")
            num, den = frac.split("/")
            return float(whole) + float(num) / float(den)
        if "/" in s and not any(c.isalpha() for c in s):
            num, den = s.split("/")
            return float(num) / float(den)
    except Exception:
        pass

    # 小数や整数に直接変換できればそのまま馬身として返す
    try:
        return float(s)
    except Exception:
        pass

    # その他は欠損とする
    return np.nan

def parse_passing(passing_str):
    if not isinstance(passing_str, str) or passing_str.strip() == "":
        return [np.nan, np.nan, np.nan, np.nan]

    parts = passing_str.split("-")

    result = []
    for i in range(4):
        if i < len(parts) and parts[i].isdigit():
            result.append(int(parts[i]))
        else:
            result.append(np.nan)
    return result

def parse_bodyweight(bw):
    # "494(-4)" -> weight=494, diff=-4
    try:
        s = str(bw)
        if "(" in s:
            w = float(s.split("(")[0])
            diff = float(s.split("(")[1].rstrip(")"))
        else:
            w = float(s)
            diff = np.nan
        return w, diff
    except:
        return (np.nan, np.nan)

In [9]:
# what: 各レースのbinファイルからレース結果を抽出し、1つのテーブルに結合しpickleで保存する関数
# for:  特徴量の抽出用
# in:   取得したrace_id_list(.csv)とhtml(.bin)
# out:  結合されたresult_table(.pickle)

# 既存のrace_result_tableに追加する場合はTrue、新規追加はFalse
add_table_bool = False

bin_dir = os.path.join(data_folder, "race_result_html")
df = pd.read_csv(race_id_list_path, dtype={"race_id": str, "date": str})
race_to_date = dict(zip(df["race_id"], df["date"])) # race_id -> data の辞書を作成

if add_table_bool:
    result_table_path = os.path.join(data_folder, "race_result_table.pkl")
    # 既存pickleのrace_idを確認
    existing_df = pd.read_pickle(result_table_path)
    existing_ids = set(existing_df["race_id"].astype(str))
else:
    existing_df = pd.DataFrame()
    existing_ids = set()

# 新しく解析するrace_idだけを抽出
target_ids = [str(rid) for rid in df["race_id"] if str(rid) not in existing_ids]

new_dfs = []
for race_id in tqdm(target_ids, total=len(target_ids)):
    bin_path = os.path.join(bin_dir, f"{race_id}.bin")
    if not os.path.exists(bin_path):
        print(f"Missing bin file: {race_id}")
        continue
    try:
        # --- HTML解析 ---
        df_race = parse_race_html(bin_path)
        df_race.insert(0, "race_id", race_id)
        df_race.insert(1, "date", race_to_date[race_id]) # race_idと対応する日付を挿入
        new_dfs.append(df_race)
    except Exception as e:
        print(f"Error fetching {race_id}: {e}")
new_result_df = pd.concat(new_dfs, ignore_index=True)

# 追加するテーブルに前処理をしておく
new_result_df["rank"] = pd.to_numeric(new_result_df["rank"], errors="coerce", downcast='float')
new_result_df.insert(3, "is_win", (new_result_df["rank"] == 1).astype(float)) # ターゲットエンコーディング(1着のみ1)
new_result_df.insert(4, "is_place", (new_result_df["rank"] <= 3).astype(float)) # ターゲットエンコーディング(3着以内)
new_result_df["wakuban"] = pd.to_numeric(new_result_df["wakuban"], errors="coerce", downcast='float')
new_result_df["umaban"] = pd.to_numeric(new_result_df["umaban"], errors="coerce", downcast='float')
new_result_df[["sex","age"]] = new_result_df["sex&age"].apply(lambda x: pd.Series(parse_sex_age(x)))
new_result_df["sex"] = pd.to_numeric(new_result_df["sex"], errors="coerce", downcast='float')
new_result_df["age"] = pd.to_numeric(new_result_df["age"], errors="coerce", downcast='float')
new_result_df["weight_carried"] = pd.to_numeric(new_result_df["weight_carried"], errors="coerce", downcast='float')
new_result_df["time_sec"] = new_result_df["time"].apply(time_to_seconds)
new_result_df["margin"] = new_result_df["margin"].apply(lambda x: pd.Series(parse_margin(x)))
new_result_df["popularity"] = pd.to_numeric(new_result_df["popularity"], errors="coerce", downcast='float')
new_result_df["odds"] = pd.to_numeric(new_result_df["odds"], errors="coerce", downcast='float')
new_result_df["last3f"] = pd.to_numeric(new_result_df["last3f"], errors="coerce", downcast='float')
new_result_df[["passing1", "passing2", "passing3", "passing4"]] = new_result_df["passing"].apply(lambda x: pd.Series(parse_passing(x)))
new_result_df[["weight_horse","weight_diff"]] = new_result_df["weight"].apply(lambda x: pd.Series(parse_bodyweight(x)))
new_result_df = new_result_df.drop(columns=["sex&age", "time", "passing","weight"])

assert isinstance(existing_df, pd.DataFrame)
assert isinstance(new_result_df, pd.DataFrame)

result_df = pd.concat([existing_df, new_result_df], ignore_index=True)
print(f"✅ 新規{len(new_result_df)}件を追加しました（合計 {len(result_df)} 件）")
# print(result_df)
# result_table = result_df.to_pickle(result_table_path)

100%|██████████| 3/3 [00:00<00:00, 29.03it/s]

✅ 新規42件を追加しました（合計 42 件）





In [10]:
# 各列の型の確認
type_counts = {
    col: result_df[col].map(lambda x: str(type(x))).value_counts(sort=False)
    for col in result_df.columns
}

pd.DataFrame(type_counts)

Unnamed: 0,race_id,date,rank,is_win,is_place,wakuban,umaban,horse_id,weight_carried,jockey_id,margin,popularity,odds,last3f,trainer_id,sex,age,time_sec,passing1,passing2,passing3,passing4,weight_horse,weight_diff
<class 'float'>,,,42.0,42.0,42.0,42.0,42.0,,42.0,,42.0,42.0,42.0,42.0,,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0
<class 'str'>,42.0,42.0,,,,,,42.0,,42.0,,,,,42.0,,,,,,,,,


# Archive

In [None]:
def parse_margin_to_winner(df):
    margins = df["margin"].to_numpy() # "margin"列を抜き出してnumpy arrayとして保存
    ranks = df["rank"].to_numpy()
    
    result = np.zeros(len(df)) # 結果（総和）を格納するための配列を作成（0埋め）

    cumulative = 0 # 初期値
    nan_flag = False # 初期値（2着以降でNaNが発生した場合、このflagがTrueになる）

    for i, (rank, m) in enumerate(zip(ranks, margins)): # iはiterable, marginとrankを一対一で抜き出す
        if rank == 1: # 一着の場合marginを0とする処理
            result[i] = 0
            m = 0
            print(f"Number: {i}_rank 1 detected")
            continue

        if nan_flag: # flagがTrueになっていた場合NaNを返す処理
            result[i] = np.nan
            print(f"Number: {i}_NaN detected")
            continue

        if np.isnan(m): # marginが欠損値の場合にNaNを返す処理
            nan_flag = True
            result[i] = np.nan
            print(f"Number: {i}_margin NaN detected")
            continue

        cumulative += m # 総和
        result[i] = cumulative # 総和を格納

    df["margin_to_winner"] = result # 配列を丸ごと追加
    return df

In [None]:
bin_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_html\2018104746.bin"
with open(bin_path, "rb") as f:
        html_text = f.read().decode("EUC-JP", errors="ignore")
soup = BeautifulSoup(html_text, "html.parser")

# --- レース成績表の抽出 ---
# result_table = soup.find("table", class_="db_h_race_results")
result_table = soup.find("table", class_="db_h_race_results")
print(result_table)

<table cellpadding="0" cellspacing="1" class="db_h_race_results nk_tb_common" summary="セイクリッドゲイズの競走戦績">
<thead>
<tr align="center">
<th>日付</th>
<th>開催</th>
<th>天<br/>気</th>
<th>R</th>
<th>レース名</th>
<th>映<br/>像<br/><img alt="" class="png_img" src="https://cdn.netkeiba.com/img.db/style/netkeiba.ja/image/icon_premium_01.png"/></th>
<th>頭<br/>数</th>
<th>枠<br/>番</th>
<th>馬<br/>番</th>
<th>オ<br/>ッ<br/>ズ</th>
<th>人<br/>気</th>
<th>着<br/>順</th>
<th>騎手</th>
<th>斤<br/>量</th>
<th>距離</th>
<th class="disp_none">水分量</th>
<th class="">馬<br/>場</th>
<th class="">馬場<br/>指数<br/><img alt="" class="png_img" src="https://cdn.netkeiba.com/img.db/style/netkeiba.ja/image/icon_premium_01.png"/></th>
<th>タイム</th>
<th>着差</th>
<th class="">ﾀｲﾑ<br/>指数<br/><img alt="" class="png_img" src="https://cdn.netkeiba.com/img.db/style/netkeiba.ja/image/icon_premium_01.png"/></th>
<th class="">通過</th>
<th class="">ペース</th>
<th class="">上り</th>
<th>馬体重</th>
<th class="">厩舎<br/>ｺﾒﾝﾄ<br/><img alt="" class="png_img" src="https://cd

In [5]:
rows = result_table.find_all("tr")[1:]  # ヘッダを除外して行ごとにデータを取得
horse_data = []
for row in rows:
    cols = row.find_all("td")
    for i, c in enumerate(cols):
        print(i, c.get_text(strip=True))


0 2024/03/23
1 1中京5
2 小雨
3 11
4 伊勢S(3勝クラス)
5 
6 16
7 4
8 8
9 9.8
10 5
11 4
12 吉田隼人
13 56
14 ダ1900
15 
16 重
17 **
18 1:56.5
19 0.9
20 **
21 11-11-11-10
22 29.1-36.4
23 36.5
24 490(-4)
25 
26 
27 ロコポルティ
28 280.0
0 2024/01/14
1 1京都5
2 晴
3 10
4 雅S(3勝クラス)
5 
6 13
7 8
8 12
9 85.3
10 8
11 6
12 角田大河
13 58
14 ダ1800
15 
16 良
17 **
18 1:52.7
19 0.9
20 **
21 10-11-12-10
22 37.1-36.1
23 35.9
24 494(0)
25 
26 
27 ヤマニンウルス
28 
0 2023/12/23
1 5阪神7
2 晴
3 10
4 摩耶S(3勝クラス)
5 
6 16
7 7
8 14
9 13.2
10 4
11 10
12 岩田望来
13 56
14 ダ1800
15 
16 良
17 **
18 1:53.4
19 1.2
20 **
21 12-13-11-10
22 36.9-37.9
23 37.8
24 494(-2)
25 
26 
27 ラインオブソウル
28 
0 2023/10/28
1 2京都8
2 晴
3 10
4 御陵S(3勝クラス)
5 
6 15
7 7
8 14
9 3.0
10 1
11 9
12 川田将雅
13 58
14 ダ1800
15 
16 良
17 **
18 1:52.7
19 1.1
20 **
21 6-5-6-3
22 36.0-38.0
23 38.6
24 496(+2)
25 
26 
27 ミッキーヌチバナ
28 
0 2023/10/01
1 4阪神9
2 曇
3 10
4 堺S(3勝クラス)
5 
6 12
7 8
8 12
9 7.0
10 4
11 3
12 岩田望来
13 58
14 ダ1800
15 
16 重
17 **
18 1:51.5
19 0.2
20 **
21 9-9-7-7
22 36.4-37.3
23 36.8
24 494

In [56]:
rows = result_table.find_all("tr")[1:]  # ヘッダを除外して行ごとにデータを取得
horse_data = []
for row in rows:
    cols = row.find_all("td")
    # --- 日付を取得 ---
    date_tag = row.find("a", href=re.compile(r"/race/list/(\d+)"))
    race_date = re.search(r"/race/list/(\d+)", date_tag["href"]).group(1) if date_tag else None

    # --- race idを取得 ---
    race_id_tag = row.find("a", href=re.compile(r"/race/(\d+)"))
    race_id = re.search(r"/race/(\d+)", race_id_tag["href"]).group(1) if race_id_tag else None

    # --- jockey idを取得 ---
    jockey_id_tag = row.find("a", href=re.compile(r"/jockey/result/recent/(\d+)"))
    jockey_id = re.search(r"/jockey/result/recent/(\d+)", jockey_id_tag["href"]).group(1) if jockey_id_tag else None

    horse_data.append([
        race_date,                     # レース日付
        cols[1].get_text(strip=True),  # 開催
        cols[2].get_text(strip=True),  # 天気
        cols[3].get_text(strip=True),  # R
        race_id,                       # race_id
        cols[6].get_text(strip=True),  # 頭数
        cols[7].get_text(strip=True),  # 枠番
        cols[8].get_text(strip=True),  # 馬番
        cols[9].get_text(strip=True),  # オッズ
        cols[10].get_text(strip=True), # 人気
        cols[11].get_text(strip=True), # 着順
        jockey_id,                     # 騎手
        cols[13].get_text(strip=True), # 斤量
        cols[14].get_text(strip=True), # 距離
        cols[16].get_text(strip=True), # 馬場
        cols[18].get_text(strip=True), # タイム
        cols[19].get_text(strip=True), # 着差
        cols[21].get_text(strip=True), # 通過
        cols[22].get_text(strip=True), # ペース
        cols[23].get_text(strip=True), # 上り
        cols[24].get_text(strip=True), # 馬体重
        # cols[26].get_text(strip=True), # 勝ち馬
        cols[28].get_text(strip=True), # 賞金
    ])
horse_df = pd.DataFrame(horse_data, columns=[
    "race_date", "place", "weather", "race_no", "race_id", "num_of_horses", "frame_no", "horse_no",
    "odds", "popularity", "finish_position", "jockey_id", "weight", "distance", "course_condition",
    "time", "margin", "passing", "pace", "final_3f", "horse_weight", "prize_money"
])
horse_df

Unnamed: 0,race_date,place,weather,race_no,race_id,num_of_horses,frame_no,horse_no,odds,popularity,...,weight,distance,course_condition,time,margin,passing,pace,final_3f,horse_weight,prize_money
0,20240323,1中京5,小雨,11,202407010511,16,4,8,9.8,5,...,56,ダ1900,重,1:56.5,0.9,11-11-11-10,29.1-36.4,36.5,490(-4),280.0
1,20240114,1京都5,晴,10,202408010510,13,8,12,85.3,8,...,58,ダ1800,良,1:52.7,0.9,10-11-12-10,37.1-36.1,35.9,494(0),
2,20231223,5阪神7,晴,10,202309050710,16,7,14,13.2,4,...,56,ダ1800,良,1:53.4,1.2,12-13-11-10,36.9-37.9,37.8,494(-2),
3,20231028,2京都8,晴,10,202308020810,15,7,14,3.0,1,...,58,ダ1800,良,1:52.7,1.1,6-5-6-3,36.0-38.0,38.6,496(+2),
4,20231001,4阪神9,曇,10,202309040910,12,8,12,7.0,4,...,58,ダ1800,重,1:51.5,0.2,9-9-7-7,36.4-37.3,36.8,494(0),463.6
5,20230708,3中京3,曇,11,202307030311,16,6,12,18.2,7,...,56,ダ1800,良,1:53.2,0.1,11-10-10-10,38.1-36.5,36.0,494(-2),749.6
6,20230513,1京都7,小雨,10,202308010710,16,1,1,7.6,4,...,58,ダ1800,良,1:53.2,0.7,12-10-12-7,36.6-38.5,37.9,496(+2),
7,20230325,2中京5,小雨,11,202307020511,16,1,2,6.3,4,...,56,ダ1900,不,1:57.8,0.4,6-7-7-7,30.4-35.9,35.9,494(0),280.0
8,20230212,1阪神2,晴,10,202309010210,13,8,12,9.9,5,...,56,ダ1800,良,1:53.1,0.3,7-6-5-4,37.2-37.4,37.1,494(-6),748.0
9,20230109,1中京4,晴,10,202307010410,16,6,12,40.4,8,...,58,ダ1800,良,1:53.2,0.3,11-13-12-12,36.9-37.7,36.7,500(-2),280.0


In [57]:
print(horse_df[:1])

  race_date place weather race_no       race_id num_of_horses frame_no  \
0  20240323  1中京5      小雨      11  202407010511            16        4   

  horse_no odds popularity  ... weight distance course_condition    time  \
0        8  9.8          5  ...     56    ダ1900                重  1:56.5   

  margin      passing       pace final_3f horse_weight prize_money  
0    0.9  11-11-11-10  29.1-36.4     36.5      490(-4)       280.0  

[1 rows x 22 columns]


In [58]:
# what: 馬結果テーブルの前処理をする関数
# for:  AIモデルがうけつけられるようにする
# in:   レース結果テーブルの列(.pkl)
# out:  レース結果テーブルの列(.pkl)

def encoding_weather(weather):
    # {晴:0, 曇:1, 雨:2, 小雨:3, 雪:4, その他:np.nan}
    if pd.isna(weather): 
        return np.nan
    weather_char = str(weather)
    if weather_char == "晴":
        return 0
    elif weather_char == "曇":
        return 1
    elif weather_char == "小雨":
        return 2
    elif weather_char == "雨":
        return 3
    elif weather_char == "雪":
        return 4
    else:
        return np.nan

def parse_encoding_distance(distance):
    # 例: "芝1800" -> (0, 1800)
    # {芝:0, ダ:1, 障: 2, その他:np.nan}
    if pd.isna(distance): 
        return (np.nan, np.nan)
    
    # --- 馬場(state) ---
    state_char = str(distance[0])
    if state_char == "芝":
        state = 0
    elif state_char == "ダ":
        state = 1
    elif state_char == "障":
        state = 2
    else:
        state = np.nan
    
    # --- 距離(length) ---
    try:
        long = int(distance[1:])
    except:
        long = np.nan

    return (state, long)

def encoding_course_condition(condition):
    # {良:0, 稍重:1, 重:2, 不良:3, その他:np.nan}
    if pd.isna(condition): 
        return np.nan
    condition_char = str(condition)
    if condition_char == "良":
        return 0
    elif condition_char == "稍重":
        return 1
    elif condition_char == "重":
        return 2
    elif condition_char == "不良":
        return 3
    else:
        return np.nan

def time_to_seconds(tstr):
    # "1:51.3" -> seconds float
    try:
        if pd.isna(tstr): return np.nan
        if ":" in str(tstr):
            mm, ss = str(tstr).split(":")
            return int(mm) * 60 + float(ss)
        else:
            return float(tstr)
    except:
        return np.nan
    
def parse_margin(margin):
    # "1.1" -> 1.1 float
    # "クビ" -> 0.1 float
    # "ハナ" -> 0.05 float
    try:
        if pd.isna(margin): return np.nan
        s = str(margin)
        if s == "アタマ":
            return 0.2
        elif s == "クビ":
            return 0.1
        elif s == "ハナ":
            return 0.05
        else:
            return float(s)
    except:
        return np.nan

def parse_passing(passing):
    # "3-3-2-2" -> [3,3,2,2]
    try:
        if pd.isna(passing): return [np.nan, np.nan, np.nan, np.nan]
        parts = str(passing).split("-")
        return [int(p) for p in parts]
    except:
        return [np.nan, np.nan, np.nan, np.nan]

def parse_pace(pace):
    # "34.1-44.8" -> [34.4,44.8]
    try:
        if pd.isna(pace): return [np.nan, np.nan]
        parts = str(pace).split("-")
        return [float(p) for p in parts]
    except:
        return [np.nan, np.nan]

def parse_bodyweight(bw):
    # "494(-4)" -> weight=494, diff=-4
    try:
        s = str(bw)
        if "(" in s:
            w = int(s.split("(")[0])
            diff = int(s.split("(")[1].rstrip(")"))
        else:
            w = int(s)
            diff = np.nan
        return (w, diff)
    except:
        return (np.nan, np.nan)

In [59]:
horse_df.drop(columns=["place"], inplace=True)
horse_df["weather"] = horse_df["weather"].apply(encoding_weather)
horse_df[["course_state", "distance_length"]] = horse_df["distance"].apply(lambda x: pd.Series(parse_encoding_distance(x)))
horse_df.drop(columns=["distance"], inplace=True)
horse_df["course_condition"] = horse_df["course_condition"].apply(encoding_course_condition)
horse_df["time"] = horse_df["time"].apply(time_to_seconds)
horse_df["margin"] = horse_df["margin"].apply(parse_margin)
horse_df[["passing_1st", "passing_2nd", "passing_3rd", "passing_4th"]] = horse_df["passing"].apply(lambda x: pd.Series(parse_passing(x)))
horse_df.drop(columns=["passing"], inplace=True)
horse_df[["pace_1st", "pace_2nd"]] = horse_df["pace"].apply(lambda x: pd.Series(parse_pace(x)))
horse_df.drop(columns=["pace"], inplace=True)
horse_df[["body_weight","body_diff"]] = horse_df["horse_weight"].apply(lambda x: pd.Series(parse_bodyweight(x)))
horse_df.drop(columns=["horse_weight"], inplace=True)

In [63]:
2018104746
horse_df.insert(0, "horse_id", 2018104746)

horse_df   # 先頭5行だけ表示

ValueError: cannot insert horse_id, already exists

In [64]:
result_table_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_table.pkl"
result_table_path = horse_df.to_pickle(result_table_path)

In [46]:
print(horse_df[:1].to_string())

  race_date  weather race_no       race_id num_of_horses frame_no horse_no odds popularity finish_position jockey_id weight  course_condition   time  margin final_3f prize_money  course_state  distance_length  passing_1st  passing_2nd  passing_3rd  passing_4th  pace_1st  pace_2nd  body_weight  body_diff
0  20240323        2      11  202407010511            16        4        8  9.8          5               4     01095     56               2.0  116.5     0.9     36.5       280.0             1             1900         11.0         11.0         11.0         10.0      29.1      36.4          490         -4


In [65]:
import pickle
import pandas as pd

with open(r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_table.pkl", "rb") as f:
    df = pickle.load(f)
 
df   # 先頭5行だけ表示

Unnamed: 0,horse_id,race_date,weather,race_no,race_id,num_of_horses,frame_no,horse_no,odds,popularity,...,course_state,distance_length,passing_1st,passing_2nd,passing_3rd,passing_4th,pace_1st,pace_2nd,body_weight,body_diff
0,2018104746,20240323,2,11,202407010511,16,4,8,9.8,5,...,1,1900,11.0,11.0,11.0,10.0,29.1,36.4,490,-4
1,2018104746,20240114,0,10,202408010510,13,8,12,85.3,8,...,1,1800,10.0,11.0,12.0,10.0,37.1,36.1,494,0
2,2018104746,20231223,0,10,202309050710,16,7,14,13.2,4,...,1,1800,12.0,13.0,11.0,10.0,36.9,37.9,494,-2
3,2018104746,20231028,0,10,202308020810,15,7,14,3.0,1,...,1,1800,6.0,5.0,6.0,3.0,36.0,38.0,496,2
4,2018104746,20231001,1,10,202309040910,12,8,12,7.0,4,...,1,1800,9.0,9.0,7.0,7.0,36.4,37.3,494,0
5,2018104746,20230708,1,11,202307030311,16,6,12,18.2,7,...,1,1800,11.0,10.0,10.0,10.0,38.1,36.5,494,-2
6,2018104746,20230513,2,10,202308010710,16,1,1,7.6,4,...,1,1800,12.0,10.0,12.0,7.0,36.6,38.5,496,2
7,2018104746,20230325,2,11,202307020511,16,1,2,6.3,4,...,1,1900,6.0,7.0,7.0,7.0,30.4,35.9,494,0
8,2018104746,20230212,0,10,202309010210,13,8,12,9.9,5,...,1,1800,7.0,6.0,5.0,4.0,37.2,37.4,494,-6
9,2018104746,20230109,0,10,202307010410,16,6,12,40.4,8,...,1,1800,11.0,13.0,12.0,12.0,36.9,37.7,500,-2


In [49]:
# sex をエンコード
sex_map = {
    "牡": 0,
    "牝": 1,
    "セ": 2,  # 騙馬（せん馬）
}

df["sex"] = df["sex"].map(sex_map).astype("Int64")


In [51]:
print(df.tail(4)) 

           date       race_id  is_win rank    horse_id  weight_carried  \
28502  20240428  202408030411       0   15  2020103650            58.0   
28503  20240428  202408030411       0   16  2016104946            58.0   
28504  20240428  202408030411       0   中止  2019100630            58.0   
28505  20240428  202408030411       0   取消  2016104851            58.0   

      jockey_id  popularity   odds  last3f trainer_id  body_weight  sex  age  \
28502     05386         2.0    2.8    40.5      01103        464.0    0  4.0   
28503     05212         7.0   34.6    40.3      01071        462.0    0  8.0   
28504     01115        15.0  199.1     NaN      01058        486.0    0  5.0   
28505     01180         NaN    NaN     NaN      01151          NaN    2  8.0   

       time_sec  body_diff  
28502     199.8       -8.0  
28503     200.0        8.0  
28504       NaN        2.0  
28505       NaN        NaN  


In [66]:
import pandas as pd

# CSVを読み込み
df = pd.read_csv(r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_id_list.csv", header=None)

# 先頭に挿入したい行を作成（ここでは1列目に "test"）
new_row = pd.DataFrame([["horse_id"] + [None]*(df.shape[1]-1)])

# 上に追加してインデックスをリセット
df = pd.concat([new_row, df], ignore_index=True)

# 保存
df.to_csv(r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_id_list2.csv", index=False, header=False)


In [67]:
csv_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_id_list2.csv"
df = pd.read_csv(csv_path)

In [75]:
csv_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_id_list2.csv"
result_table_path = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_table.pkl"
bin_dir = r"C:\Users\yasak\Desktop\mykeibaAI_ver1p0\data\horse_result_html"
df = pd.read_csv(csv_path)
print(df)

# 既存pickleのhorse_idを確認
existing_df = pd.read_pickle(result_table_path)
print(existing_df)
existing_ids = set(existing_df["horse_id"].astype(str))
print(f"existing_ids: {existing_ids}")

# 新しく解析するhorse_idだけを抽出
target_ids = [str(rid) for rid in df["horse_id"] if str(rid) not in existing_ids]
print(f"target_ids: {target_ids}")

        horse_id
0     2019105746
1     2019102983
2     2018104746
3     2019100108
4     2019103518
...          ...
8709  2021102015
8710  2021106607
8711  2021100232
8712  2021102517
8713  2018100141

[8714 rows x 1 columns]
      horse_id race_date  weather race_no       race_id num_of_horses  \
0   2018104746  20240323        2      11  202407010511            16   
1   2018104746  20240114        0      10  202408010510            13   
2   2018104746  20231223        0      10  202309050710            16   
3   2018104746  20231028        0      10  202308020810            15   
4   2018104746  20231001        1      10  202309040910            12   
5   2018104746  20230708        1      11  202307030311            16   
6   2018104746  20230513        2      10  202308010710            16   
7   2018104746  20230325        2      11  202307020511            16   
8   2018104746  20230212        0      10  202309010210            13   
9   2018104746  20230109        0      10