In [1]:
import json
import glob
import os
import re
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
# Raw per-year race-result CSVs under src/raw. glob does not promise any
# particular ordering, so sort to process the files in a stable order.
csv_files = sorted(glob.glob(os.path.join("src", "raw", "*.csv")))
# Bare file names (no directory part) of the raw CSVs.
csv_lists = [os.path.basename(file) for file in csv_files]

# Output directory for the preprocessed CSVs; created if it does not exist.
PREPRO_DIR = Path(".", "src", "preprocessed")
PREPRO_DIR.mkdir(parents=True, exist_ok=True)

# Multipliers used to build the "補正配当_{min}-{max}" columns: each value in
# min_rev_list scales payouts below the low threshold, each value in
# max_rev_list scales payouts at or above the high threshold. Every
# (min, max) combination yields one column.
min_rev_list = [0.94, 0.945, 0.95]
max_rev_list = [1.65, 1.7, 1.75]

In [3]:
# Display the raw CSV paths that were discovered, as a quick sanity check.
csv_files

['src\\raw\\all_race_results_2021.csv',
 'src\\raw\\all_race_results_2022.csv',
 'src\\raw\\all_race_results_2023.csv',
 'src\\raw\\all_race_results_2024.csv',
 'src\\raw\\all_race_results_2025.csv']

In [4]:
# Build the preprocessed all_race_results CSVs (one output file per race year).

# Columns kept from the raw export; every other raw column is discarded.
raw_columns = [
    "日付(yyyy.mm.dd)", "場所", "レース名", "クラス名", "馬番", "馬名", "種牡馬", "年齢",
    "騎手", "距離", "馬場状態", "前距離", "複勝オッズ下限", "複勝オッズ上限", "複勝配当",
]

# A race whose name contains any of these keywords is labelled "障害".
keywords = ["障害", "JG"]
pattern = "|".join(keywords)

# Venues labelled "左回り"; every other venue is labelled "右回り".
left_handed_venues = ["東京", "中京", "新潟"]

# Payout thresholds for the adjusted-payout columns: payouts below the low
# threshold are scaled by a min_rev_list value, payouts at or above the high
# threshold by a max_rev_list value, and everything in between is unchanged.
low_payout_threshold = 200
high_payout_threshold = 1700

# The class-name mapping is the same for every year, so read it once instead
# of re-opening the JSON file on every loop iteration.
with open("race_class_mapping.json", "r", encoding="utf-8") as f:
    rc_mapping = json.load(f)

for cf in csv_lists:
    # NOTE(review): the saved cell output shows a warning pointing at this
    # read_csv call (presumably a mixed-type DtypeWarning - confirm, and pass
    # an explicit dtype= for the affected columns if so).
    df = pd.read_csv(f"./src/raw/{cf}", encoding="utf-8")

    # Keep only the needed columns and drop rows without a lower place-odds
    # value. Chaining + copy() (instead of inplace=True on a column slice)
    # gives an independent frame that is safe to assign into below.
    df = df[raw_columns].dropna(subset=["複勝オッズ下限"]).copy()

    # Course type: "障害" when the race name matches a keyword, otherwise the
    # first character of the raw distance string. na=False treats a missing
    # race name as "no match" so the boolean mask never contains NaN.
    is_jump = df["レース名"].str.contains(pattern, na=False)
    df.loc[is_jump, "コース区分"] = "障害"
    df.loc[~is_jump, "コース区分"] = df["距離"].str[:1]
    df["コース区分"] = df["コース区分"].replace("ダ", "ダート")
    # The remainder of the distance string (after the first character) is the
    # numeric distance.
    df["距離"] = df["距離"].str[1:].astype(int)

    # Distance band.
    # NOTE(review): distances strictly between 1600 and 1700, or between 2200
    # and 2300, match none of the three bands and stay NaN - confirm intended.
    df.loc[df["距離"] <= 1600, "距離区分"] = "短距離"
    df.loc[(df["距離"] >= 1700) & (df["距離"] <= 2200), "距離区分"] = "中距離"
    df.loc[df["距離"] >= 2300, "距離区分"] = "長距離"

    # Off-track flag: set only for turf/dirt rows whose going is not "良";
    # all other rows stay NaN.
    not_firm = df["馬場状態"] != "良"
    df.loc[not_firm & (df["コース区分"] == "芝"), "道悪判定"] = "芝道悪"
    df.loc[not_firm & (df["コース区分"] == "ダート"), "道悪判定"] = "ダ道悪"

    # Normalise the date text ('.' separators, space-padded parts) to a
    # '-'-separated, zero-padded form before parsing. regex=False makes both
    # replacements literal regardless of the installed pandas version's
    # default for `regex`.
    df = df.rename(columns={"日付(yyyy.mm.dd)": "日付"})
    df["日付"] = (
        df["日付"]
        .str.replace(".", "-", regex=False)
        .str.replace("- ", "-0", regex=False)
    )
    df["日付"] = pd.to_datetime(df["日付"])

    # A missing place payout is stored as 0.
    df["複勝配当"] = df["複勝配当"].fillna(0)
    df["複勝配当"] = df["複勝配当"].astype(float)

    # Distance change versus the previous distance; equal or missing previous
    # distance leaves the value NaN.
    distance_diff = df["距離"] - df["前距離"]
    df.loc[distance_diff > 0, "距離変遷"] = "距離延長"
    df.loc[distance_diff < 0, "距離変遷"] = "距離短縮"

    # Track direction by venue.
    is_left_handed = df["場所"].isin(left_handed_venues)
    df.loc[is_left_handed, "回り"] = "左回り"
    df.loc[~is_left_handed, "回り"] = "右回り"

    # Age bucket; ages below 2 (or missing) stay NaN.
    df.loc[df["年齢"] == 2, "馬齢"] = "2歳"
    df.loc[df["年齢"] == 3, "馬齢"] = "3歳"
    df.loc[df["年齢"] == 4, "馬齢"] = "4歳"
    df.loc[df["年齢"] >= 5, "馬齢"] = "5歳以上"

    # Class names absent from the mapping become NaN (Series.map semantics).
    df["クラス名"] = df["クラス名"].map(rc_mapping)

    # One adjusted-payout column per (min, max) multiplier combination.
    payout = df["複勝配当"]
    for mi in min_rev_list:
        for ma in max_rev_list:
            adjusted_col = f"補正配当_{mi}-{ma}"
            df[adjusted_col] = payout
            df.loc[payout < low_payout_threshold, adjusted_col] = payout * mi
            df.loc[payout >= high_payout_threshold, adjusted_col] = payout * ma
            df[adjusted_col] = df[adjusted_col].round(2)

    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(PREPRO_DIR / f"preprocessed_{cf}", index=False, encoding="utf-8-sig")

  df = pd.read_csv(f"./src/raw/{cf}", encoding="utf-8")


In [5]:
# Root directory for the per-jockey / per-sire result files.
PER_DATA_DIR = Path(".", "src", "per_data")
PER_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Preprocessed per-year CSVs. glob does not promise any particular ordering,
# so sort to process the years in a stable order.
pre_files = sorted(glob.glob(os.path.join(PREPRO_DIR, "*.csv")))
pre_lists = [os.path.basename(file) for file in pre_files]


def _export_per_key_results(pre_df, key_column, keys, out_root, year, min_revs, max_revs):
    """Write one per-year CSV for every distinct value of ``key_column``.

    For each key the matching rows are copied and extended with:

    * ``賭け金``: (largest ``複勝オッズ下限`` among the key's rows) divided by the
      row's ``複勝オッズ下限``, times 100, rounded to the nearest hundred.
    * ``補正払戻_{mi}-{ma}``: ``補正配当_{mi}-{ma}`` * ``賭け金`` / 100, rounded to
      two decimals, for every (mi, ma) combination.

    Parameters:
        pre_df: preprocessed race results for a single year.
        key_column: column to split by (e.g. "騎手" or "種牡馬").
        keys: distinct values of ``key_column`` to export.
        out_root: directory under which one sub-directory per key is created.
        year: year string used in the output file name.
        min_revs, max_revs: multiplier lists identifying the 補正配当 columns.

    Side effects:
        Creates ``out_root/<key>/`` and writes ``per_results_<year>.csv`` there
        (utf-8-sig, without the index).
    """
    for key in tqdm(keys):
        per_key_df = pre_df[pre_df[key_column] == key].copy()

        # Stake is scaled relative to the highest lower-bound odds among this
        # key's rows, so that row gets a stake of 100.
        max_odds = per_key_df["複勝オッズ下限"].max()
        per_key_df["賭け金"] = (max_odds / per_key_df["複勝オッズ下限"] * 100).round(-2)

        for mi in min_revs:
            for ma in max_revs:
                per_key_df[f"補正払戻_{mi}-{ma}"] = (
                    per_key_df[f"補正配当_{mi}-{ma}"] * per_key_df["賭け金"] / 100
                ).round(2)

        per_key_dir = Path(out_root, key)
        per_key_dir.mkdir(parents=True, exist_ok=True)
        per_key_df.to_csv(
            per_key_dir / f"per_results_{year}.csv",
            encoding="utf-8-sig",
            index=False
        )


for f in pre_files:
    filename = os.path.basename(f)
    # The first run of digits in the file name is used as the year.
    year = re.findall(r"\d+", filename)[0]
    # NOTE(review): the saved cell output shows a warning pointing at this
    # read_csv call for one of the files (presumably a mixed-type
    # DtypeWarning - confirm, and pass an explicit dtype= if so).
    # utf-8-sig matches the encoding the preprocessed files are written with.
    pre_df = pd.read_csv(f, encoding="utf-8-sig")

    # Drop missing names: a NaN key matches no rows and cannot be used as a
    # directory name (Path() rejects non-string parts).
    jockey_list = pre_df["騎手"].dropna().unique()
    sire_list = pre_df["種牡馬"].dropna().unique()

    _export_per_key_results(
        pre_df, "騎手", jockey_list, Path(PER_DATA_DIR, "jockey"), year,
        min_rev_list, max_rev_list,
    )
    _export_per_key_results(
        pre_df, "種牡馬", sire_list, Path(PER_DATA_DIR, "sire"), year,
        min_rev_list, max_rev_list,
    )

  0%|          | 0/155 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/447 [00:00<?, ?it/s]

  pre_df = pd.read_csv(f, encoding="utf-8")


  0%|          | 0/198 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

  0%|          | 0/464 [00:00<?, ?it/s]