In [5]:
import os
import re
import glob
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [6]:
# Profile lists (one row per jockey / sire) read from ./list/2026.
LIST_DIR = Path(".") / "list" / "2026"
jockey_list_df = pd.read_csv(LIST_DIR.joinpath("jockey_prof_list_2026.csv"), encoding="utf-8")
sire_list_df = pd.read_csv(LIST_DIR.joinpath("sire_prof_list_2026.csv"), encoding="utf-8")

# Input directories with per-jockey / per-sire data, and the output directory
# for the collected results (created on first run).
SRC_DIR = Path(".") / "src"
j_path = SRC_DIR / "per_data" / "jockey"
s_path = SRC_DIR / "per_data" / "sire"
COL_DIR = SRC_DIR / "collected_results"
COL_DIR.mkdir(parents=True, exist_ok=True)

# Column names of each list, minus the name column itself.
jockey_list_columns = jockey_list_df.columns.tolist()
jockey_list_columns.remove("騎手名")
sire_list_columns = sire_list_df.columns.tolist()
sire_list_columns.remove("種牡馬名")

# Jockey names have full-width spaces (U+3000) stripped; the per-jockey cells
# below use these names as directory names under j_path.
jockey_name_list = [
    raw_name.replace("\u3000", "") for raw_name in jockey_list_df["騎手名"].to_list()
]
sire_name_list = sire_list_df["種牡馬名"].to_list()

# Every (min, max) pair selects one "補正払戻_{min}-{max}" column downstream.
min_rev_list = [0.94, 0.945, 0.95]
max_rev_list = [1.65, 1.7, 1.75]

# Weight given to each year's figure when averaging across years.
year_weight_map = {
    2025: 1.0,
    2024: 0.8,
    2023: 0.6,
    2022: 0.4,
    2021: 0.2,
}

In [7]:
def weighted_avg_ignore_nan_col(values, weights):
    """Return the weighted mean of ``values``, skipping NaN entries.

    A position is used only when both its value and its weight are non-NaN.

    Parameters
    ----------
    values : array-like
        Numbers to average; converted to a float array.
    weights : array-like
        One weight per value; converted to a float array.

    Returns
    -------
    float
        The weighted average of the usable positions, or ``np.nan`` when no
        position is usable or the usable weights sum to zero.
    """
    v = np.asarray(values, dtype=float)
    w = np.asarray(weights, dtype=float)

    usable = ~(np.isnan(v) | np.isnan(w))
    if not usable.any():
        return np.nan

    # np.average raises ZeroDivisionError when the weights sum to zero; an
    # undefined average is reported as NaN, like the "nothing usable" case.
    if w[usable].sum() == 0:
        return np.nan
    return np.average(v[usable], weights=w[usable])

In [8]:
# Jockeys: for every (min, max) pair, compute a year-weighted return rate per
# category for each jockey and write one CSV per pair into COL_DIR.

# Category name -> column of the yearly results CSV in which it appears.
# Names not listed here are looked up in the class-name column.
_CATEGORY_COLUMN_BY_NAME = {
    **dict.fromkeys(["芝", "ダート", "障害"], "コース区分"),
    **dict.fromkeys(["短距離", "中距離", "長距離"], "距離区分"),
    **dict.fromkeys(["左回り", "右回り"], "回り"),
    **dict.fromkeys(
        ["札幌", "函館", "福島", "新潟", "東京", "中山", "中京", "京都", "阪神", "小倉"],
        "場所",
    ),
    **dict.fromkeys(["芝道悪", "ダ道悪"], "道悪判定"),
}
_DEFAULT_CATEGORY_COLUMN = "クラス名"


def _load_yearly_results(per_jockey_dir):
    """Read every ``per_results_*.csv`` in ``per_jockey_dir``.

    Returns a dict mapping the year (first run of digits in the file name, as
    int) to the loaded DataFrame. The dict is empty, and a message is printed,
    when the directory is missing or holds no matching file.
    """
    frames = {}
    # glob.glob returns an empty list for a missing directory; it never raises
    # FileNotFoundError, so emptiness is what has to be checked.
    for csv_path in glob.glob(os.path.join(per_jockey_dir, "per_results_*.csv")):
        year = int(re.findall(r"\d+", os.path.basename(csv_path))[0])
        frames[year] = pd.read_csv(csv_path, encoding="utf-8")
    if not frames:
        print(f"対象のフォルダやファイルが見つかりませんでした: {per_jockey_dir}")
    return frames


def _rows_for_category(df, name):
    """Return the rows of ``df`` whose category column equals ``name``.

    Only the column that belongs to the category is inspected, so a name that
    merely occurs in some unrelated column yields an empty frame instead of a
    KeyError. A missing category column also yields an empty frame.
    """
    column = _CATEGORY_COLUMN_BY_NAME.get(name, _DEFAULT_CATEGORY_COLUMN)
    if column not in df.columns:
        return df.iloc[0:0]
    return df[df[column] == name]


# One list of per-jockey records for each (min, max) pair, filled in
# jockey_name_list order.
records_by_range = {(mi, ma): [] for mi in min_rev_list for ma in max_rev_list}

for jn in jockey_name_list:
    # Each CSV is read once per jockey rather than once per (min, max) pair.
    yearly_results = _load_yearly_results(Path(j_path, jn))
    years = sorted(yearly_results, reverse=True)
    # Years without an entry in year_weight_map get a NaN weight and are
    # ignored by weighted_avg_ignore_nan_col (defined in an earlier cell).
    year_weights = [year_weight_map.get(year, np.nan) for year in years]

    # Row selection does not depend on the (min, max) pair, so do it once.
    rows_by_category = {
        name: [_rows_for_category(yearly_results[year], name) for year in years]
        for name in jockey_list_columns
    }

    for (mi, ma), records in records_by_range.items():
        payout_column = f"補正払戻_{mi}-{ma}"
        record = {}
        for name, rows_per_year in rows_by_category.items():
            # Return rate for one year = summed payout / summed stake;
            # NaN when the jockey has no row in this category that year.
            return_rates = [
                rows[payout_column].sum() / rows["賭け金"].sum() if len(rows) else np.nan
                for rows in rows_per_year
            ]
            record[name] = weighted_avg_ignore_nan_col(return_rates, year_weights)
        # A jockey without any data still gets a row (all NaN), so the output
        # stays aligned with jockey_name_list.
        records.append(record)

for (mi, ma), records in records_by_range.items():
    collected_res_df = pd.DataFrame(
        records, index=jockey_name_list, columns=jockey_list_columns, dtype=float
    )
    # NOTE(review): the rate is scaled to percent and then shifted by +21; the
    # reason for the 21 offset is not documented in this notebook — confirm.
    collected_res_df = collected_res_df * 100 + 21
    collected_res_df = collected_res_df.round(3)
    collected_res_df.to_csv(f"{COL_DIR}/collected_res_{mi}-{ma}.csv", encoding="utf-8-sig")

In [9]:
# Ride counts per jockey: for each yearly results CSV, count the rows that fall
# into every category and write a year x category table next to the inputs.

# Category name -> column of the yearly results CSV in which it appears.
# Names not listed here are looked up in the class-name column.
_RIDE_COUNT_COLUMN_BY_NAME = {
    **dict.fromkeys(["芝", "ダート", "障害"], "コース区分"),
    **dict.fromkeys(["短距離", "中距離", "長距離"], "距離区分"),
    **dict.fromkeys(["左回り", "右回り"], "回り"),
    **dict.fromkeys(
        ["札幌", "函館", "福島", "新潟", "東京", "中山", "中京", "京都", "阪神", "小倉"],
        "場所",
    ),
    **dict.fromkeys(["芝道悪", "ダ道悪"], "道悪判定"),
}


def _count_rides(results_df, name):
    """Return how many rows of ``results_df`` belong to category ``name``.

    Only the column that belongs to the category is inspected, so a name that
    merely occurs in some unrelated column no longer raises a KeyError.
    Returns ``np.nan`` when the category has no row or its column is missing.
    """
    column = _RIDE_COUNT_COLUMN_BY_NAME.get(name, "クラス名")
    if column not in results_df.columns:
        return np.nan
    count = int((results_df[column] == name).sum())
    return count if count else np.nan


for jn in tqdm(jockey_name_list):
    per_jockey_dir = Path(j_path, jn)
    # glob.glob returns an empty list for a missing directory; it never raises
    # FileNotFoundError. Sorting makes the row order of the output
    # deterministic, because glob results come back in arbitrary order.
    csv_files = sorted(glob.glob(os.path.join(per_jockey_dir, "per_results_*.csv")))
    if not csv_files:
        print(f"対象のフォルダやファイルが見つかりませんでした: {per_jockey_dir}")
        continue

    all_data = {}
    for cl in csv_files:
        # The year is the first run of digits in the file name.
        year = re.findall(r"\d+", os.path.basename(cl))[0]
        df = pd.read_csv(cl, encoding="utf-8")
        all_data[year] = {name: _count_rides(df, name) for name in jockey_list_columns}

    # Built with years as columns and then transposed: rows = years,
    # columns = categories.
    all_df = pd.DataFrame.from_dict(all_data).T
    # Write beside the input files, following j_path instead of a hard-coded
    # "src/per_data/jockey" prefix.
    all_df.to_csv(per_jockey_dir / "number_of_rides.csv", encoding="utf-8-sig")

  0%|          | 0/135 [00:00<?, ?it/s]