In [21]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Make the "Modules" directory that sits next to the current working directory
# importable (presumably where the project's own modules live - confirm).
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path again.
module_path = Path().resolve().parent / "Modules"
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

import my_modules, model_tuner, features # 自作モジュール

In [22]:
# Read the Shift-JIS encoded training CSV, then run the project's two
# preprocessing steps on it, in order.
df = (
    pd.read_csv("../Data/train_data_tmp.csv", encoding="shift-jis")
    .pipe(my_modules.preprocessing)
    .pipe(my_modules.common_process)
)

  df["place_num"] = df["place"].replace(place_dict).astype(int)


In [23]:
# Print the dtype and non-null count of every column in the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252411 entries, 252634 to 0
Data columns (total 76 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   race_id              252411 non-null  int64         
 1   year                 252411 non-null  int64         
 2   month                252411 non-null  int64         
 3   day                  252411 non-null  int64         
 4   times                252411 non-null  int64         
 5   place                252411 non-null  object        
 6   daily                252411 non-null  object        
 7   race_num             252411 non-null  int64         
 8   horse                252411 non-null  object        
 9   jockey_id            252411 non-null  object        
 10  trainer_id           252411 non-null  int64         
 11  horse_N              252411 non-null  object        
 12  waku_num             252411 non-null  int64         
 13  horse_num          

In [40]:
from trueskill import TrueSkill
from itertools import combinations
from collections import defaultdict
from glicko2 import Player
from sklearn.preprocessing import PolynomialFeatures

def calc_trueskill_fast(df_to_copy, feature_col, target_col, prefix):
    """
    Compute running TrueSkill ratings for the entities in ``target_col``.

    Rows are grouped by ``id_for_fold`` and the groups are processed in
    ascending ``datetime`` order. For every row the rating held *before* its
    group is rated is recorded (mu, sigma, and mu -/+ 3 * sigma). The ratings
    are then updated from the group's ``rank`` values and the mu held *after*
    the update is recorded as well.

    Parameters
    ----------
    df_to_copy : pandas.DataFrame
        Input frame; it is never modified. Must contain the columns
        ``id_for_fold``, ``datetime``, ``error_code``, ``rank`` and
        ``target_col``.
    feature_col : list of str
        Existing feature names. The list is copied, not mutated.
    target_col : str
        Column identifying the rated entity (e.g. ``"horse"``).
    prefix : str
        Prefix used for the generated column names.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The frame sorted by ``datetime`` with five new columns
        (``{prefix}_TrueSkill``, ``_sigma``, ``_min``, ``_max`` and
        ``_after_racing``), and ``feature_col`` extended with the first four.
        The ``_after_racing`` column is deliberately not added to the feature
        list (presumably because it holds post-race information - confirm).

    Raises
    ------
    ValueError
        If ``target_col`` or ``prefix`` is None.

    Notes
    -----
    * Only rows with ``error_code == 0`` take part in a rating update; all
      rows of the group still receive the before/after values.
    * A group with fewer than two such rows leaves the ratings untouched.
    * ``rank`` is handed to ``TrueSkill.rate(ranks=...)``, where a lower value
      means a better finish.
    * NOTE(review): if the same target occurs more than once inside one group,
      each occurrence is rated as a separate competitor and the last computed
      rating overwrites the earlier ones.
    """
    if (target_col is None) or (prefix is None):
        raise ValueError("target_col and prefix must be specified")

    # Ratings are updated in row order, so the rows must be chronological
    # before the loop starts. mergesort is stable: rows sharing a timestamp
    # keep their original relative order, and an already-sorted frame is left
    # as it was. sort_values returns a new frame, so the caller's is untouched.
    df = df_to_copy.sort_values(by="datetime", ascending=True, kind="mergesort")

    # Work on a new list so the caller's feature list is not changed.
    new_feature_col = feature_col.copy()

    CONFIDENCE_MULTIPLIER = 3

    # Names of the columns this function adds.
    ts_mu_col = f"{prefix}_TrueSkill"
    ts_sigma_col = f"{prefix}_TrueSkill_sigma"
    ts_min_col = f"{prefix}_TrueSkill_min"
    ts_max_col = f"{prefix}_TrueSkill_max"
    ts_after_col = f"{prefix}_TrueSkill_after_racing"

    new_feature_col.extend([ts_mu_col, ts_sigma_col, ts_min_col, ts_max_col])

    # Rating environment without draws; unseen targets start from the
    # environment's default rating.
    env = TrueSkill(draw_probability=0.0)
    ratings = defaultdict(lambda: env.create_rating())

    processed_groups = []

    # sort=False keeps the groups in order of first appearance, i.e. in
    # chronological order after the sort above.
    grouped = df.groupby("id_for_fold", observed=True, sort=False)

    for _, group in grouped:
        # Mutate a private copy of the group, never a view of df.
        group_copy = group.copy()

        all_targets = group_copy[target_col]

        # --- 1. Record the ratings held before this group is rated ---
        mu_series = all_targets.map(lambda x: ratings[x].mu)
        sigma_series = all_targets.map(lambda x: ratings[x].sigma)

        group_copy[ts_mu_col] = mu_series
        group_copy[ts_sigma_col] = sigma_series

        # Lower / upper bound: mu -/+ CONFIDENCE_MULTIPLIER * sigma.
        group_copy[ts_min_col] = mu_series - sigma_series * CONFIDENCE_MULTIPLIER
        group_copy[ts_max_col] = mu_series + sigma_series * CONFIDENCE_MULTIPLIER

        # --- 2. Update the ratings from the rows without an error code ---
        race_data = group_copy[group_copy["error_code"] == 0]

        # A rating update needs at least two competitors.
        if len(race_data) >= 2:
            target_list = race_data[target_col].tolist()
            # Every competitor is its own one-member team.
            race_ratings = [[ratings[target]] for target in target_list]
            ranks = race_data["rank"].tolist()

            new_ratings = env.rate(race_ratings, ranks=ranks)

            for target, new_rating_tuple in zip(target_list, new_ratings):
                ratings[target] = new_rating_tuple[0]

        # --- 3. Record mu after the update (unchanged if no update ran) ---
        group_copy[ts_after_col] = all_targets.map(lambda x: ratings[x].mu)

        processed_groups.append(group_copy)

    # No groups at all (e.g. empty input): pd.concat would raise on an empty
    # list, so return an empty frame that still carries the new columns.
    if not processed_groups:
        result_df = df.iloc[0:0].copy()
        for col in (ts_mu_col, ts_sigma_col, ts_min_col, ts_max_col, ts_after_col):
            result_df[col] = pd.Series(index=result_df.index, dtype="float64")
        return result_df, new_feature_col

    # --- 4. Combine all groups once; the stable sort keeps the row order
    #        inside each timestamp deterministic ---
    result_df = pd.concat(processed_groups).sort_values(
        by="datetime", ascending=True, kind="mergesort"
    )

    return result_df, new_feature_col

In [41]:
# Rate every horse; the returned feature-name list is not needed here.
df2, _ = calc_trueskill_fast(df_to_copy=df, feature_col=[], target_col="horse", prefix="horse")

In [42]:
# Sanity check: number of rows whose pre-race horse rating is missing.
df2["horse_TrueSkill"].isna().sum()

np.int64(0)

In [43]:
# The ten rows with the highest pre-race horse rating.
(
    df2.loc[:, ["year", "month", "day", "place", "race_num", "horse", "horse_TrueSkill"]]
    .sort_values(by="horse_TrueSkill", ascending=False)
    .head(10)
)

Unnamed: 0,year,month,day,place,race_num,horse,horse_TrueSkill
147606,2022,4,3,阪神,11,エフフォーリア,51.754354
160126,2021,12,26,中山,11,エフフォーリア,51.288338
71709,2023,11,26,東京,12,イクイノックス,51.069069
29742,2024,10,27,東京,11,リバティアイランド,51.057917
186810,2021,5,23,東京,11,ソダシ,50.878951
71710,2023,11,26,東京,12,リバティアイランド,50.769014
75446,2023,10,29,東京,11,イクイノックス,50.37989
167485,2021,10,31,東京,11,エフフォーリア,50.377197
77410,2023,10,15,京都,11,リバティアイランド,49.817462
90733,2023,6,25,阪神,11,イクイノックス,49.748247


In [45]:
def calc_rating_diff(df, feature_col, target_col=None, prefix=None, group_col="horse", lags=(1, 3)):
    """
    Add columns holding the change of ``target_col`` versus earlier rows of
    the same group.

    For every lag ``i`` in ``lags`` a column ``{prefix}_diff_last{i}_racing``
    is added. It holds the current value of ``target_col`` minus the value
    found ``i`` rows earlier within the same ``group_col`` group. Rows that
    have fewer than ``i`` predecessors in their group get NaN.

    The shift follows the row order of ``df``, so the frame is assumed to be
    in chronological order already - confirm against the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a copy is modified and returned.
    feature_col : list of str
        Existing feature names. The list is copied, not mutated.
    target_col : str
        Column whose change is computed. Required.
    prefix : str
        Prefix of the generated column names. Required.
    group_col : str, default "horse"
        Column whose values define the groups the shift is applied within.
    lags : iterable of int, default (1, 3)
        How many rows to look back; every value must be >= 1.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The copied frame with the new columns, and ``feature_col`` extended
        with their names.

    Raises
    ------
    ValueError
        If ``target_col`` or ``prefix`` is None, or if a lag is smaller
        than 1.
    """
    if (target_col is None) or (prefix is None):
        raise ValueError("target_col and prefix must be selected")

    lags = list(lags)
    # A lag of 0 compares a row with itself and a negative lag reads rows that
    # come later in the frame, so both are rejected.
    if any(lag < 1 for lag in lags):
        raise ValueError("lags must all be >= 1")

    df = df.copy()
    feature_col = feature_col.copy()

    grouped_target = df.groupby(group_col, observed=True)[target_col]
    for lag in lags:
        feature_name = f"{prefix}_diff_last{lag}_racing"
        df[feature_name] = df[target_col] - grouped_target.shift(lag)
        feature_col.append(feature_name)

    return df, feature_col

# Add the rating-change columns for the horse rating; only the frame is kept.
tmp, _ = calc_rating_diff(
    df=df2,
    feature_col=[],
    target_col="horse_TrueSkill",
    prefix="horse_TrueSkill",
)

In [46]:
# Rating history of a single horse, with the change since its previous row.
tmp.loc[
    tmp["horse"] == "イクイノックス",
    ["year", "month", "day", "horse", "horse_TrueSkill", "horse_TrueSkill_diff_last1_racing"],
]

Unnamed: 0,year,month,day,horse,horse_TrueSkill,horse_TrueSkill_diff_last1_racing
175554,2021,8,28,イクイノックス,25.0,
165104,2021,11,20,イクイノックス,37.943884,12.943884
145885,2022,4,17,イクイノックス,44.061739,6.117855
140241,2022,5,29,イクイノックス,46.868687,2.806948
121687,2022,10,30,イクイノックス,48.122281,1.253594
114337,2022,12,25,イクイノックス,49.067328,0.945047
90733,2023,6,25,イクイノックス,49.748247,0.680918
75446,2023,10,29,イクイノックス,50.37989,0.631643
71709,2023,11,26,イクイノックス,51.069069,0.68918
