In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Make the "Modules" directory that sits next to the current working
# directory importable, so the in-house modules can be loaded.
module_path = Path().resolve().parent / "Modules"
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

import my_modules, model_tuner, features # 自作モジュール

In [2]:
# Read the Shift-JIS encoded training CSV and run it through the two
# in-house preparation steps (preprocessing first, then common_process).
df = my_modules.common_process(
    my_modules.preprocessing(
        pd.read_csv("../Data/train_data_tmp.csv", encoding="shift-jis")
    )
)

  df["place_num"] = df["place"].replace(place_dict).astype(int)


In [3]:
# Print column names, non-null counts and dtypes of the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252411 entries, 252634 to 0
Data columns (total 74 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   PCI                  251103 non-null  float64       
 1   PCI3                 252411 non-null  float64       
 2   RPCI                 252411 non-null  float64       
 3   race_id              252411 non-null  int64         
 4   year                 252411 non-null  int64         
 5   month                252411 non-null  int64         
 6   day                  252411 non-null  int64         
 7   times                252411 non-null  int64         
 8   place                252411 non-null  object        
 9   daily                252411 non-null  object        
 10  race_num             252411 non-null  int64         
 11  horse                252411 non-null  object        
 12  jockey_id            252411 non-null  object        
 13  trainer_id         

In [None]:
from trueskill import TrueSkill
from collections import defaultdict


def calc_trueskill_horse(df):
    """Attach each horse's pre-race TrueSkill rating to every row.

    Rows sharing an ``id_for_fold`` are treated as one race. Races are replayed
    in the (sorted) order ``groupby`` yields them. For every entrant, the
    rating mean (``mu``) held *before* the race is written to
    ``horse_TrueSkill``; afterwards the ratings of the horses that finished
    normally (``error_code == 0``) are updated from ``rank``.

    NOTE(review): this assumes that sorting ``id_for_fold`` yields
    chronological order -- confirm against how the id is built.
    NOTE(review): ``draw_probability`` is 0.0, so tied ``rank`` values (dead
    heats) may not be handled by the library -- confirm on real data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id_for_fold``, ``horse``, ``rank`` and
        ``error_code``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a float ``horse_TrueSkill`` column. Rows that
        belong to no group (missing ``id_for_fold``) keep NaN.
    """
    df = df.copy()

    env = TrueSkill(draw_probability=0.0)  # TrueSkill environment
    # Current rating of every horse seen so far; unseen horses get the default.
    ratings = defaultdict(env.create_rating)

    # Filled by row position, so no full-frame boolean mask is needed per race
    # and a non-unique index on ``df`` cannot mis-assign values.
    pre_race_mu = np.full(len(df), np.nan)

    races = (
        df[["id_for_fold", "horse", "rank", "error_code"]]
        .reset_index(drop=True)  # index labels now equal row positions
        .groupby("id_for_fold", observed=True)
    )

    for _, race in races:
        # Record the rating every entrant carries into this race. Horses with
        # error_code != 0 (abnormal finish) also get their current rating,
        # but they are excluded from the update below.
        entrants = race["horse"].tolist()
        pre_race_mu[race.index.to_numpy()] = [ratings[horse].mu for horse in entrants]

        finished = race[race["error_code"] == 0]
        # A rating update needs at least two competitors; with fewer the
        # library raises ValueError, so leave the ratings untouched.
        if len(finished) < 2:
            continue

        # Update the ratings from the race result (each horse is a 1-member team).
        finishers = finished["horse"].tolist()
        race_ratings = [[ratings[horse]] for horse in finishers]
        new_ratings = env.rate(race_ratings, ranks=finished["rank"].tolist())

        for horse, new_group in zip(finishers, new_ratings):
            ratings[horse] = new_group[0]

    df["horse_TrueSkill"] = pre_race_mu
    return df

In [83]:
# Compute the pre-race horse ratings; the function works on a copy, so df is unchanged.
df2 = calc_trueskill_horse(df)

Processing group 2020010506010101...
Processing group 2020010506010102...
Processing group 2020010506010103...
Processing group 2020010506010104...
Processing group 2020010506010105...
Processing group 2020010506010106...
Processing group 2020010506010107...
Processing group 2020010506010108...
Processing group 2020010506010109...
Processing group 2020010506010110...
Processing group 2020010506010111...
Processing group 2020010506010112...
Processing group 2020010508010101...
Processing group 2020010508010102...
Processing group 2020010508010103...
Processing group 2020010508010104...
Processing group 2020010508010105...
Processing group 2020010508010106...
Processing group 2020010508010107...
Processing group 2020010508010108...
Processing group 2020010508010109...
Processing group 2020010508010110...
Processing group 2020010508010111...
Processing group 2020010508010112...
Processing group 2020010606010201...
Processing group 2020010606010202...
Processing group 2020010606010203...
P

In [84]:
# Count rows that received no rating. The column written by
# calc_trueskill_horse is "horse_TrueSkill" (there is no "TrueSkill" column).
df2[df2["horse_TrueSkill"].isna()].shape

(1311, 75)

In [85]:
# Top 50 rows by pre-race horse rating. The column written by
# calc_trueskill_horse is "horse_TrueSkill" (there is no "TrueSkill" column).
(
    df2[["year", "month", "day", "place", "race_num", "horse", "horse_TrueSkill"]]
    .sort_values(by="horse_TrueSkill", ascending=False)
    .head(50)
)

Unnamed: 0,year,month,day,place,race_num,horse,TrueSkill
147606,2022,4,3,阪神,11,エフフォーリア,51.754354
160126,2021,12,26,中山,11,エフフォーリア,51.288338
71709,2023,11,26,東京,12,イクイノックス,51.069069
29742,2024,10,27,東京,11,リバティアイランド,51.057917
186810,2021,5,23,東京,11,ソダシ,50.878951
71710,2023,11,26,東京,12,リバティアイランド,50.769014
75446,2023,10,29,東京,11,イクイノックス,50.37989
167485,2021,10,31,東京,11,エフフォーリア,50.377197
77410,2023,10,15,京都,11,リバティアイランド,49.817462
90733,2023,6,25,阪神,11,イクイノックス,49.748247


In [86]:
# Rating history of a single horse. The column written by
# calc_trueskill_horse is "horse_TrueSkill" (there is no "TrueSkill" column).
df2[df2["horse"] == "ソダシ"][
    ["id_for_fold", "year", "month", "day", "place", "race_num", "rank", "horse_TrueSkill"]
]

Unnamed: 0,id_for_fold,year,month,day,place,race_num,rank,TrueSkill
226907,2020071202020405,2020,7,12,函館,5,1,25.0
220484,2020090501020711,2020,9,5,札幌,11,1,35.085008
214020,2020103105040711,2020,10,31,東京,11,1,43.101857
207871,2020121309060411,2020,12,13,阪神,11,1,46.470573
192717,2021041109020611,2021,4,11,阪神,11,1,49.080889
186810,2021052305021011,2021,5,23,東京,11,8,50.878951
175780,2021082201020411,2021,8,22,札幌,11,1,48.032776
169497,2021101709040411,2021,10,17,阪神,11,10,48.219369
162996,2021120507060211,2021,12,5,中京,11,12,46.353679
152724,2022022005010811,2022,2,20,東京,11,3,44.142028


In [87]:
# Pre-race ratings of every entrant in one race, highest first. The column
# written by calc_trueskill_horse is "horse_TrueSkill" (there is no "TrueSkill" column).
(
    df2[df2["id_for_fold"] == 2020090501020711][["horse", "waku_num", "horse_TrueSkill", "rank"]]
    .sort_values("horse_TrueSkill", ascending=False)
)

Unnamed: 0,horse,waku_num,TrueSkill,rank
220497,スライリー,4,38.167932,14
220487,アオイゴールド,8,37.701557,4
220495,カガフラッシュ,3,37.701557,12
220485,ユーバーレーベン,5,37.149179,2
220488,ヴェローチェオロ,2,36.074816,5
220496,ピンクカメハメハ,1,36.074816,13
220484,ソダシ,8,35.085008,1
220486,バスラットレオン,4,35.085008,3
220493,リキサントライ,6,34.048583,10
220491,ウイングリュック,7,33.35631,8


In [108]:
from trueskill import TrueSkill
from collections import defaultdict


def calc_trueskill_jockey(df):
    """Attach each jockey's pre-race TrueSkill rating to every row.

    Rows sharing an ``id_for_fold`` are treated as one race. Races are replayed
    in the (sorted) order ``groupby`` yields them. For every entrant, the
    rating mean (``mu``) the jockey held *before* the race is written to
    ``jockey_TrueSkill``; afterwards the ratings of the jockeys whose ride
    finished normally (``error_code == 0``) are updated from ``rank``.

    NOTE(review): this assumes that sorting ``id_for_fold`` yields
    chronological order -- confirm against how the id is built.
    NOTE(review): ``draw_probability`` is 0.0, so tied ``rank`` values (dead
    heats) may not be handled by the library -- confirm on real data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id_for_fold``, ``jockey_id``, ``rank`` and
        ``error_code``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a float ``jockey_TrueSkill`` column. Rows that
        belong to no group (missing ``id_for_fold``) keep NaN.
    """
    df = df.copy()

    env = TrueSkill(draw_probability=0.0)  # TrueSkill environment
    # Current rating of every jockey seen so far; unseen jockeys get the default.
    ratings = defaultdict(env.create_rating)

    # Filled by row position, so no full-frame boolean mask is needed per race
    # and a non-unique index on ``df`` cannot mis-assign values.
    pre_race_mu = np.full(len(df), np.nan)

    races = (
        df[["id_for_fold", "jockey_id", "rank", "error_code"]]
        .reset_index(drop=True)  # index labels now equal row positions
        .groupby("id_for_fold", observed=True)
    )

    for _, race in races:
        # Record the rating every jockey carries into this race. Rides with
        # error_code != 0 (abnormal finish) also get the current rating,
        # but they are excluded from the update below.
        entrants = race["jockey_id"].tolist()
        pre_race_mu[race.index.to_numpy()] = [ratings[jockey].mu for jockey in entrants]

        finished = race[race["error_code"] == 0]
        # A rating update needs at least two competitors; with fewer the
        # library raises ValueError, so leave the ratings untouched.
        if len(finished) < 2:
            continue

        # Update the ratings from the race result (each jockey is a 1-member team).
        finishers = finished["jockey_id"].tolist()
        race_ratings = [[ratings[jockey]] for jockey in finishers]
        new_ratings = env.rate(race_ratings, ranks=finished["rank"].tolist())

        for jockey, new_group in zip(finishers, new_ratings):
            ratings[jockey] = new_group[0]

    df["jockey_TrueSkill"] = pre_race_mu
    return df

In [None]:
# Compute the pre-race jockey ratings; the function works on a copy, so df is unchanged.
df_jockey = calc_trueskill_jockey(df)

In [112]:
# Print column names, non-null counts and dtypes of the frame with jockey ratings.
df_jockey.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252411 entries, 252634 to 0
Data columns (total 75 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   PCI                  251103 non-null  float64       
 1   PCI3                 252411 non-null  float64       
 2   RPCI                 252411 non-null  float64       
 3   race_id              252411 non-null  int64         
 4   year                 252411 non-null  int64         
 5   month                252411 non-null  int64         
 6   day                  252411 non-null  int64         
 7   times                252411 non-null  int64         
 8   place                252411 non-null  object        
 9   daily                252411 non-null  object        
 10  race_num             252411 non-null  int64         
 11  horse                252411 non-null  object        
 12  jockey_id            252411 non-null  object        
 13  trainer_id         

In [110]:
# Top 50 rows by pre-race jockey rating, highest first.
(
    df_jockey[
        [
            "year",
            "month",
            "day",
            "place",
            "race_num",
            "horse",
            "jockey_id",
            "jockey_TrueSkill",
        ]
    ]
    .sort_values(by="jockey_TrueSkill", ascending=False)
    .head(50)
)

Unnamed: 0,year,month,day,place,race_num,horse,jockey_id,jockey_TrueSkill
151642,2022,3,5,阪神,4,エバニスタ,1199,39.171009
252797,2020,1,5,京都,2,ベッサメモー,666,38.167932
251903,2020,1,11,中山,4,ヤマニンパジャッソ,1119,38.167932
206676,2020,12,26,阪神,9,エバンタイユドール,5583,38.104648
252523,2020,1,5,中山,8,レヴァンテ,660,36.92043
252779,2020,1,5,京都,3,マイハート,1128,36.880698
252554,2020,1,5,中山,6,ホーカスポーカス,1127,36.814171
23825,2024,12,14,中京,10,ワイワイレジェンド,5651,36.002153
151672,2022,3,5,阪神,2,メイショウトール,1199,35.830788
252684,2020,1,5,京都,10,ケプラー,1088,35.768892


In [105]:
# Rows whose jockey rating equals 25.
# NOTE(review): 25 looks like the library's default starting mu -- confirm.
df_jockey.loc[df_jockey["jockey_TrueSkill"].eq(25)]

Unnamed: 0,PCI,PCI3,RPCI,race_id,year,month,day,times,place,daily,race_num,horse,jockey_id,trainer_id,horse_N,waku_num,horse_num,class_code,track_code,corner_num,dist,state,weather,age_code,sex,age,basis_weight,blinker,weight,inc_dec,weight_code,win_odds,win_odds_1,win_odds_1_pop,win_odds_2,win_odds_2_pop,win_mul_odds_Hi,win_mul_odds_Lo,win_mul_odds_1_Hi,win_mul_odds_1_Lo,win_mul_odds_1_pop,win_mul_odds_2_Hi,win_mul_odds_2_Lo,win_mul_odds_2_pop,rank,time_diff,time,corner1_rank,corner2_rank,corner3_rank,corner4_rank,last_3F_time,last_3F_rank,Ave_3F,last_3F_time_diff,leg,pop,prize,error_code,father,mother,broodmare_sire,broodmare_sire_type,horse_color,id,id_for_fold,field_type,flat_or_jump,turn_type,race_type,waku,datetime,target,target3,jockey_TrueSkill
252634,38.7,40.57,36.0,202001050601010115,2020,1,5,1,中山,1,1,ラブカワールド,1177,1131,16,8,15,7,24,2,1200,良,晴,12,牝,3,51.0,,436.0,4.0,3,474.2,355.9,15,254.1,14,153.0,39.3,115.2,36.0,15,74.6,30.0,14,13,2.0,1153,,,15.0,16.0,39.9,9,35.40,1.5,後方,15.0,0,0,トゥザワールド,ラブカテリーナ,オレハマッテルゼ,ロイヤルチャージャー系,黒鹿,2017104350,2020010506010101,ダート,平地,R,中山ダート1200,outer,2020-01-05 01:02:00,0,0,25.0
252635,36.6,40.57,36.0,202001050601010116,2020,1,5,1,中山,1,1,シラカワカツコ,1173,1031,16,8,16,7,24,2,1200,良,晴,12,牝,3,51.0,,404.0,4.0,3,233.1,239.5,13,197.1,13,83.6,21.6,62.3,19.6,13,30.6,12.4,12,14,2.1,1154,,,9.0,10.0,40.4,13,35.00,1.1,中団,13.0,0,0,ディープブリランテ,シャイニングピサ,Smart Strike,ネイティヴダンサー系,鹿毛,2017101747,2020010506010101,ダート,平地,R,中山ダート1200,outer,2020-01-05 01:02:00,0,0,25.0
252633,40.6,40.57,36.0,202001050601010102,2020,1,5,1,中山,1,1,スルーザリミッツ,1122,1118,16,1,2,7,24,2,1200,良,晴,12,牝,3,54.0,,424.0,-4.0,3,31.4,31.1,7,22.6,6,13.7,3.9,11.3,3.8,7,7.4,3.2,6,12,1.6,1149,,,16.0,15.0,39.3,6,35.60,1.7,後方,7.0,0,0,パイロ,ファービヨンド,ディープスカイ,ロイヤルチャージャー系,黒鹿,2017101861,2020010506010101,ダート,平地,R,中山ダート1200,inner,2020-01-05 01:02:00,0,0,25.0
252632,36.3,40.57,36.0,202001050601010104,2020,1,5,1,中山,1,1,セルレア,1161,1115,16,2,4,7,24,2,1200,良,晴,12,牝,3,53.0,,464.0,10.0,3,14.5,15.7,4,19.5,5,7.3,2.2,6.2,2.2,5,6.1,2.7,5,11,1.6,1149,,,6.0,6.0,40.2,11,34.70,0.8,中団,4.0,0,0,ロードカナロア,ダイワミストレス,ダイワメジャー,ロイヤルチャージャー系,鹿毛,2017102095,2020010506010101,ダート,平地,R,中山ダート1200,inner,2020-01-05 01:02:00,0,0,25.0
252631,32.9,40.57,36.0,202001050601010109,2020,1,5,1,中山,1,1,サノレーヌ,1164,1141,16,5,9,7,24,2,1200,良,晴,12,牝,3,51.0,,438.0,0.0,3,9.5,8.0,3,6.8,3,4.5,1.5,3.9,1.5,3,3.1,1.5,3,10,1.5,1148,,,1.0,1.0,40.9,15,33.90,0.0,逃げ,3.0,0,0,サウスヴィグラス,アニマートホウヨウ,フサイチコンコルド,ニアークティック系,鹿毛,2017100847,2020010506010101,ダート,平地,R,中山ダート1200,outer,2020-01-05 01:02:00,0,0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14166,41.1,43.70,42.8,202503010602010210,2025,3,1,2,中山,1,2,ライブリブランコ,1218,1141,16,5,10,7,24,2,1200,良,晴,12,牡,3,54.0,,486.0,-4.0,3,517.7,222.0,16,262.6,16,115.0,56.4,51.2,25.0,16,37.2,20.6,16,10,2.4,1149,,,9.0,8.0,39.2,12,35.70,0.8,中団,16.0,0,0,ケープブランコ,ヒダカビジン,フジキセキ,ロイヤルチャージャー系,黒鹿,2022109081,2025030106020102,ダート,平地,R,中山ダート1200,outer,2025-03-01 02:02:00,0,0,25.0
14320,43.7,47.40,46.3,202503010901010304,2025,3,1,1,阪神,1,3,スターペスショウマ,1222,1140,16,2,4,7,24,4,1800,良,晴,12,牡,3,54.0,,456.0,2.0,3,56.1,35.9,8,32.1,9,14.2,7.8,15.8,7.2,9,11.5,6.8,9,8,1.7,1567,4.0,3.0,4.0,6.0,40.6,11,38.05,0.4,中団,8.0,0,0,リアルスティール,スターペスマリア,パイロ,ナスルーラ系,鹿毛,2022106997,2025030109010103,ダート,平地,R,阪神ダート1800,inner,2025-03-01 03:03:00,0,0,25.0
14307,38.6,42.50,41.9,202503010901010416,2025,3,1,1,阪神,1,4,スマートビビット,1220,1172,16,8,16,7,24,2,1400,良,晴,12,牝,3,52.0,,452.0,-4.0,3,148.3,83.1,11,68.1,12,38.6,20.3,19.4,11.2,13,18.8,12.7,13,11,2.2,1279,,,3.0,3.0,40.3,13,35.70,0.4,先行,13.0,0,0,エイシンヒカリ,スマートレグルス,ロードカナロア,,黒鹿,2022101555,2025030109010104,ダート,平地,R,阪神ダート1400,outer,2025-03-01 04:03:00,0,0,25.0
13793,39.6,46.93,46.5,202503020901020402,2025,3,2,1,阪神,2,4,アオユウスター,5567,5590,16,1,2,23,24,2,1200,良,小雨,12,牝,3,55.0,,440.0,3.0,3,506.9,280.2,16,243.0,16,124.3,69.2,60.1,35.7,16,45.9,31.0,16,16,4.1,1166,,,7.0,11.0,40.4,16,36.20,0.6,後方,16.0,0,0,アジアエクスプレス,スターレット,ブライアンズタイム,ロイヤルチャージャー系,栗毛,2022102889,2025030209010204,ダート,平地,R,阪神ダート1200,inner,2025-03-02 04:03:00,0,0,25.0
