In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Add "<parent of the current working directory>/Modules" to the import path
# so the project-local modules imported at the end of this cell can be found.
module_path = (Path().resolve().parent/ "Modules")
sys.path.append(str(module_path))

# None removes the column limit, so wide DataFrames are displayed in full.
pd.set_option("display.max_columns", None)

# Presumably resolved through the sys.path entry added above, which is why
# this import comes after it -- TODO confirm the modules live in "Modules".
import my_modules, model_tuner, features # in-house modules

In [2]:
# Load the two input CSVs; both files are decoded as Shift-JIS.
df = pd.read_csv("../Data/train_data_tmp.csv", encoding="shift-jis")
odds_df = pd.read_csv("../Data/Time_Series_Odds_win_odds.csv", encoding="shift-jis")

In [3]:
# Run the project's two preprocessing helpers in sequence. Each call rebinds
# `df`, so re-running this cell feeds already-processed data back in.
# NOTE(review): what these helpers do is not visible here -- see my_modules.
df = my_modules.preprocessing(df)
df = my_modules.common_process(df)

  df["place_num"] = df["place"].replace(place_dict).astype(int)


In [4]:
# Summarize the preprocessed frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252411 entries, 252634 to 0
Data columns (total 76 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   race_id              252411 non-null  int64         
 1   year                 252411 non-null  int64         
 2   month                252411 non-null  int64         
 3   day                  252411 non-null  int64         
 4   times                252411 non-null  int64         
 5   place                252411 non-null  object        
 6   daily                252411 non-null  object        
 7   race_num             252411 non-null  int64         
 8   horse                252411 non-null  object        
 9   jockey_id            252411 non-null  object        
 10  trainer_id           252411 non-null  int64         
 11  horse_N              252411 non-null  object        
 12  waku_num             252411 non-null  int64         
 13  horse_num          

### 脚質の特徴量を追加

In [78]:
# Add each horse's past running-style ("leg") usage as features (count + rate).
def calc_leg_cumsum(df_to_copy, feature_col_to_copy):
    """Append per-horse running-style history features.

    For every running style in the fixed list ``target_col`` two columns are
    added:

    * ``{style}_cumcount_past_racing`` -- how many of the horse's rows, up to
      this one in frame order, used that style, not counting rows that share
      the current row's ``id_for_fold``.
    * ``{style}_per_entries`` -- that count divided by ``num_of_entries``
      (the number of earlier rows for the same horse); NaN when the horse has
      no earlier row.

    All cumulative values follow the row order of ``df_to_copy``.
    NOTE(review): this assumes the frame is already sorted chronologically --
    confirm against the preprocessing step.
    NOTE(review): ``num_of_entries`` counts every earlier row of the horse,
    while the counts exclude rows sharing ``id_for_fold``; the two only agree
    if a horse appears at most once per ``id_for_fold`` -- confirm.

    Parameters
    ----------
    df_to_copy : pandas.DataFrame
        Must contain the columns ``horse``, ``leg`` and ``id_for_fold``.
        The frame is copied and never modified in place.
    feature_col_to_copy : list of str
        Existing feature names. The list is copied and never modified.

    Returns
    -------
    tuple (pandas.DataFrame, list of str)
        The copied frame with ``num_of_entries``, one 0/1 indicator column per
        running style and the new feature columns, plus the feature-name list
        extended with the new feature column names.
    """
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()
    target_col = ['後方', '中団', '逃げ', '先行', 'ﾏｸﾘ'] # for some reason "追込" never appears...

    # Number of earlier rows for the same horse (0 on the horse's first row).
    df["num_of_entries"] = df.groupby("horse", observed=True)["horse"].cumcount()

    leg_dummy = pd.get_dummies(df["leg"], drop_first=False).astype(int)
    # A style that never occurs in this frame (e.g. a small subset) yields no
    # dummy column, and selecting it below would raise KeyError. Add an
    # all-zero indicator for each such style so the features are still built.
    missing_styles = [col for col in target_col if col not in leg_dummy.columns]
    if missing_styles:
        zero_fill = pd.DataFrame(0, index=df.index, columns=missing_styles)
        leg_dummy = pd.concat([leg_dummy, zero_fill], axis=1)
    df = pd.concat([df, leg_dummy], axis=1)

    grouped1 = df.groupby("horse", observed=True)
    grouped2 = df.groupby(["id_for_fold", "horse"], observed=True)

    # Running total per horse minus the running total inside the current
    # id_for_fold group: the current group's own rows drop out, so the value
    # on a row never includes that row's own running style.
    bunsi1 = grouped1[target_col].cumsum() - grouped2[target_col].cumsum()

    for col in target_col:
        feature_name1 = f"{col}_per_entries"
        feature_name2 = f"{col}_cumcount_past_racing"
        # Rate of past use of this style; a zero denominator becomes NaN.
        df[feature_name1] = bunsi1[col] / df["num_of_entries"].replace(0, np.nan)
        feature_col.append(feature_name1)
        # Count of past uses of this style.
        df[feature_name2] = bunsi1[col]
        feature_col.append(feature_name2)

    return df, feature_col

In [79]:
# Build the running-style history features. Starting from an empty list means
# feature_col holds only the column names added by calc_leg_cumsum.
tmp, feature_col = calc_leg_cumsum(df, [])

In [80]:
# Spot-check the result on a single horse's rows.
tmp[tmp.horse == "イクイノックス"]

Unnamed: 0,race_id,year,month,day,times,place,daily,race_num,horse,jockey_id,trainer_id,horse_N,waku_num,horse_num,class_code,track_code,corner_num,dist,state,weather,age_code,sex,age,basis_weight,blinker,weight,inc_dec,weight_code,win_odds,win_odds_1,win_odds_1_pop,win_odds_2,win_odds_2_pop,win_mul_odds_Hi,win_mul_odds_Lo,win_mul_odds_1_Hi,win_mul_odds_1_Lo,win_mul_odds_1_pop,win_mul_odds_2_Hi,win_mul_odds_2_Lo,win_mul_odds_2_pop,rank,time_diff,time,corner1_rank,corner2_rank,corner3_rank,corner4_rank,last_3F_time,last_3F_rank,Ave_3F,PCI,PCI3,RPCI,last_3F_time_diff,leg,pop,prize,error_code,father,mother,broodmare_sire,broodmare_sire_type,horse_color,id,id_for_fold,field_type,flat_or_jump,turn_type,race_type,waku,datetime,age_type,sample_weight,target,target3,num_of_entries,中団,先行,後方,逃げ,ﾏｸﾘ,後方_per_entries,後方_cumcount_past_racing,中団_per_entries,中団_cumcount_past_racing,逃げ_per_entries,逃げ_cumcount_past_racing,先行_per_entries,先行_cumcount_past_racing,ﾏｸﾘ_per_entries,ﾏｸﾘ_cumcount_past_racing
175554,202108280404050502,2021,8,28,4,新潟,5,5,イクイノックス,5339,1051,15,2,2,15,12,2,1800,良,曇,11,牡,2,54.0,,474.0,,3,4.6,4.2,2,4.1,2,2.1,1.4,2.1,1.4,2,2.5,1.7,2,1,-1.0,1474,,,3.0,3.0,34.5,1,36.45,55.7,54.53,54.8,0.2,先行,2.0,700,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2021082804040505,芝,平地,L,新潟芝1800,inner,2021-08-28 05:06:00,2,0.066667,1,1,0,0,1,0,0,0,,0,,0,,0,,0,,0
165104,202111200505051101,2021,11,20,5,東京,5,11,イクイノックス,5339,1126,12,1,1,179,11,2,1800,良,晴,11,牡,2,55.0,,482.0,8.0,3,2.6,2.5,1,3.0,1,1.6,1.3,1.6,1.3,1,1.9,1.5,1,1,-0.4,1462,,9.0,10.0,8.0,32.9,1,36.65,61.4,59.3,54.8,1.4,中団,1.0,3800,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2021112005050511,芝,平地,L,東京芝1800,inner,2021-11-20 11:01:00,2,0.083333,1,1,1,1,0,0,0,0,0.0,0,0.0,0,0.0,0,1.0,1,0.0,0
145885,202204170603081118,2022,4,17,3,中山,8,11,イクイノックス,5339,1126,18,8,18,195,17,4,2000,良,曇,12,牡,3,57.0,,492.0,10.0,4,5.7,5.8,2,6.3,2,2.8,2.0,3.1,2.3,2,3.4,2.6,3,2,0.1,1598,7.0,5.0,4.0,3.0,34.6,8,36.51,55.5,57.17,54.1,0.4,先行,3.0,6000,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2022041706030811,芝,平地,R,中山芝2000,outer,2022-04-17 11:02:00,3,0.055556,0,1,2,0,1,0,0,0,0.0,0,0.5,1,0.0,0,0.5,1,0.0,0
140241,202205290502121118,2022,5,29,2,東京,C,11,イクイノックス,5339,1126,18,8,18,195,11,4,2400,良,晴,12,牡,3,57.0,,484.0,-8.0,4,3.8,3.8,1,4.4,3,1.8,1.3,1.9,1.4,1,2.1,1.6,1,2,0.0,2219,16.0,16.0,16.0,14.0,33.6,1,36.1,57.4,55.1,51.0,1.6,後方,2.0,8000,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2022052905021211,芝,平地,L,東京芝2400,outer,2022-05-29 11:01:00,3,0.055556,0,1,3,0,0,1,0,0,0.0,0,0.333333,1,0.0,0,0.666667,2,0.0,0
121687,202210300504091107,2022,10,30,4,東京,9,11,イクイノックス,5339,1126,15,4,7,195,11,3,2000,良,晴,13,牡,3,56.0,,488.0,4.0,4,2.6,2.7,1,3.0,1,1.4,1.1,1.4,1.2,1,1.6,1.3,1,1,-0.1,1575,,10.0,10.0,9.0,32.7,1,36.34,61.1,55.37,44.4,4.0,中団,1.0,20000,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2022103005040911,芝,平地,L,東京芝2000,inner,2022-10-30 11:01:00,3,0.066667,1,1,4,1,0,0,0,0,0.25,1,0.25,1,0.0,0,0.5,2,0.0,0
114337,202212250605081109,2022,12,25,5,中山,8,11,イクイノックス,5339,1126,16,5,9,195,17,5,2500,良,晴,13,牡,3,55.0,,492.0,4.0,4,2.3,2.4,1,2.9,1,1.4,1.1,1.4,1.2,1,1.6,1.3,1,1,-0.4,2324,8.0,9.0,6.0,3.0,35.4,2,36.95,54.4,54.57,52.5,0.5,先行,1.0,40000,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2022122506050811,芝,平地,R,中山芝2500,outer,2022-12-25 11:02:00,3,0.0625,1,1,5,0,1,0,0,0,0.2,1,0.4,2,0.0,0,0.4,2,0.0,0
90733,202306250903081105,2023,6,25,3,阪神,8,11,イクイノックス,5339,1126,17,3,5,195,17,4,2200,良,曇,13,牡,4,58.0,,492.0,,4,1.3,1.3,1,1.5,1,1.1,1.1,1.1,1.1,1,1.2,1.1,1,1,-0.0,2112,16.0,16.0,13.0,9.0,34.8,2,36.15,53.9,53.83,51.1,0.7,中団,1.0,22000,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2023062509030811,芝,平地,R,阪神芝2200,inner,2023-06-25 11:03:00,over4,0.058824,1,1,6,1,0,0,0,0,0.166667,1,0.333333,2,0.0,0,0.5,3,0.0,0
75446,202310290504091107,2023,10,29,4,東京,9,11,イクイノックス,5339,1126,11,6,7,195,11,3,2000,良,晴,13,牡,4,58.0,,494.0,2.0,4,1.3,1.3,1,1.5,1,1.1,1.1,1.1,1.1,1,1.1,1.1,1,1,-0.4,1552,,3.0,3.0,3.0,34.2,3,34.71,51.5,53.07,49.4,0.5,先行,1.0,22000,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2023102905040911,芝,平地,L,東京芝2000,outer,2023-10-29 11:01:00,over4,0.090909,1,1,7,0,1,0,0,0,0.142857,1,0.428571,3,0.0,0,0.428571,3,0.0,0
71709,202311260505081202,2023,11,26,5,東京,8,12,イクイノックス,5339,1126,18,1,2,195,11,4,2400,良,曇,13,牡,4,58.0,,498.0,4.0,4,1.3,1.3,1,1.5,1,1.1,1.1,1.1,1.1,1,1.1,1.1,1,1,-0.7,2218,3.0,3.0,3.0,3.0,33.5,1,36.1,57.8,57.03,46.2,3.0,先行,1.0,50000,0,キタサンブラック,シャトーブランシュ,キングヘイロー,ニアークティック系,青鹿,2019105219,2023112605050812,芝,平地,L,東京芝2400,inner,2023-11-26 12:01:00,over4,0.055556,1,1,8,0,1,0,0,0,0.125,1,0.375,3,0.0,0,0.5,4,0.0,0


In [82]:
# Show the last rows of only the columns listed in feature_col.
tmp[feature_col].tail()

Unnamed: 0,後方_per_entries,後方_cumcount_past_racing,中団_per_entries,中団_cumcount_past_racing,逃げ_per_entries,逃げ_cumcount_past_racing,先行_per_entries,先行_cumcount_past_racing,ﾏｸﾘ_per_entries,ﾏｸﾘ_cumcount_past_racing
7,0.0,0,0.0,0,0.0,0,1.0,3,0.0,0
9,0.5,5,0.5,5,0.0,0,0.0,0,0.0,0
10,0.0,0,0.642857,9,0.0,0,0.285714,4,0.071429,1
5,0.2,1,0.2,1,0.0,0,0.6,3,0.0,0
0,0.0,0,0.666667,2,0.0,0,0.333333,1,0.0,0
