In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

module_path = (Path().resolve().parent/ "Modules")
sys.path.append(str(module_path))

import my_modules, model_tuner # 自作モジュール

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocessing(df_to_copy):
    """Return a copy of the raw export with English column names.

    Each output column is either a straight copy of one raw (Japanese-named)
    column or a value derived from one. All raw columns are dropped at the
    end except ``PCI``, ``PCI3`` and ``RPCI``, which are kept under their
    original names. The input frame is left untouched.

    Parameters
    ----------
    df_to_copy : pandas.DataFrame
        Raw frame containing every source column referenced below.

    Returns
    -------
    pandas.DataFrame
        The kept raw columns followed by the new columns, in the order they
        are listed here. ``DataFrame.info()`` is printed as a side effect.
    """
    df = df_to_copy.copy()
    original_col = list(df.columns)

    def _add_columns(plan):
        # A plan entry is (new_name, source); `source` is either the raw
        # column to copy or a callable that derives the value from the frame.
        for new_name, source in plan:
            df[new_name] = source(df) if callable(source) else df[source]

    _add_columns((
        ("race_id", "レースID(新)"),
        # 日付 is an integer date; split it arithmetically into its parts.
        ("year", lambda frame: frame["日付"] // 10000),
        ("month", lambda frame: frame["日付"] // 100 % 100),
        ("day", lambda frame: frame["日付"] % 100),
        # 開催 is a 3-character code: first character -> times, third -> daily.
        ("times", lambda frame: frame["開催"].str[0].astype(int)),
        ("place", "場所"),
        ("daily", lambda frame: frame["開催"].str[2]),
        ("race_num", "Ｒ"),
        ("horse", "馬名S"),
        ("jockey_id", "騎手コード"),
        ("trainer_id", "調教師コード"),
        ("horse_N", "頭数"),
        ("waku_num", "枠番"),
        ("horse_num", "馬番"),
        ("class_code", "クラスコード"),
        ("track_code", "トラックコード(JV)"),
        ("corner_num", "ｺｰﾅｰ"),
        # Drop the leading one-character prefix and keep the number.
        ("dist", lambda frame: frame["距離"].str[1:].astype(int)),
        ("state", "馬場状態"),
        ("weather", "天気"),
        ("age_code", "競走種別"),
        ("sex", "性別"),
        ("age", "年齢"),
        # Strip everything that is not a digit or a dot before converting.
        ("basis_weight", lambda frame: frame["斤量"].str.replace(r'[^0-9.]', '', regex=True).astype(float)),
        ("blinker", "ブリンカー"),
        ("weight", "馬体重"),
        ("inc_dec", "馬体重増減"),
        ("weight_code", "重量コード"),
        ("win_odds", "単勝オッズ"),
        ("win_odds_1", "指時系1・単勝"),
        ("win_odds_1_pop", "指時系1・人気"),
        ("win_odds_2", "指時系2・単勝"),
        ("win_odds_2_pop", "指時系2・人気"),
        ("win_mul_odds_Hi", "複勝オッズ上限"),
        ("win_mul_odds_Lo", "複勝オッズ下限"),
        ("win_mul_odds_1_Hi", "複上1"),
        ("win_mul_odds_1_Lo", "指時系1・複下"),
        ("win_mul_odds_1_pop", "複人気1"),
        ("win_mul_odds_2_Hi", "複上2"),
        ("win_mul_odds_2_Lo", "指時系2・複下"),
        ("win_mul_odds_2_pop", "複人気2"),
        ("rank", "確定着順"),
        ("time_diff", "着差"),
        ("time", "走破タイム"),
        ("corner1_rank", "1角"),
        ("corner2_rank", "2角"),
        ("corner3_rank", "3角"),
        ("corner4_rank", "4角"),
        ("last_3F_time", "上り3F"),
        ("last_3F_rank", "上り3F順"),
        ("Ave_3F", "Ave-3F"),
    ))

    # These three raw columns are kept as-is, so take them off the drop list.
    for kept_name in ("PCI", "PCI3", "RPCI"):
        original_col.remove(kept_name)

    _add_columns((
        ("last_3F_time_diff", "上3F地点差"),
        ("leg", "脚質"),
        ("pop", "人気"),
        ("prize", "賞金"),
        ("error_code", "異常コード"),
        ("father", "種牡馬"),
        ("mother", "母馬"),
        ("broodmare_sire", "母父馬"),
        ("broodmare_sire_type", "母父タイプ名"),
        ("horse_color", "毛色"),
        ("id", "血統登録番号"),
    ))

    df = df.drop(columns=original_col)

    df.info()

    return df

In [None]:
def feature_engineering(df_to_copy, feature_col_to_copy=None):
    """Add history-based features to a race-entry frame and standardise them.

    Works on a copy of the input. The steps, in order, are:

    1. per-horse lags (1, 2 and 3 races back), their mean, and the mean over
       all earlier races for a handful of measurements;
    2. running win / top-3 rates from ``grouped_winning_rate`` and
       ``grouped_horse_winning_rate`` for many column combinations;
    3. weight ratios, prize earned so far, "same as last race" flags and the
       number of days/weeks since the previous race;
    4. a z-score of every numeric feature within each ``id_for_fold`` group.

    Parameters
    ----------
    df_to_copy : pandas.DataFrame
        One row per horse per race. Besides the columns named in the lists
        below, this function reads ``horse``, ``prize``, ``datetime`` and
        ``id_for_fold``; the rate helpers additionally read ``target`` and
        ``target3``.
        NOTE(review): every "previous race" feature depends on row order
        within a horse, so the frame is assumed to be sorted oldest-first --
        confirm that the upstream processing guarantees this.
    feature_col_to_copy : list of str, optional
        Feature names to start from. Defaults to the basic entry columns.
        The passed list is not modified.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The extended frame and the full list of feature column names.
        The feature list is printed and ``df.tail()`` is displayed as a
        side effect.
    """
    if feature_col_to_copy is None:
        feature_col_to_copy = ["waku_num", "horse_num", "sex", "age", "basis_weight", "blinker", "weight", "inc_dec"]
    feature_col = list(feature_col_to_copy)
    df = df_to_copy.copy()

    # ------------------------------------------------------------------
    # 1. Per-horse history: last 1-3 races, their mean, and all-time mean.
    # ------------------------------------------------------------------
    last_race_col = ["weight", "inc_dec", "last_3F_time", "Ave_3F", "PCI"]
    for col in last_race_col:
        per_horse = df.groupby("horse", observed=True)[col]

        lag_names = [f"{col}_last_{i}" for i in range(1, 4)]
        for i, colname in enumerate(lag_names, start=1):
            # Shift by i (not by a constant 1) so that *_last_2 / *_last_3
            # really hold the values from two and three races back.
            df[colname] = per_horse.shift(i)
            feature_col.append(colname)

        # Mean of whichever of the three lags exist.
        mean_name = f"{col}_mean_last_1_to_3"
        df[mean_name] = df[lag_names].mean(axis=1, skipna=True)
        feature_col.append(mean_name)

        # Mean over all earlier races of the same horse. Missing values are
        # excluded from both the sum and the count, so one missing record
        # neither drags the mean down nor blanks the feature on its own row.
        values = df[col]
        present = values.notna()
        filled = values.where(present, 0)
        present_int = present.astype("int64")
        prior_sum = filled.groupby(df["horse"], observed=True).cumsum() - filled
        prior_count = present_int.groupby(df["horse"], observed=True).cumsum() - present_int
        all_name = f"{col}_mean_all"
        df[all_name] = prior_sum / prior_count.replace(0, np.nan)
        feature_col.append(all_name)

    # ------------------------------------------------------------------
    # 2. Running win / top-3 rates.
    #    The helpers are always given *lists*: pandas treats a tuple passed
    #    to groupby as a single key, not as several columns.
    # ------------------------------------------------------------------
    # Rate over all earlier races of the same horse.
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["horse"])

    # Rates of the same horse under the same condition(s).
    horse_condition_sets = [
        ["dist"],
        ["track_code"],
        ["field_type"],
        ["turn_type"],
        ["weather"],
        ["state"],
        ["place"],
        ["corner_num"],
        ["class_code"],
        ["basis_weight"],
        ["age_code"],
        ["weight_code"],
        ["jockey_id"],
        ["jockey_id", "class_code"],
        ["jockey_id", "place"],
        ["jockey_id", "dist"],
        ["jockey_id", "field_type"],
        ["jockey_id", "place", "dist"],
        ["jockey_id", "place", "field_type", "dist"],
        ["weather", "state"],
        ["dist", "corner_num"],
        ["dist", "track_code"],
        ["dist", "class_code"],
        ["place", "field_type", "dist"],
        ["place", "field_type", "dist", "class_code"],
    ]
    for cols in horse_condition_sets:
        df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=list(cols))

    # Rates over every horse's earlier races under the same condition(s).
    # Course / draw related combinations (dist, field_type, place, race_type).
    course_sets = [
        ["dist", "waku"],
        ["dist", "waku_num"],
        ["dist", "horse_num"],
        ["field_type", "waku"],
        ["field_type", "waku_num"],
        ["field_type", "horse_num"],
        ["place", "waku"],
        ["place", "waku_num"],
        ["place", "horse_num"],

        ["field_type", "dist", "waku"],
        ["field_type", "dist", "waku_num"],
        ["field_type", "dist", "horse_num"],
        ["dist", "place", "waku"],
        ["dist", "place", "waku_num"],
        ["dist", "place", "horse_num"],
        ["field_type", "place", "waku"],
        ["field_type", "place", "waku_num"],
        ["field_type", "place", "horse_num"],

        ["race_type", "waku"],
        ["race_type", "waku_num"],
        ["race_type", "horse_num"],
    ]

    # Running-style ("leg") rates are disabled on purpose: `leg` is only
    # known after the race, so using it would leak the outcome. The
    # combinations that used to be fed to grouped_leg_winning_rate were:
    #   ["leg"], ["leg", "dist"], ["leg", "place"], ["leg", "field_type"],
    #   ["leg", "place", "field_type"], ["leg", "place", "dist"],
    #   ["leg", "dist", "field_type"], ["leg", "race_type"],
    #   ["leg", "race_type", "waku"], ["leg", "race_type", "waku_num"],
    #   ["leg", "race_type", "horse_num"]

    # Jockey related combinations.
    jockey_sets = [
        ["jockey_id"],
        ["jockey_id", "place"],
        ["jockey_id", "place", "dist"],
        ["jockey_id", "field_type"],
        ["jockey_id", "field_type", "dist"],
        ["jockey_id", "field_type", "place"],
        ["jockey_id", "dist"],
        ["jockey_id", "race_type"],
        ["jockey_id", "race_type", "waku"],
        ["jockey_id", "race_type", "waku_num"],
        ["jockey_id", "race_type", "horse_num"],
        ["jockey_id", "class_code"],
        ["jockey_id", "class_code", "place"],
        ["jockey_id", "class_code", "dist"],
        ["jockey_id", "class_code", "field_type"],
        ["jockey_id", "class_code", "race_type"],
        ["jockey_id", "class_code", "race_type", "waku"],
        ["jockey_id", "class_code", "race_type", "waku_num"],
        ["jockey_id", "class_code", "race_type", "horse_num"],
        ["jockey_id", "turn_type"],
        ["jockey_id", "turn_type", "dist"],
    ]

    # Dam / sire related combinations: the same suffixes are applied to
    # "mother" first and then to "father".
    parent_suffixes = [
        [],
        ["place"],
        ["dist"],
        ["field_type"],
        ["turn_type"],
        ["race_type"],
        ["track_code"],
        ["class_code"],
        ["corner_num"],
        ["waku"],
        ["waku_num"],
        ["horse_num"],

        ["place", "dist"],
        ["place", "field_type"],
        ["dist", "field_type"],

        ["place", "dist", "waku"],
        ["place", "dist", "waku_num"],
        ["place", "dist", "horse_num"],
        ["dist", "field_type", "waku"],
        ["dist", "field_type", "waku_num"],
        ["dist", "field_type", "horse_num"],
        ["field_type", "place", "waku"],
        ["field_type", "place", "waku_num"],
        ["field_type", "place", "horse_num"],

        ["race_type", "waku"],
        ["race_type", "waku_num"],
        ["race_type", "horse_num"],
    ]
    parent_sets = [
        [parent, *suffix]
        for parent in ("mother", "father")
        for suffix in parent_suffixes
    ]

    for cols in course_sets + jockey_sets + parent_sets:
        df, feature_col = grouped_winning_rate(df, feature_col, cols=list(cols))

    # ------------------------------------------------------------------
    # 3. Miscellaneous features.
    # ------------------------------------------------------------------
    # Weight ratios. Body weight is floored at 300 in the denominators
    # because no horse weighs 300 kg or less (per the original author).
    floored_weight = df["weight"].clip(lower=300)
    df["basis_weight_per_weight"] = df["basis_weight"] / floored_weight * 100  # carried weight / body weight (%)
    feature_col.append("basis_weight_per_weight")
    df["basis_weight_plus_weight"] = df["basis_weight"] + df["weight"]  # carried weight + body weight
    feature_col.append("basis_weight_plus_weight")
    df["inc_dec_rate"] = df["inc_dec"] / floored_weight * 100  # weight change / body weight (%)
    feature_col.append("inc_dec_rate")

    # Prize money earned before this race (running total minus this row).
    df["lifetime_prize"] = df.groupby("horse", observed=True)["prize"].cumsum() - df["prize"]
    feature_col.append("lifetime_prize")

    # Flags: does this race share field type / class / jockey with the
    # horse's previous race? A horse's first row has no previous race and
    # therefore compares as False.
    same_as_last = [
        ("field_type", "is_same_field_type_as_last"),
        ("class_code", "is_same_class_code_as_last"),
        ("jockey_id", "is_same_jockey_as_last"),
    ]
    for source_col, feature_name in same_as_last:
        previous = df.groupby(["horse"], observed=True)[source_col].shift(1)
        df[feature_name] = (df[source_col] == previous).astype("category")
        feature_col.append(feature_name)

    # Days and whole weeks since the horse's previous race.
    last_race_date = df.groupby("horse", observed=True)["datetime"].shift(1)
    interval = df["datetime"] - last_race_date
    df["interval_day"] = interval.dt.days
    df["interval_week"] = df["interval_day"] // 7
    feature_col.append("interval_day")
    feature_col.append("interval_week")

    # ------------------------------------------------------------------
    # 4. Standardise every numeric feature within each id_for_fold group
    #    (mean 0, std 1). Groups with a single row or zero spread yield NaN.
    # ------------------------------------------------------------------
    num_col = df[feature_col].select_dtypes(include=["number"]).columns.tolist()
    per_fold = df.groupby("id_for_fold", observed=True)[num_col]
    grouped_mean = per_fold.transform("mean")
    grouped_std = per_fold.transform("std")
    df[num_col] = (df[num_col] - grouped_mean) / grouped_std

    # TODO (from the original author): add rank-based features as well.

    # Show the result.
    print(feature_col)
    display(df.tail())

    return df, feature_col


# 馬でグループ化したtarget-encodingをする関数
def grouped_horse_winning_rate(df_to_copy, feature_col_to_copy, cols=None):
    """Add a horse's own running win / top-3 rates under the given conditions.

    Rows are grouped by ``["horse", *cols]``. For each row the rate is the sum
    of the target over the *earlier* rows of its group divided by the number
    of those rows; the first row of a group gets NaN. Two columns are added:

    * ``horse_win_rate_<cols joined by "_">``  from ``target``  (win rate)
    * ``horse_win_rate3_<cols joined by "_">`` from ``target3`` (top-3 rate)

    Parameters
    ----------
    df_to_copy : pandas.DataFrame
        Must contain ``horse``, ``target``, ``target3`` and every name in
        ``cols``. "Earlier" means earlier in row order, so the frame is
        expected to be in chronological order. Not modified.
    feature_col_to_copy : list of str
        Existing feature names. Not modified.
    cols : list of str
        Condition columns to group by in addition to ``horse``. Pass a list.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of the frame with the two new columns, and a copy of the
        feature list with their names appended (win rate first).

    Raises
    ------
    ValueError
        If ``cols`` is None. Previously this printed a message and returned
        None, which made callers fail later when unpacking the result.
    """
    if cols is None:
        raise ValueError("Error: please select cols")

    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    group_keys = ["horse", *cols]
    suffix = "_".join(cols)
    for target_col, prefix in (("target", "horse_win_rate_"), ("target3", "horse_win_rate3_")):
        per_group = df.groupby(group_keys, observed=True)[target_col]
        # Running total minus the current row = total over earlier rows only,
        # which keeps the current race's outcome out of its own feature.
        prior_hits = per_group.cumsum() - df[target_col]
        # cumcount() is the number of earlier rows; 0 -> NaN avoids 0/0.
        prior_runs = per_group.cumcount()

        feature_name = prefix + suffix
        df[feature_name] = prior_hits / prior_runs.replace(0, np.nan)
        feature_col.append(feature_name)

    return df, feature_col


# 過去全てのレースでグループ化したtarget-encodingをする関数
def grouped_winning_rate(df_to_copy, feature_col_to_copy, cols):
    """Add running win / top-3 rates over all earlier rows sharing ``cols``.

    For every row, the running totals of its ``cols`` group are reduced by the
    running totals of its ``["id_for_fold", *cols]`` group. What remains is
    the target sum and row count of the rows that come before it in the
    ``cols`` group but belong to a different ``id_for_fold``. Rows with no
    such history get NaN.

    Two columns are added, in this order:

    * ``all_win_rate_<cols joined by "_">``  from ``target``
    * ``all_win_rate3_<cols joined by "_">`` from ``target3``

    NOTE(review): ``id_for_fold`` looks like a per-race identifier, which
    would make this "rate over previous races" -- confirm where the column
    is created.

    Parameters
    ----------
    df_to_copy : pandas.DataFrame
        Must contain ``id_for_fold``, ``target``, ``target3`` and every name
        in ``cols``. Not modified.
    feature_col_to_copy : list of str
        Existing feature names. Not modified.
    cols : list of str
        Columns that define "the same condition".

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of the frame with the two new columns, and a copy of the
        feature list with their names appended.
    """
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    by_condition = df.groupby(cols, observed=True)
    by_fold_and_condition = df.groupby(["id_for_fold", *cols], observed=True)
    suffix = "_".join(cols)

    for label, prefix in (("target", "all_win_rate_"), ("target3", "all_win_rate3_")):
        overall = by_condition[label]
        current = by_fold_and_condition[label]

        # Numerator: target total over rows from other, earlier folds.
        numerator = overall.cumsum() - current.cumsum()
        # Denominator: number of those rows (0 becomes NaN to avoid 0/0).
        denominator = overall.cumcount() - current.cumcount()

        feature_name = prefix + suffix
        feature_col.append(feature_name)
        df[feature_name] = numerator / denominator.replace(0, np.nan)

    return df, feature_col

In [4]:
# Load the raw training export (the CSV is Shift-JIS encoded) and preview it.
df = pd.read_csv("../Data/train_data_JV.csv", encoding="shift-jis")
df.head()

Unnamed: 0,レースID(新),日付,開催,場所,Ｒ,馬名S,騎手コード,調教師コード,頭数,枠番,...,脚質,人気,賞金,異常コード,種牡馬,母馬,母父馬,母父タイプ名,毛色,血統登録番号
0,202506080503021211,250608,3東2,東京,12,ゴールデンオスカー,1150,1189,14,7,...,中団,7.0,1550,0,スクリーンヒーロー,バラベルサイユ,アグネスデジタル,ネイティヴダンサー系,栗毛,2021101235
1,202506080503021212,250608,3東2,東京,12,サノノワンダー,1140,1127,14,7,...,後方,1.0,620,0,ヘニーヒューズ,プレトリアン,キングヘイロー,ニアークティック系,鹿毛,2022102039
2,202506080503021209,250608,3東2,東京,12,トーホウキザン,1215,1135,14,6,...,後方,13.0,390,0,トーホウジャッカル,トーホウドルチェ,サウスヴィグラス,ネイティヴダンサー系,栗毛,2020101764
3,202506080503021202,250608,3東2,東京,12,マンダリンボレロ,1085,1031,14,2,...,先行,3.0,230,0,スピルバーグ,ジャーメイン,アドマイヤムーン,ネイティヴダンサー系,栗毛,2020101655
4,202506080503021204,250608,3東2,東京,12,グラヴィス,1163,1075,14,3,...,先行,8.0,155,0,ハーツクライ,ラヴズオンリーミー,Storm Cat,ニアークティック系,鹿毛,2021105831


In [5]:
# Swap the raw columns for the English-named / derived ones.
# NOTE: `df` is rebound to the processed frame, and preprocessing() drops the
# raw columns it reads, so re-running this cell without re-loading the CSV
# fails with a KeyError.
df = preprocessing(df)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66749 entries, 0 to 66748
Data columns (total 65 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PCI                  66241 non-null  float64
 1   PCI3                 66749 non-null  float64
 2   RPCI                 66749 non-null  float64
 3   race_id              66749 non-null  int64  
 4   year                 66749 non-null  int64  
 5   month                66749 non-null  int64  
 6   day                  66749 non-null  int64  
 7   times                66749 non-null  int64  
 8   place                66749 non-null  object 
 9   daily                66749 non-null  object 
 10  race_num             66749 non-null  int64  
 11  horse                66749 non-null  object 
 12  jockey_id            66749 non-null  int64  
 13  trainer_id           66749 non-null  int64  
 14  horse_N              66749 non-null  int64  
 15  waku_num             66749 non-null 

Unnamed: 0,PCI,PCI3,RPCI,race_id,year,month,day,times,place,daily,...,leg,pop,prize,error_code,father,mother,broodmare_sire,broodmare_sire_type,horse_color,id
0,53.0,52.6,47.6,202506080503021211,25,6,8,3,東京,2,...,中団,7.0,1550,0,スクリーンヒーロー,バラベルサイユ,アグネスデジタル,ネイティヴダンサー系,栗毛,2021101235
1,52.6,52.6,47.6,202506080503021212,25,6,8,3,東京,2,...,後方,1.0,620,0,ヘニーヒューズ,プレトリアン,キングヘイロー,ニアークティック系,鹿毛,2022102039
2,52.2,52.6,47.6,202506080503021209,25,6,8,3,東京,2,...,後方,13.0,390,0,トーホウジャッカル,トーホウドルチェ,サウスヴィグラス,ネイティヴダンサー系,栗毛,2020101764
3,46.0,52.6,47.6,202506080503021202,25,6,8,3,東京,2,...,先行,3.0,230,0,スピルバーグ,ジャーメイン,アドマイヤムーン,ネイティヴダンサー系,栗毛,2020101655
4,46.0,52.6,47.6,202506080503021204,25,6,8,3,東京,2,...,先行,8.0,155,0,ハーツクライ,ラヴズオンリーミー,Storm Cat,ニアークティック系,鹿毛,2021105831


In [6]:
# Run the shared processing step from the project's own my_modules package.
# NOTE(review): later cells use columns that preprocessing() does not create
# (e.g. id_for_fold, target, target3), so they are presumably added here --
# confirm in my_modules.common_process.
df = my_modules.common_process(df)

  df["place_num"] = df["place"].replace(place_dict).astype(int)


In [None]:
# 過去全てのレースでグループ化したtarget-encodingをする関数
def grouped_winning_rate(df_to_copy, feature_col_to_copy, cols):
    """Add running win / top-3 rates over all earlier rows sharing ``cols``.

    NOTE: this re-defines the function of the same name from the
    feature-engineering cell and shadows it once this cell has run. Keep the
    two copies in sync, or delete this one when the check is no longer needed.

    For every row, the running totals of its ``cols`` group are reduced by the
    running totals of its ``["id_for_fold", *cols]`` group. What remains is
    the target sum and row count of the rows that come before it in the
    ``cols`` group but belong to a different ``id_for_fold``. Rows with no
    such history get NaN.

    Two columns are added, in this order:

    * ``all_win_rate_<cols joined by "_">``  from ``target``
    * ``all_win_rate3_<cols joined by "_">`` from ``target3``

    Parameters
    ----------
    df_to_copy : pandas.DataFrame
        Must contain ``id_for_fold``, ``target``, ``target3`` and every name
        in ``cols``. Not modified.
    feature_col_to_copy : list of str
        Existing feature names. Not modified.
    cols : list of str
        Columns that define "the same condition".

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of the frame with the two new columns, and a copy of the
        feature list with their names appended.
    """
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    by_condition = df.groupby(cols, observed=True)
    by_fold_and_condition = df.groupby(["id_for_fold", *cols], observed=True)
    suffix = "_".join(cols)

    for label, prefix in (("target", "all_win_rate_"), ("target3", "all_win_rate3_")):
        overall = by_condition[label]
        current = by_fold_and_condition[label]

        # Numerator: target total over rows from other, earlier folds.
        numerator = overall.cumsum() - current.cumsum()
        # Denominator: number of those rows (0 becomes NaN to avoid 0/0).
        denominator = overall.cumcount() - current.cumcount()

        feature_name = prefix + suffix
        feature_col.append(feature_name)
        df[feature_name] = numerator / denominator.replace(0, np.nan)

    return df, feature_col

# Spot-check the helper: compute the rates per (waku_num, place) and show the
# top-3 rate for the first 30 rows with waku_num 1 at 中山.
df2, _ = grouped_winning_rate(df, [], cols=["waku_num", "place"])
df2[(df2.waku_num == 1) & (df2.place == "中山")][["id_for_fold", "waku_num", "place", "target3","all_win_rate3_waku_num_place"]].head(30)

Unnamed: 0,id_for_fold,waku_num,place,target3,all_win_rate3_waku_num_place
66569,2024010606010101,1,中山,0,
66560,2024010606010101,1,中山,0,
66548,2024010606010102,1,中山,0,0.0
66553,2024010606010102,1,中山,0,0.0
66535,2024010606010103,1,中山,0,0.0
66514,2024010606010104,1,中山,0,0.0
66508,2024010606010104,1,中山,1,0.0
66490,2024010606010105,1,中山,1,0.142857
66495,2024010606010105,1,中山,0,0.142857
66480,2024010606010106,1,中山,0,0.222222


In [9]:
# Keep only the rows with waku_num 1 or 2 and the few columns needed to
# re-derive the win rate by hand in the next cell.
tmp = df[(df["waku_num"] == 1) | (df["waku_num"] == 2)][["id_for_fold", "place", "horse", "waku_num","target"]]
tmp.head(30)

Unnamed: 0,id_for_fold,place,horse,waku_num,target
66569,2024010606010101,中山,コーストガード,1,0
66560,2024010606010101,中山,ミッキードラマー,1,0
66567,2024010606010101,中山,エテルノアキーロ,2,0
66557,2024010606010101,中山,グリントリッター,2,0
66746,2024010608010101,京都,メネラオス,2,0
66742,2024010608010101,京都,ヤルキゲンキフトシ,1,0
66747,2024010608010101,京都,オスピタリテ,1,0
66734,2024010608010101,京都,カフジテルビウム,2,0
66548,2024010606010102,中山,マーゴットエクラ,1,0
66549,2024010606010102,中山,クインズスピカ,2,0


In [14]:
# Re-derive the (place, waku_num) win rate step by step so the numerator
# ("bunsi") and denominator ("bunbo") can be inspected next to the result.
# Everything is computed on and filtered through `tmp2`; the earlier version
# mixed `tmp` and `tmp2` in the groupbys and the boolean masks.
tmp2 = tmp.copy()

grouped1 = tmp2.groupby(["place", "waku_num"], observed=True)["target"]
grouped2 = tmp2.groupby(["id_for_fold", "place", "waku_num"], observed=True)["target"]

# Totals over the whole condition group minus totals within the current
# id_for_fold group = history from other, earlier id_for_fold groups only.
tmp2["bunsi"] = grouped1.cumsum() - grouped2.cumsum()
tmp2["bunbo"] = grouped1.cumcount() - grouped2.cumcount()
tmp2["all_win_rate_waku_num"] = tmp2["bunsi"] / tmp2["bunbo"].replace(0, np.nan)

# Same display order as before: 中山/1, 京都/1, 中山/2, 京都/2.
for waku_num in (1, 2):
    for place in ("中山", "京都"):
        display(tmp2[(tmp2.place == place) & (tmp2.waku_num == waku_num)].head(15))

Unnamed: 0,id_for_fold,place,horse,waku_num,target,bunsi,bunbo,all_win_rate_waku_num
66569,2024010606010101,中山,コーストガード,1,0,0,0,
66560,2024010606010101,中山,ミッキードラマー,1,0,0,0,
66548,2024010606010102,中山,マーゴットエクラ,1,0,0,2,0.0
66553,2024010606010102,中山,ニシノフルール,1,0,0,2,0.0
66535,2024010606010103,中山,エルキーオ,1,0,0,4,0.0
66514,2024010606010104,中山,ノアファラオ,1,0,0,5,0.0
66508,2024010606010104,中山,タイキオナード,1,0,0,5,0.0
66490,2024010606010105,中山,マイネルモメンタム,1,0,0,7,0.0
66495,2024010606010105,中山,ジーティーオウジャ,1,0,0,7,0.0
66480,2024010606010106,中山,クロスザルビコン,1,0,0,9,0.0


Unnamed: 0,id_for_fold,place,horse,waku_num,target,bunsi,bunbo,all_win_rate_waku_num
66742,2024010608010101,京都,ヤルキゲンキフトシ,1,0,0,0,
66747,2024010608010101,京都,オスピタリテ,1,0,0,0,
66726,2024010608010102,京都,サンライズマウレア,1,0,0,2,0.0
66717,2024010608010102,京都,ハーバーライト,1,1,0,2,0.0
66703,2024010608010103,京都,ハッピーダンス,1,0,1,4,0.25
66716,2024010608010103,京都,ワンダーエトワール,1,0,1,4,0.25
66695,2024010608010104,京都,ジャスティンライズ,1,0,1,6,0.166667
66692,2024010608010104,京都,エクササイズ,1,0,1,6,0.166667
66667,2024010608010105,京都,エラトー,1,1,1,8,0.125
66669,2024010608010105,京都,パシフィックハイ,1,0,1,8,0.125


Unnamed: 0,id_for_fold,place,horse,waku_num,target,bunsi,bunbo,all_win_rate_waku_num
66567,2024010606010101,中山,エテルノアキーロ,2,0,0,0,
66557,2024010606010101,中山,グリントリッター,2,0,0,0,
66549,2024010606010102,中山,クインズスピカ,2,0,0,2,0.0
66551,2024010606010102,中山,ミルフルール,2,0,0,2,0.0
66529,2024010606010103,中山,ケイティブルーム,2,0,0,4,0.0
66525,2024010606010103,中山,ジェットブレイク,2,0,0,4,0.0
66506,2024010606010104,中山,マコタイガ,2,1,0,6,0.0
66513,2024010606010104,中山,ネオクラウン,2,0,0,6,0.0
66500,2024010606010105,中山,マイネルガンナー,2,0,1,8,0.125
66499,2024010606010105,中山,ネクタール,2,0,1,8,0.125


Unnamed: 0,id_for_fold,place,horse,waku_num,target,bunsi,bunbo,all_win_rate_waku_num
66746,2024010608010101,京都,メネラオス,2,0,0,0,
66734,2024010608010101,京都,カフジテルビウム,2,0,0,0,
66718,2024010608010102,京都,キタノハヤブサ,2,0,0,2,0.0
66721,2024010608010102,京都,マテンロウガーデン,2,0,0,2,0.0
66712,2024010608010103,京都,アルピニスト,2,0,0,4,0.0
66710,2024010608010103,京都,グディンナ,2,0,0,4,0.0
66687,2024010608010104,京都,マテンロウブラボー,2,0,0,6,0.0
66688,2024010608010104,京都,ケーヴァラ,2,0,0,6,0.0
66676,2024010608010105,京都,コーラルハート,2,0,0,8,0.0
66668,2024010608010105,京都,カズミクラーシュ,2,0,0,8,0.0
