In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

module_path = (Path().resolve().parent/ "Modules")
sys.path.append(str(module_path))

import my_modules, model_tuner # 自作モジュール

In [2]:
names = [
    "race_id",
    "year",
    "month",
    "day",
    "times",
    "place",
    "daily",
    "race_num",
    "horse",
    "jockey_id",
    "horse_N",
    "waku_num",
    "horse_num",
    "class_code",
    "track_code",
    "corner_num",
    "dist",
    "state",
    "weather",
    "age_code",
    "sex",
    "age",
    "basis_weight",
    "blinker",
    "weight",
    "inc_dec",
    "weight_code",
    "win_odds",
    "rank",
    "time_diff",
    "time",
    "corner1_rank",
    "corner2_rank",
    "corner3_rank",
    "corner4_rank",
    "last_3F_time",
    "last_3F_rank",
    "Ave_3F",
    "PCI",
    "last_3F_time_diff",
    "leg",
    "pop",
    "prize",
    "error_code",
    "father",
    "mother",
    "id"
]


In [3]:
df2023 = pd.read_csv("../Data/record_data_2023.csv", encoding="shift-jis",header=None, names=names)
df2022 = pd.read_csv("../Data/record_data_2022.csv", encoding="shift-jis",header=None, names=names)
df2021 = pd.read_csv("../Data/record_data_2021.csv", encoding="shift-jis",header=None, names=names)
df = pd.concat([df2021, df2022, df2023], axis=0)
df.head()

Unnamed: 0,race_id,year,month,day,times,place,daily,race_num,horse,jockey_id,...,Ave_3F,PCI,last_3F_time_diff,leg,pop,prize,error_code,father,mother,id
0,202106120101010101,21,6,12,1,札幌,1,1,ミエノベルル,1144,...,34.4,46.1,0.6,中団,6.0,0,0,エピファネイア,ベルルミエール,2018104780
1,202106120101010102,21,6,12,1,札幌,1,1,セリシア,666,...,34.1,47.7,0.3,先行,1.0,0,0,エイシンヒカリ,サワノパルファン,2018100570
2,202106120101010103,21,6,12,1,札幌,1,1,リトルロータス,1173,...,35.1,51.7,1.3,後方,16.0,0,0,ブラックタイド,パパラチア,2018101518
3,202106120101010104,21,6,12,1,札幌,1,1,マイネルニゲラ,1091,...,34.0,44.7,0.2,先行,11.0,0,0,ジョーカプチーノ,シーナリー,2018101447
4,202106120101010105,21,6,12,1,札幌,1,1,ペイシャケイティー,1015,...,34.5,50.0,0.7,中団,4.0,51,0,カレンブラックヒル,ヒシシャトル,2018101539


In [4]:
df = my_modules.common_process(df)

  df["place_num"] = df["place"].replace(place_dict).astype(int)


- 欠損値埋め
- 中団、後方、追込、マクリをどう処理するか考える
- 脚質ごとにラベルをカウント
- 脚質ごとのラベルを正規化する
- target-encodingを脚質ごとのラベルで正規化する

In [5]:
# legを含めた過去のレース結果からのtarget_encoding
def grouped_leg_winning_rate(df_to_copy, feature_col_to_copy, cols=None):
    if cols == None:
        raise ValueError("cols must be specified.")
    
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    # 欠損値処理(差しで大丈夫？)
    df["leg"] = df["leg"].fillna("差し")

    # ダミー変数化
    df["concat_col"] = df[cols[0]]# これにcolsを繋げてダミー変数にする
    for col in cols[1:]: # colを全部繋げる
        df["concat_col"] = df["concat_col"] + "_" + df[col].astype(str)

    dummies = pd.get_dummies(df["concat_col"], drop_first=False, dtype=int)
    df = pd.concat([df, dummies], axis=1)

    # 日付を作成
    df["date"] = df["year"]*10000 + df["month"]*100 + df["day"]

    # ダミー変数列を使って勝利数を掛け算
    # ターゲット列を作成
    grouped_cols = {}
    for dummy in dummies.columns:
        grouped_cols[dummy] = df[dummy]
        grouped_cols[f"{dummy}_target"] = df[dummy] * df["target"]
        grouped_cols[f"{dummy}_target3"] = df[dummy] * df["target3"]


    grouped_df = pd.DataFrame(grouped_cols)
    grouped_df["date"] = df["date"]

    grouper = grouped_df.groupby("date")

    for dummy in dummies.columns:
        # 当日を含まない同じ脚質の累積和
        grouped_cumsum = grouper[dummy].sum()
        cumsum = grouped_cumsum.cumsum() - grouped_cumsum 
        # 当日を含まない同じ脚質かつ勝った馬の累積和
        grouped_count = grouper[[f"{dummy}_target", f"{dummy}_target3"]].sum()
        count = grouped_count.cumsum() - grouped_count 

        # 累積和から勝率を計算
        cumsum = cumsum.replace(0, np.nan)
        count[f"{dummy}_target"] = count[f"{dummy}_target"] / cumsum
        count[f"{dummy}_target3"] = count[f"{dummy}_target3"] / cumsum
        
        win_rate = count.reset_index()
        win_rate.columns = ["date", f"tmp_{dummy}_win_rate", f"tmp_{dummy}_win_rate3"]

        try:
            merge_df = pd.concat([merge_df, win_rate[[f"tmp_{dummy}_win_rate", f"tmp_{dummy}_win_rate3"]]], axis=1)
        except:
            merge_df = win_rate


    print("merge_df")
    display(merge_df.tail())

    df = pd.merge(left=df, right=merge_df, how="left", on="date")

    # 各馬の過去の脚質の累積和を加える。
    leg_dummy = pd.get_dummies(df["leg"], drop_first=False, dtype=int)
    leg_dummy["horse"] = df["horse"]
    leg_cumsum = leg_dummy.groupby("horse", observed=True)[leg_dummy.columns[:-1]].cumsum()
    leg_df = leg_cumsum - leg_dummy[leg_dummy.columns[:-1]]
    leg_df_rename_dict = {
        "中団": "prev_leg_中団",
        "先行": "prev_leg_先行",
        "差し": "prev_leg_差し",
        "後方": "prev_leg_後方",
        "追込": "prev_leg_追込",
        "逃げ": "prev_leg_逃げ",
        "ﾏｸﾘ": "prev_leg_ﾏｸﾘ"
    }
    leg_df = leg_df.rename(columns=leg_df_rename_dict)

    print("leg_df")
    display(leg_df.tail())

    df = pd.concat([df, leg_df], axis=1)

    # 新しい特徴量を作ってそこに得られた値を埋め込む
    feature_name = "all_win_rate_" + "_".join(cols)
    feature_name3 = "all_win_rate3_" + "_".join(cols)
    df[feature_name] = np.nan
    df[feature_name3] = np.nan

    # 特徴量を選択して掛け算するための基準となる関数
    df["feature_name_row"] = ""
    for col in cols[1:]:
        df["feature_name_row"] = df["feature_name_row"] + "_" + df[col].astype(str) 
    df["feature_name_row1"] = df["feature_name_row"] + "_win_rate"
    df["feature_name_row3"] = df["feature_name_row"] + "_win_rate3"


    def embedding1(row):
        sum_of_win_rate = np.nan # 勝率の合計
        sum_of_legs = 0 # ダミー変数化した脚質の合計
        sum_of_win_rate_nan_flg = True # sum_of_win_rateがnanかどうかのフラグ

        for leg_type in df["leg"].unique().tolist():
            feature_name_row1_tmp = "tmp_" + str(leg_type) + row["feature_name_row1"]

            # row[feature_name_row_tmp]がない可能性があるので確認
            try:
                tmp = row[feature_name_row1_tmp]
            except:
                continue

            # もし列が存在して、条件にあう特徴量がnp.nanではないなら
            if not pd.isna(row[feature_name_row1_tmp]):
                if sum_of_win_rate_nan_flg:
                    sum_of_win_rate = 0 # np.nanではない行が一つでもあるならnp.nanを外す
                sum_of_win_rate_nan_flg = False
                sum_of_win_rate += row[feature_name_row1_tmp] * row["prev_leg_" + leg_type]
                sum_of_legs += row["prev_leg_"+ leg_type]
            
        if (not sum_of_win_rate_nan_flg) and (sum_of_legs != 0):
            return sum_of_win_rate / sum_of_legs
        else:
            return np.nan
    
    def embedding3(row):
        sum_of_win_rate = np.nan # 勝率の合計
        sum_of_legs = 0 # ダミー変数化した脚質の合計
        sum_of_win_rate_nan_flg = True # sum_of_win_rateがnanかどうかのフラグ

        for leg_type in df["leg"].unique().tolist():
            feature_name_row3_tmp = "tmp_" + str(leg_type) + row["feature_name_row3"]

            # row[feature_name_row_tmp]がない可能性があるので確認
            try:
                tmp = row[feature_name_row3_tmp]
            except:
                continue

            # もし列が存在して、条件にあう特徴量がnp.nanではないなら
            if not pd.isna(row[feature_name_row3_tmp]):
                if sum_of_win_rate_nan_flg:
                    sum_of_win_rate = 0 # np.nanではない行が一つでもあるならnp.nanを外す
                sum_of_win_rate_nan_flg = False
                sum_of_win_rate += row[feature_name_row3_tmp] * row["prev_leg_" + leg_type]
                sum_of_legs += row["prev_leg_"+ leg_type]
            
        if (not sum_of_win_rate_nan_flg) and (sum_of_legs != 0):
            return sum_of_win_rate / sum_of_legs
        else:
            return np.nan
        
    df[feature_name] = df.apply(embedding1, axis=1)
    df[feature_name3] = df.apply(embedding3, axis=1)
                

    feature_col.append(feature_name)
    feature_col.append(feature_name3)

    # 要らない行を削除
    tmp_cols = [f"tmp_{leg_type}_win_rate" for leg_type in dummies.columns]
    tmp_cols.extend([f"tmp_{leg_type}_win_rate3" for leg_type in dummies.columns])
    df = df.drop(tmp_cols, axis=1)
    df = df.drop(dummies.columns.tolist(), axis=1)
    df = df.drop(leg_df.columns.tolist(), axis=1)
    df = df.drop(["date", "concat_col", "feature_name_row", "feature_name_row1", "feature_name_row3"], axis=1)
    

    return df, feature_col


In [6]:
def feature_engineering(df_to_copy, feature_col_to_copy=None):
    if feature_col_to_copy == None :
        feature_col_to_copy = ["waku", "waku_num", "horse_num", "sex", "age", "basis_weight", "blinker", "weight", "inc_dec"]
    feature_col = feature_col_to_copy.copy()
    df = df_to_copy.copy()

    # 直近3レースの結果とその平均, 過去全てのレースの記録の平均を追加
    last_race_col = ["weight", "inc_dec", "last_3F_time", "Ave_3F", "PCI"]
    for col in last_race_col:
        grouped = df.groupby("horse", observed=True)[col]
        for i in range(1, 4):
            # 過去1-3レースの結果を追加
            colname = f"{col}_last_{i}"
            df[colname] = grouped.shift(1)
            feature_col.append(colname)
        
        # 過去3レース分の結果の平均を追加
        df[f"{col}_mean_last_1_to_3"] = df[[f"{col}_last_{i}" for i in range(1, 4)]].mean(axis=1, skipna=True)
        feature_col.append(f"{col}_mean_last_1_to_3")

        # 過去全レース文の特徴量を追加
        cumsum = grouped.cumsum()
        count = grouped.cumcount()
        df[f"{col}_mean_all"] = (cumsum - df[col]) / count.replace(0, np.nan)
        feature_col.append(f"{col}_mean_all")


    # 過去その馬の全てのレースの1着率
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["horse"])

    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["dist"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["track_code"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["field_type"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["turn_type"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["weather"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["state"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["place"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["corner_num"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["class_code"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["basis_weight"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["age_code"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["weight_code"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["jockey_id"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["jockey_id", "class_code"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["jockey_id", "place"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["jockey_id", "dist"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["jockey_id", "field_type"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["jockey_id", "place", "dist"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["jockey_id", "place", "field_type", "dist"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["weather", "state"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["dist", "corner_num"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["dist", "track_code"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["dist", "class_code"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["place", "field_type", "dist"])
    df, feature_col = grouped_horse_winning_rate(df, feature_col, cols=["place", "field_type", "dist", "class_code"])

    # 過去他の馬も含む全レースで同条件でのレースの1着の確率
    # dist, field_type, place, race_type系
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["dist", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["dist", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["dist", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["place", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["place", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["place", "horse_num"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "dist", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "dist", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "dist", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["dist", "place", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["dist", "place", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["dist", "place", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "place", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "place", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["field_type", "place", "horse_num"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["race_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["race_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["race_type", "horse_num"])


    # leg系(リーク情報なので一旦停止)
    '''
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "dist"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "place"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "field_type"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "place", "field_type"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "place", "dist"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "dist", "field_type"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg","race_type"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "race_type", "waku"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg", "race_type", "waku_num"])
    df, feature_col = grouped_leg_winning_rate(df, feature_col, cols=["leg","race_type", "horse_num"])
    '''

    # jockey_id系
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "place"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "place", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "field_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "field_type", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "field_type", "place"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "race_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "race_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "race_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "race_type", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code", "place"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code", "field_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code", "race_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code", "race_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code", "race_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "class_code", "race_type", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "turn_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["jockey_id", "turn_type", "dist"])


    # mother系
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "place"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "field_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "turn_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "race_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "track_code"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "class_code"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "corner_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "horse_num"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "place", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "place", "field_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "dist", "field_type"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "place", "dist", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "place", "dist", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "place", "dist", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "dist", "field_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "dist", "field_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "dist", "field_type", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "field_type", "place", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "field_type", "place", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "field_type", "place", "horse_num"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "race_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "race_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["mother", "race_type", "horse_num"])
    
    # father系
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "place"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "field_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "turn_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "race_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "track_code"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "class_code"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "corner_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "horse_num"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "place", "dist"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "place", "field_type"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "dist", "field_type"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "place", "dist", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "place", "dist", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "place", "dist", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "dist", "field_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "dist", "field_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "dist", "field_type", "horse_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "field_type", "place", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "field_type", "place", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "field_type", "place", "horse_num"])

    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "race_type", "waku"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "race_type", "waku_num"])
    df, feature_col = grouped_winning_rate(df, feature_col, cols=["father", "race_type", "horse_num"])
    


    # その他特徴量を追加
    # weightに関する特徴量
    # weightは300kg以下の馬がいないことからこのようにした。
    df["basis_weight_per_weight"] = df["basis_weight"] / df["weight"].clip(lower=300) * 100 # 斤量/馬体重（％）
    feature_col.append("basis_weight_per_weight")
    df["basis_weight_plus_weight"] = df["basis_weight"] + df["weight"] # 斤量＋馬体重
    feature_col.append("basis_weight_plus_weight")
    df["inc_dec_rate"] = df["inc_dec"] / df["weight"].clip(lower=300) * 100 # 増減/馬体重（％）
    feature_col.append("inc_dec_rate")

    # 生涯獲得賞金
    df["lifetime_prize"] = df.groupby("horse", observed=True)["prize"].cumsum() - df["prize"]
    feature_col.append("lifetime_prize")
    
    # 前回と同じfield_typeかどうか
    df["last_field_type"] = df.groupby(["horse"], observed=True)["field_type"].shift(1)
    feature_name = "is_same_field_type_as_last"
    df[feature_name] =  df["field_type"] == df["last_field_type"]
    df[feature_name] = df[feature_name].astype("category")
    df = df.drop(["last_field_type"], axis=1)
    feature_col.append(feature_name)

    # 前回と同じクラスか
    df["last_class_code"] = df.groupby(["horse"], observed=True)["class_code"].shift(1)
    feature_name = "is_same_class_code_as_last"
    df[feature_name] = df["class_code"] == df["last_class_code"]
    df[feature_name] = df[feature_name].astype("category")
    df = df.drop(["last_class_code"], axis=1)
    feature_col.append(feature_name)

    # 前回と同じジョッキーか
    df["last_jockey"] = df.groupby(["horse"], observed=True)["jockey_id"].shift(1)
    feature_name = "is_same_jockey_as_last"
    df[feature_name] = df["jockey_id"] == df["last_jockey"]
    df[feature_name] = df[feature_name].astype("category")
    df = df.drop(["last_jockey"], axis=1)
    feature_col.append(feature_name)

    # 中何日か
    df["last_race_date"] = df.groupby("horse", observed=True)["datetime"].shift(1)
    df["interval"] = df["datetime"] - df["last_race_date"]
    df["interval_day"] = df["interval"].dt.days
    df["interval_week"] = df["interval_day"] // 7
    df = df.drop(["last_race_date", "interval"], axis=1)
    feature_col.append("interval_day")
    feature_col.append("interval_week")


    # 最後に全体を正規化（std=1とする)
    num_col = df[feature_col].select_dtypes(include=["number"]).columns.tolist()
    grouped_mean = df.groupby("id_for_fold", observed=True)[num_col].transform("mean")
    grouped_std = df.groupby("id_for_fold", observed=True)[num_col].transform("std")
    df[num_col] = (df[num_col] - grouped_mean) / grouped_std

    # 後でランキング化とかも付ける予定

    # dfを表示
    print(feature_col)
    display(df.tail())

    return df, feature_col


# 馬でグループ化したtarget-encodingをする関数
def grouped_horse_winning_rate(df_to_copy, feature_col_to_copy, cols=None):
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    if cols == None :
        print("Error: please select cols")
        return
    
    # 1着の確率で計算
    grouped = df.groupby(["horse", *cols], observed=True)["target"]
    cumsum = grouped.cumsum()
    count = grouped.cumcount()
    feature_name = "horse_win_rate_" + "_".join(cols)
    df[feature_name] = (cumsum-df["target"]) / count.replace(0, np.nan)

    feature_col.append(feature_name)

    # 1-3着の確率で計算
    grouped = df.groupby(["horse", *cols], observed=True)["target3"]
    cumsum = grouped.cumsum()
    count = grouped.cumcount()
    feature_name = "horse_win_rate3_" + "_".join(cols)
    df[feature_name] = (cumsum-df["target3"]) / count.replace(0, np.nan)

    feature_col.append(feature_name)

    return df, feature_col


# 過去全てのレースでグループ化したtarget-encodingをする関数
def grouped_winning_rate(df_to_copy, feature_col_to_copy, cols):
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    # 同じ条件で1着になるの確率を計算
    grouped = df.groupby(cols, observed=True)["target"]
    count = grouped.cumcount()
    cumsum = grouped.cumsum()
    feature_name = "all_win_rate_" + "_".join(cols)
    df[feature_name] = (cumsum-df["target"]) / count.replace(0, np.nan)
    feature_col.append(feature_name)

    # 同じ条件で1-3着になるの確率を計算
    grouped = df.groupby(cols, observed=True)["target3"]
    count = grouped.cumcount()
    cumsum = grouped.cumsum()
    feature_name = "all_win_rate3_" + "_".join(cols)
    df[feature_name] = (cumsum-df["target3"]) / count.replace(0, np.nan)
    feature_col.append(feature_name)

    return df, feature_col


In [7]:
df2, feature_col = feature_engineering(df)


['waku', 'waku_num', 'horse_num', 'sex', 'age', 'basis_weight', 'blinker', 'weight', 'inc_dec', 'weight_last_1', 'weight_last_2', 'weight_last_3', 'weight_mean_last_1_to_3', 'weight_mean_all', 'inc_dec_last_1', 'inc_dec_last_2', 'inc_dec_last_3', 'inc_dec_mean_last_1_to_3', 'inc_dec_mean_all', 'last_3F_time_last_1', 'last_3F_time_last_2', 'last_3F_time_last_3', 'last_3F_time_mean_last_1_to_3', 'last_3F_time_mean_all', 'Ave_3F_last_1', 'Ave_3F_last_2', 'Ave_3F_last_3', 'Ave_3F_mean_last_1_to_3', 'Ave_3F_mean_all', 'PCI_last_1', 'PCI_last_2', 'PCI_last_3', 'PCI_mean_last_1_to_3', 'PCI_mean_all', 'all_win_rate_horse', 'all_win_rate3_horse', 'horse_win_rate_dist', 'horse_win_rate3_dist', 'horse_win_rate_track_code', 'horse_win_rate3_track_code', 'horse_win_rate_field_type', 'horse_win_rate3_field_type', 'horse_win_rate_turn_type', 'horse_win_rate3_turn_type', 'horse_win_rate_weather', 'horse_win_rate3_weather', 'horse_win_rate_state', 'horse_win_rate3_state', 'horse_win_rate_place', 'horse

Unnamed: 0,year,month,day,times,place,daily,race_num,horse,jockey_id,horse_N,...,all_win_rate3_father_race_type_horse_num,basis_weight_per_weight,basis_weight_plus_weight,inc_dec_rate,lifetime_prize,is_same_field_type_as_last,is_same_class_code_as_last,is_same_jockey_as_last,interval_day,interval_week
43875,2023,12,28,5,阪神,9,12,テイエムイダテン,1144,16,...,-0.036142,-0.065131,0.510481,2.100688,-1.662749,True,True,False,1.302078,1.297078
43876,2023,12,28,5,阪神,9,12,ハギノメーテル,1138,16,...,,-1.494779,0.800458,1.381712,-0.876352,True,True,True,-0.351695,-0.357166
43877,2023,12,28,5,阪神,9,12,クムシラコ,1112,16,...,,1.02543,-0.359451,-1.036144,-0.645983,True,False,False,-0.212091,-0.206781
43870,2023,12,28,5,阪神,9,12,メイショウドウドウ,1171,16,...,,-0.524519,0.897117,0.361544,0.710054,True,True,False,-0.512777,-0.507552
43874,2023,12,28,5,阪神,9,12,アネゴハダ,1186,16,...,-0.758981,0.001723,-0.45611,1.116584,2.143579,True,True,True,-0.57721,-0.582745


## lightGBMでコーナー通過順位を予測してみる

In [11]:
from sklearn.model_selection import BaseCrossValidator

def train_test_group_split(X, y, test_size=0.2, groups="id_for_fold"):

    unique_group = X[groups].unique()
    n_groups = X[groups].nunique()
    n_train = round(n_groups * (1-test_size))
    s_groups = pd.Series(X[groups])

    train_groups = unique_group[:n_train]
    test_groups = unique_group[n_train:]

    train_idx = np.where(s_groups.isin(train_groups))[0]
    test_idx = np.where(s_groups.isin(test_groups))[0]

    X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]

    return X_train, X_test, y_train, y_test

class GroupTimeSeriesSplit(BaseCrossValidator):
    def __init__(self, n_splits=5):
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        if groups is None:
            raise ValueError("groups must be provided")

        unique_groups = pd.Series(groups).drop_duplicates().values
        n_groups = len(unique_groups)

        if self.n_splits >= n_groups:
            raise ValueError("n_splits must be < n_groups")

        test_size = n_groups // (self.n_splits + 1)
        s_groups = pd.Series(groups)

        for i in range(self.n_splits):
            train_end   = (i+1) * test_size
            train_groups = unique_groups[:train_end]

            test_start  = train_end
            if i < self.n_splits - 1:
                test_end    = test_start + test_size
                test_groups  = unique_groups[test_start:test_end]
            else :
                test_groups = unique_groups[test_start:]

            train_idx = np.where(s_groups.isin(train_groups))[0]
            test_idx  = np.where(s_groups.isin(test_groups))[0]

            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits


In [45]:
import random
import lightgbm as lgb
from sklearn.metrics import root_mean_squared_error

def predict_cornerN_rank(df_to_copy, feature_col_to_copy):
    
    # 乱数シードを固定
    np.random.seed(42)
    random.seed(42)
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    cat_col = df.select_dtypes(include=["category", "object"]).columns.tolist()
    for col in cat_col:
        df[col] = df[col].astype("category")
    

    # 通貨順位の下限がhorse_Nによって異なるので、とりあえずhorse_N = 16で実験
    df_horse_N = df[df.horse_N == 16]
    print(feature_col)

    
    # 目的変数のスケーリング（MinMaxScaling）
    # 目的変数のearly stoppingの検出が鋭くなる（RSMEの微細な変化を検知しやすい）
    # 木構造による分割で目的変数の"粗さ"が消える（整数離散値では分割点を探すのが難しい）
    target_col = df_horse_N["corner1_rank"] 
    target_col = (target_col - target_col.min()) / (target_col.max() - target_col.min())
    df_horse_N.loc[:,"corner1_rank"] = target_col

    # とりあえずcorner1_rankを予測
    X, y = df_horse_N.drop(["corner1_rank"], axis=1), df_horse_N["corner1_rank"]
    X_train, X_test, y_train, y_test = train_test_group_split(X, y, test_size=0.3) # 訓練データとテストデータに分割
    # early_stopping用にX_trainを更に分割
    # early_stoppingは、valid_setsに入ってるvalidデータで学習を止めるか判断する
    X_train_tr, X_train_val, y_train_tr, y_train_val = train_test_group_split(X_train, y_train, test_size=0.3) 

    lgb_train_tr = lgb.Dataset(X_train_tr[feature_col], y_train_tr)
    lgb_train_val = lgb.Dataset(X_train_val[feature_col], y_train_val)

    # チューニング開始
    params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.015,
        "num_leaves": 40,
        "max_depth": 10,
        "min_data_in_leaf": 1,
        "min_data_in_bin": 1,
        "max_bin": 100,
        "seed": 42,
        "verbose": -1
    }

    # num_boost_roundをどの程度か確認
    model = lgb.train(params,
                    lgb_train_tr,
                    num_boost_round=10000,
                    valid_sets=[lgb_train_tr, lgb_train_val],
                    valid_names=["train_tr", "train_val"],
                    callbacks=[lgb.early_stopping(30, verbose=False)]
                    )
    
    y_test_pred = model.predict(X_test[feature_col])
    rmse_score = root_mean_squared_error(y_test, y_test_pred)

    print(f"RSME: {round(rmse_score, 10)}")
    print(f"RSME(reverse scaling): {round(rmse_score*15, 10)}")

In [46]:
predict_cornerN_rank(df2, feature_col)

['waku', 'waku_num', 'horse_num', 'sex', 'age', 'basis_weight', 'blinker', 'weight', 'inc_dec', 'weight_last_1', 'weight_last_2', 'weight_last_3', 'weight_mean_last_1_to_3', 'weight_mean_all', 'inc_dec_last_1', 'inc_dec_last_2', 'inc_dec_last_3', 'inc_dec_mean_last_1_to_3', 'inc_dec_mean_all', 'last_3F_time_last_1', 'last_3F_time_last_2', 'last_3F_time_last_3', 'last_3F_time_mean_last_1_to_3', 'last_3F_time_mean_all', 'Ave_3F_last_1', 'Ave_3F_last_2', 'Ave_3F_last_3', 'Ave_3F_mean_last_1_to_3', 'Ave_3F_mean_all', 'PCI_last_1', 'PCI_last_2', 'PCI_last_3', 'PCI_mean_last_1_to_3', 'PCI_mean_all', 'all_win_rate_horse', 'all_win_rate3_horse', 'horse_win_rate_dist', 'horse_win_rate3_dist', 'horse_win_rate_track_code', 'horse_win_rate3_track_code', 'horse_win_rate_field_type', 'horse_win_rate3_field_type', 'horse_win_rate_turn_type', 'horse_win_rate3_turn_type', 'horse_win_rate_weather', 'horse_win_rate3_weather', 'horse_win_rate_state', 'horse_win_rate3_state', 'horse_win_rate_place', 'horse

  df_horse_N.loc[:,"corner1_rank"] = target_col


RSME: 0.264085067
RSME(reverse scaling): 3.9612760045
