In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

module_path = (Path().resolve().parent/ "Modules")
sys.path.append(str(module_path))

import my_modules, model_tuner # 自作モジュール

In [2]:
names = [
    "race_id",
    "year",
    "month",
    "day",
    "times",
    "place",
    "daily",
    "race_num",
    "horse",
    "jockey_id",
    "horse_N",
    "waku_num",
    "horse_num",
    "class_code",
    "track_code",
    "corner_num",
    "dist",
    "state",
    "weather",
    "age_code",
    "sex",
    "age",
    "basis_weight",
    "blinker",
    "weight",
    "inc_dec",
    "weight_code",
    "win_odds",
    "rank",
    "time_diff",
    "time",
    "corner1_rank",
    "corner2_rank",
    "corner3_rank",
    "corner4_rank",
    "last_3F_time",
    "last_3F_rank",
    "Ave_3F",
    "PCI",
    "last_3F_time_diff",
    "leg",
    "pop",
    "prize",
    "error_code",
    "father",
    "mother",
    "id"
]


In [3]:
df = pd.read_csv("../Data/record_data_2023.csv", encoding="shift-jis",header=None, names=names)
df.head()

Unnamed: 0,race_id,year,month,day,times,place,daily,race_num,horse,jockey_id,...,Ave_3F,PCI,last_3F_time_diff,leg,pop,prize,error_code,father,mother,id
0,202307220101010101,23,7,22,1,札幌,1,1,ウィスピースノー,1095,...,35.8,53.8,0.8,後方,5.0,55,0,ワールドエース,ハイリマイリ,2021100648
1,202307220101010102,23,7,22,1,札幌,1,1,ロードスタウト,1157,...,35.6,51.4,0.6,中団,7.0,0,0,ロードカナロア,フィラデルフィア,2021100159
2,202307220101010103,23,7,22,1,札幌,1,1,コミックガール,1197,...,35.6,50.8,0.6,中団,4.0,0,0,シルバーステート,コイクレナイ,2021100265
3,202307220101010104,23,7,22,1,札幌,1,1,デビルシズカチャン,5339,...,35.3,51.1,0.3,先行,3.0,83,0,ベストウォーリア,シシリアンブリーズ,2021105553
4,202307220101010105,23,7,22,1,札幌,1,1,サトミノキラリ,1170,...,35.2,52.6,0.2,先行,1.0,550,0,ビッグアーサー,パレード,2021101429


In [4]:
df = my_modules.common_process(df)

  df["place_num"] = df["place"].replace(place_dict).astype(int)


- 欠損値埋め
- 中団、後方、追込、マクリをどう処理するか考える
- 脚質ごとにラベルをカウント
- 脚質ごとのラベルを正規化する
- target-encodingを脚質ごとのラベルで正規化する

In [None]:
def grouped_leg_winning_rate(df_to_copy, feature_col_to_copy, cols=None):
    if cols == None:
        raise ValueError("cols must be specified.")
    
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    # 欠損値処理(差しで大丈夫？)
    df["leg"] = df["leg"].fillna("差し")
    
    # 脚質予測をどうするか

    # 脚質の変換(必要か？)
    leg_dict = { # 改善の余地あり
        "中団": "差し",
        "後方": "追込",
        "ﾏｸﾘ": "先行"
    }
    df["leg"] = df["leg"].replace(leg_dict)

    # ダミー変数化
    df["concat_col"] = df["leg"].astype("str") # これにcolsを繋げてダミー変数にする
    for col in cols: # colを全部繋げる
        df["concat_col"] = df["concat_col"] + "_" + df[col].astype(str)

    dummies = pd.get_dummies(df["concat_col"], drop_first=False, dtype=int)
    df = pd.concat([df, dummies], axis=1)

    # 日付を作成
    df["date"] = df["year"]*10000 + df["month"]*100 + df["day"]


    # ダミー変数列（＝脚質名）を使って勝利数を掛け算
    for dummy in dummies.columns:
        df[f"{dummy}_target"] = df[dummy] * df["target"]
        df[f"{dummy}_target3"] = df[dummy] * df["target3"]

        grouped = df.groupby("date")[dummy].sum()
        cumsum = grouped.cumsum() - grouped # 当日を含まない同じ脚質の累積和

        grouped1 = df.groupby("date")[f"{dummy}_target"].sum()
        count1 = grouped1.cumsum() - grouped1 # 当日を含まない同じ脚質かつ勝った馬の累積和

        win_rate1 = count1/cumsum.replace(0, np.nan)
        win_rate1 = win_rate1.reset_index()
        win_rate1.columns = ["date", f"tmp_{dummy}_win_rate"]

        grouped3 = df.groupby("date")[f"{dummy}_target3"].sum()
        count3 = grouped3.cumsum() - grouped3 

        win_rate3 = count3/cumsum.replace(0, np.nan)
        win_rate3 = win_rate3.reset_index()
        win_rate3.columns = ["date", f"tmp_{dummy}_win_rate3"]

        df = pd.merge(df, win_rate1, how="left", on="date")
        df = pd.merge(df, win_rate3, how="left", on="date")
        df = df.drop([f"{dummy}_target", f"{dummy}_target3"], axis=1)
    

    # 新しい特徴量を作ってそこに得られた値を埋め込む
    # targetの場合
    feature_name = "all_win_rate_" + "_".join(cols)
    df[feature_name] = np.nan
    def embedding(row):
        for dummy in dummies.columns:
            if row[dummy] == 1:
                return row[f"tmp_{dummy}_win_rate"]
        
        return np.nan

    df[feature_name] = df.apply(embedding, axis=1)
    feature_col.append(feature_name)

    # target3の場合
    feature_name = "all_win_rate3_" + "_".join(cols)
    df[feature_name] = np.nan
    def embedding(row):
        for dummy in dummies.columns:
            if row[dummy] == 1:
                return row[f"tmp_{dummy}_win_rate3"]
        
        return np.nan

    df[feature_name] = df.apply(embedding, axis=1)
    feature_col.append(feature_name)

    # 要らない行を削除
    tmp_cols = [f"tmp_{leg_type}_win_rate" for leg_type in dummies.columns]
    tmp_cols.extend([f"tmp_{leg_type}_win_rate3" for leg_type in dummies.columns])
    df = df.drop(tmp_cols, axis=1)
    df = df.drop(dummies.columns.tolist(), axis=1)
    df = df.drop("date", axis=1)
    df = df.drop("concat_col", axis=1)
    

    return df, feature_col


In [74]:
def grouped_leg_winning_rate(df_to_copy, feature_col_to_copy, cols=None):
    if cols == None:
        raise ValueError("cols must be specified.")
    
    df = df_to_copy.copy()
    feature_col = feature_col_to_copy.copy()

    # 欠損値処理(差しで大丈夫？)
    df["leg"] = df["leg"].fillna("差し")
    
    # 脚質予測をどうするか

    # 脚質の変換(必要か？)
    leg_dict = { # 改善の余地あり
        "中団": "差し",
        "後方": "追込",
        "ﾏｸﾘ": "先行"
    }
    df["leg"] = df["leg"].replace(leg_dict)

    # ダミー変数化
    df["concat_col"] = df["leg"].astype("str") # これにcolsを繋げてダミー変数にする
    for col in cols: # colを全部繋げる
        df["concat_col"] = df["concat_col"] + "_" + df[col].astype(str)

    dummies = pd.get_dummies(df["concat_col"], drop_first=False, dtype=int)
    df = pd.concat([df, dummies], axis=1)

    # 日付を作成
    df["date"] = df["year"]*10000 + df["month"]*100 + df["day"]


    # ダミー変数列を使って勝利数を掛け算
    # ターゲット列を作成
    grouped_cols = {}
    for dummy in dummies.columns:
        grouped_cols[dummy] = df[dummy]
        grouped_cols[f"{dummy}_target"] = df[dummy] * df["target"]
        grouped_cols[f"{dummy}_target3"] = df[dummy] * df["target3"]

    grouped_df = pd.DataFrame(grouped_cols)
    grouped_df["date"] = df["date"]

    print("grouped_df")
    display(grouped_df["先行_1800"].head(30))

    grouper = grouped_df.groupby("date")


    for dummy in dummies.columns:
        # 当日を含まない同じ脚質の累積和
        grouped_cumsum = grouper[dummy].sum()
        cumsum = grouped_cumsum.cumsum() - grouped_cumsum 
        # 当日を含まない同じ脚質かつ勝った馬の累積和
        grouped_count = grouper[[f"{dummy}_target", f"{dummy}_target3"]].sum()
        count = grouped_count.cumsum() - grouped_count 

        # 累積和から勝率を計算
        cumsum = cumsum.replace(0, np.nan)
        count[f"{dummy}_target"] = count[f"{dummy}_target"] / cumsum
        count[f"{dummy}_target3"] = count[f"{dummy}_target3"] / cumsum
        
        win_rate = count.reset_index()
        win_rate.columns = ["date", f"tmp_{dummy}_win_rate", f"tmp_{dummy}_win_rate3"]

        try:
            merge_df = pd.concat([merge_df, win_rate[[f"tmp_{dummy}_win_rate", f"tmp_{dummy}_win_rate3"]]], axis=1)
        except:
            merge_df = win_rate

    print("merge_df")
    display(merge_df["tmp_先行_1800_win_rate"].head(30))
    
    df = pd.merge(left=df, right=merge_df, on="date", how="left")
    print("merged_df")
    display(df[(df.leg=="先行") & (df.dist==1800)][["day", "tmp_先行_1800_win_rate"]].head(30))

    # ここから下がおかしい

    # 新しい特徴量を作ってそこに得られた値を埋め込む
    df["dummies_sum"] = df[dummies.columns.tolist()].sum(axis=1, skipna=True)

    feature_name = "all_win_rate_" + "_".join(cols)
    feature_name3 = "all_win_rate3_" + "_".join(cols)
    df[feature_name] = 0
    df[feature_name3] = 0
    for dummy in dummies.columns:
        df[feature_name] += df[f"tmp_{dummy}_win_rate"].replace(np.nan, 0) * df[dummy]
        df[feature_name3] += df[f"tmp_{dummy}_win_rate3"].replace(np.nan, 0) * df[dummy]
    
    df[feature_name] = df[feature_name] / df["dummies_sum"]
    df[feature_name3] = df[feature_name3] / df["dummies_sum"]
    feature_col.append(feature_name)
    feature_col.append(feature_name3)

    # 要らない行を削除
    tmp_cols = [f"tmp_{leg_type}_win_rate" for leg_type in dummies.columns]
    tmp_cols.extend([f"tmp_{leg_type}_win_rate3" for leg_type in dummies.columns])
    df = df.drop(tmp_cols, axis=1)
    df = df.drop(dummies.columns.tolist(), axis=1)
    df = df.drop(["date", "concat_col", "dummies_sum"], axis=1)
    

    return df, feature_col


In [75]:
df2, feature_col = grouped_leg_winning_rate(df, [], cols=["dist"])

grouped_df


19441    0
19442    0
19456    0
19454    0
19453    0
19452    0
19451    0
19450    0
19455    0
19448    0
19447    0
19446    0
19445    0
19444    0
19443    0
19449    0
26764    1
26763    0
26762    0
26761    1
26758    0
26759    1
26757    0
26753    0
26754    0
26765    0
26760    0
26755    0
26756    0
19471    0
Name: 先行_1800, dtype: int64

merge_df


0          NaN
1     0.080000
2     0.078431
3     0.067568
4     0.088235
5     0.099237
6     0.127389
7     0.127660
8     0.123894
9     0.132530
10    0.127820
11    0.126761
12    0.132013
13    0.135802
14    0.142450
15    0.142105
16    0.138350
17    0.138009
18    0.135417
19    0.129921
20    0.129390
21    0.128028
22    0.130148
23    0.129688
24    0.130564
25    0.130014
26    0.135279
27    0.136654
28    0.137376
29    0.135167
Name: tmp_先行_1800_win_rate, dtype: float64

merged_df


Unnamed: 0,day,tmp_先行_1800_win_rate
16,5,
19,5,
21,5,
30,5,
37,5,
41,5,
65,5,
67,5,
68,5,
95,5,


In [77]:
df2 = df2[(df2.leg =="先行") & (df2.dist == 1800)]
df2[["day","target", "all_win_rate_dist", "target3", "all_win_rate3_dist"]].head(30)

Unnamed: 0,day,target,all_win_rate_dist,target3,all_win_rate3_dist
16,5,0,0.0,0,0.0
19,5,0,0.0,0,0.0
21,5,0,0.0,0,0.0
30,5,0,0.0,0,0.0
37,5,0,0.0,0,0.0
41,5,0,0.0,0,0.0
65,5,0,0.0,0,0.0
67,5,0,0.0,0,0.0
68,5,0,0.0,1,0.0
95,5,0,0.0,0,0.0
