In [20]:
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Read the pickled race-result table from disk. Despite its name, `model` holds
# the loaded table (it exposes `.columns`), not a trained model.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so this
# should only ever be pointed at a trusted, locally produced pickle.
table_path = "C:\\Users\\yasak\\Desktop\\mykeibaAI_ver1p0\\data\\race_result_table.pkl"
with open(table_path, mode="rb") as table_file:
    model = pickle.load(table_file)

# Show the column labels followed by how many there are.
column_labels = model.columns
print(column_labels)
print(f"Number of items: {len(column_labels)}")

Number of items: 16


In [11]:
# Plain-text preview of the first ten rows of the loaded table.
print(model.head(10))

       date       race_id  is_win rank    horse_id  weight_carried jockey_id  \
0  20231001  202309040910       1    1  2019105746            58.0     05203   
1  20231001  202309040910       0    2  2019102983            58.0     01093   
2  20231001  202309040910       0    3  2018104746            58.0     01174   
3  20231001  202309040910       0    4  2019100108            58.0     01157   
4  20231001  202309040910       0    5  2019103518            58.0     01128   
5  20231001  202309040910       0    6  2019102869            58.0     01176   
6  20231001  202309040910       0    7  2019105032            58.0     00732   
7  20231001  202309040910       0    8  2019101864            58.0     01095   
8  20231001  202309040910       0    9  2019102593            58.0     01116   
9  20231001  202309040910       0   10  2019105993            58.0     01154   

   popularity   odds  last3f trainer_id  body_weight sex  age  time_sec  \
0         5.0    7.2    35.9      01127     

In [13]:
def convert_sex(sex):
    """Encode a horse's sex label as a small integer code.

    For subscriptable input (normally a string) only the first element is
    inspected, so a label with trailing characters after the sex character
    is still recognised.

    Args:
        sex: Raw label whose first character is "牡", "牝" or "セ", or a number
            that is already one of the codes 0, 1, 2.

    Returns:
        0 for "牡", 1 for "牝", 2 for "セ" (gelding). A numeric input equal to
        0, 1 or 2 is returned as the matching ``int``, so applying the function
        to an already-encoded column leaves it unchanged instead of raising.
        ``np.nan`` is returned for everything else: unknown labels, empty
        strings, ``None``, NaN and other numbers.
    """
    codes = {"牡": 0, "牝": 1, "セ": 2}  # "セ" marks a gelding

    # Numbers cannot be subscripted; keep valid existing codes, drop the rest
    # (NaN compares unequal to every code and therefore maps to np.nan).
    if isinstance(sex, (int, float, np.integer, np.floating)):
        return int(sex) if sex in (0, 1, 2) else np.nan

    try:
        sex_char = str(sex[0])
    except (TypeError, IndexError, KeyError):
        # None / other non-subscriptable values, or an empty string.
        return np.nan

    return codes.get(sex_char, np.nan)

In [14]:
# Replace the raw sex labels with the codes produced by convert_sex, element by element.
model['sex'] = model['sex'].map(convert_sex)

In [16]:
# Display the first twenty rows to check the re-encoded `sex` column.
model.head(20)

Unnamed: 0,date,race_id,is_win,rank,horse_id,weight_carried,jockey_id,popularity,odds,last3f,trainer_id,body_weight,sex,age,time_sec,body_diff
0,20231001,202309040910,1,1,2019105746,58.0,5203,5.0,7.2,35.9,1127,494.0,0,4.0,111.3,-4.0
1,20231001,202309040910,0,2,2019102983,58.0,1093,8.0,22.2,36.3,1069,494.0,0,4.0,111.4,-6.0
2,20231001,202309040910,0,3,2018104746,58.0,1174,4.0,7.0,36.8,429,494.0,2,5.0,111.5,0.0
3,20231001,202309040910,0,4,2019100108,58.0,1157,11.0,33.9,37.1,1039,494.0,0,4.0,111.6,0.0
4,20231001,202309040910,0,5,2019103518,58.0,1128,10.0,30.1,36.8,425,532.0,0,4.0,111.6,8.0
5,20231001,202309040910,0,6,2019102869,58.0,1176,6.0,10.4,37.6,1159,522.0,0,4.0,111.7,-6.0
6,20231001,202309040910,0,7,2019105032,58.0,732,9.0,25.3,38.0,1075,524.0,0,4.0,112.0,-6.0
7,20231001,202309040910,0,8,2019101864,58.0,1095,7.0,13.1,37.8,1180,522.0,0,4.0,112.1,6.0
8,20231001,202309040910,0,9,2019102593,58.0,1116,12.0,124.1,36.5,1146,496.0,0,4.0,112.1,-6.0
9,20231001,202309040910,0,10,2019105993,58.0,1154,3.0,6.9,37.5,1002,520.0,0,4.0,112.3,4.0


In [None]:
# Persist the table (with the encoded `sex` column) under a separate file name,
# leaving the source pickle untouched.
pd.to_pickle(model, "C:\\Users\\yasak\\Desktop\\mykeibaAI_ver1p0\\data\\race_result_table_fixed.pkl")

In [22]:
# Columns fed to the classifier; every other column of the table is left out.
feature = [
    'weight_carried',
    'popularity',
    'odds',
    'body_weight',
    'sex',
    'age',
    'body_diff',
]

# X: input matrix restricted to the columns above; y: the is_win label column.
X = model.loc[:, feature]
y = model.loc[:, 'is_win']

In [23]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap both parts as LightGBM datasets. The evaluation set references the
# training set, and it is built from the very same rows as X_test / y_test.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [24]:
# LightGBM training configuration.
params = {
    # Two-class classification.
    'objective': 'binary',
    # Evaluation metric: binary log loss.
    'metric': 'binary_logloss',
    # Gradient-boosted decision trees.
    'boosting_type': 'gbdt',
    # Learning rate (smaller learns more cautiously but takes longer to train).
    'learning_rate': 0.1,
    # Leaves per tree (larger values allow a more complex model).
    'num_leaves': 31,
    # Suppress all log output.
    'verbose': -1,
    # Stop when there has been no improvement for 30 consecutive rounds.
    'early_stopping_round': 30,
    # Build at most 300 trees.
    'num_boost_round': 300
}

In [26]:
# Fit the booster, tracking the metric on both the training and the evaluation set.
ml_model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'eval'],
)

# Predicted probabilities from the best iteration, turned into 0/1 labels
# with a 0.5 cut-off.
y_pred_prob = ml_model.predict(X_test, num_iteration=ml_model.best_iteration)
y_pred = (y_pred_prob >= 0.5).astype(int)

# NOTE(review): the outputs recorded in this notebook show 5283 of the 5702
# hold-out rows are non-winners, so predicting 0 for every row would already
# score about 92.65% — read the accuracy below against that baseline.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Hold-out features side by side with the true label, the predicted label and
# the predicted probability.
results = X_test.assign(
    actual=y_test,
    predicted=y_pred,
    predicted_prob=y_pred_prob,
)

print(results.head(10))

Accuracy: 92.74%
       weight_carried  popularity   odds  body_weight  sex  age  body_diff  \
21107            57.0        12.0  110.6        446.0    0  3.0       -2.0   
17038            55.0         3.0    6.5        472.0    1  3.0       -8.0   
2281             54.0         7.0   73.5        466.0    1  3.0        8.0   
3378             58.0         1.0    3.0        496.0    2  5.0        2.0   
19085            56.0         9.0   45.2        480.0    2  6.0       -6.0   
13174            57.0        11.0   35.6        464.0    0  4.0        8.0   
4404             56.0         2.0    3.6        462.0    0  2.0       -2.0   
28181            51.0        11.0   42.0        412.0    1  3.0      -14.0   
16441            57.0         2.0    3.1        444.0    0  3.0        4.0   
14118            58.0         7.0   27.2        522.0    0  5.0       10.0   

       actual  predicted  predicted_prob  
21107       0          0        0.007538  
17038       0          0        0.1257

In [35]:
# Count how many hold-out rows carry each true label (the `actual` column was filled from y_test).
results["actual"].value_counts()

actual
0    5283
1     419
Name: count, dtype: int64

In [34]:
# Count how many hold-out rows were assigned each predicted label (0.5 probability cut-off).
results["predicted"].value_counts()

predicted
0    5679
1      23
Name: count, dtype: int64

In [27]:
# Label distribution over the whole table, before any train/test split.
model["is_win"].value_counts()

is_win
0    26439
1     2067
Name: count, dtype: int64

In [28]:
# Sum of the is_win labels in the training and the test split.
# NOTE(review): this equals the number of winners only if is_win is strictly 0/1 — the recorded value_counts suggest so.
y_train.sum(), y_test.sum()

(np.int64(1648), np.int64(419))

In [29]:
# Number of test rows predicted as 1 (y_pred holds 0/1 values, so the sum is the count of ones).
y_pred.sum()

np.int64(23)

In [30]:
# Total number of rows in the loaded table.
len(model)

28506