In [12]:
import pandas as pd
from statistics import mean
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve
import joblib
import re

# ランダムフォレスト回帰で予想するよ〜。
# こっちは単純な馬の順位
# df = pd.read_pickle('../DATA/df_for_learning_shin.pkl')

# こっちは馬の順位/頭数をスコア化したもの（小さい方がいい値）
df = pd.read_pickle('../DATA/df_for_learning_without_none.pkl')
horse_df = pd.read_pickle('../DATA/horse_result_of_5_years_shin.pkl')

score = []
for _, row in df.iterrows():
    try:
        if row['着順'] == 1:
            score.append(100)
        elif row['着順'] == 2:
            score.append(80)
        elif row['着順'] == 3:
            score.append(70)
        else:
            score.append(0)
    except TypeError:
        score.append(0)
        continue
df['スコア'] = score

# 騎手年齢の2乗を加えてみたよ。あんま意味なかったからコメントアウトしてるよ。
# jockey_old_2_dict = {}
# for race_id, df_child in df.groupby(df.index):
#     jockey_old_2 = []
#     for _, row in df_child.iterrows():
#         jockey_old_2.append((30 - row['騎手年齢']) ** 2)
#     jockey_old_2_se = pd.Series(jockey_old_2, index=[race_id] * len(df_child))
#     jockey_old_2_dict[race_id] = pd.concat([df_child, jockey_old_2_se], axis=1)
# df = pd.concat([jockey_old_2_dict[key] for key in jockey_old_2_dict])
# df = df.rename(columns={0: '騎手年齢2'})

# 訓練データとテストデータに分けるよ。
race_ids = df.index.unique().tolist()

# ランダムに訓練データを選ぶ
# train_race_ids, test_race_ids = train_test_split(race_ids, test_size=0.2, random_state=0)

# 2024年のデータをテストデータに選ぶ
train_race_ids = []
test_race_ids = []
for race_id in race_ids:
    if re.match(r'(2023|2022|2021|2020)\d+', race_id):
        train_race_ids.append(re.match(r'(2023|2022|2021|2020)\d+', race_id).group())
    else:
        test_race_ids.append(re.match(r'2024\d+', race_id).group())
        
train_df = df[df.index.isin(train_race_ids)]
test_df = df[df.index.isin(test_race_ids)]

# 学習に使うパラメータ(特徴量)だよ〜。ここを変えると結果が変わるよ。
# 今の所col = ['騎手直近単勝率', '騎手直近複勝率', '騎手経験値', '騎手年齢', '馬直近成績']で最大値36.1%だよ。
col = ['騎手直近単勝率', '騎手直近複勝率', '騎手経験値', '騎手年齢', '馬直近成績']
X_train = train_df[col]
y_train = train_df['スコア']
X_test = test_df[col]
y_test = test_df['スコア']

model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
# model = RandomForestRegressor()

y_pred = model.predict(X_test)

test_df['予測値'] = y_pred
test_df_with_rank = {}
for race_id, race_data in test_df.groupby(test_df.index):
    race_data['ランク'] = race_data['予測値'].rank(ascending=False)
    test_df_with_rank[race_id] = race_data
test_df_with_each_rank = pd.concat([test_df_with_rank[key] for key in test_df_with_rank])

ranked_test_dict = {}
for race_id, each_df in test_df_with_each_rank.groupby(test_df_with_each_rank.index):
    predict_list = []
    for _, row in each_df.iterrows():
#         print(row['ランク'])
        if row['ランク'] <= 3:
            predict_list.append('victory')
        else:
            predict_list.append("I'm a loser.")
    predict_se = pd.Series(predict_list, index=([race_id] * len(each_df)))
    ranked_test_dict[race_id] = pd.concat([each_df, predict_se], axis=1)
test_df_with_each_rank = pd.concat([ranked_test_dict[key] for key in ranked_test_dict])

test_df_2 = test_df_with_each_rank.rename(columns={0: '予測勝敗'})
test_df_2['齢'] = test_df_2['齢'].map(int)
test_col = ['斤量', '騎手直近単勝率', '騎手直近複勝率', '騎手年齢', '性', '齢', '馬直近成績', '騎手経験値', 'スコア']
print(test_df_2[test_col].corr()['スコア'])
print('全レースでの的中率は以下')
true_positive = []
false_positive = []
true_negative = []
false_negative = []
for _, row in test_df_2.iterrows():
    condition_tp = (row['勝敗'] == 'victory') and (row['予測勝敗'] == 'victory')
#     condition_fp = (row['勝敗'] == "I'm a loser.") and (row['予測勝敗'] == 'victory')
    condition_tn = (row['勝敗'] == "I'm a loser.") and (row['予測勝敗'] == "I'm a loser.")
#     condition_fn = (row['勝敗'] == "victory") and (row['予測勝敗'] == "I'm a loser.")
    true_positive.append(condition_tp)
#     false_positive.append(condition_fp)
    true_negative.append(condition_tn)
#     false_negative.append(condition_fn)
accuracy_tp = sum(true_positive) / len(true_positive)
accuracy_tn = sum(true_negative) / len(true_negative)
accuracy_t = (sum(true_positive) + sum(true_negative)) / len(true_positive)
# accuracy_fp = sum(false_positive) / len(false_positive)
# accuracy_fn = sum(false_negative) / len(false_negative)
print(accuracy_tp)
print(accuracy_tn)
print(f'正解率：{accuracy_t}')

target_list = []
for race_id, each_df in test_df_2.groupby(test_df_2.index):
    victories_df = each_df.head(3)
    for _, row in victories_df.iterrows():
        target_victory = (row['勝敗'] == 'victory') and (row['予測勝敗'] == 'victory')
        target_list.append(target_victory)
target_ratio = sum(target_list) / len(target_list)
print(target_ratio)
print('---------------------------------')

# モデルの保存
filename = '../keiba_app/trained_model/keiba_model.pkl'
joblib.dump(model, filename, compress=3)

# test_df_with_each_rank['予測勝敗'] = [predict[key] for key in predict]
# print(test_df_with_each_rank)
# accuracy_top = test_df.loc[test_df['予測値'] == 'top']
# print(accuracy_top)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['予測値'] = y_pred


斤量         0.046131
騎手直近単勝率    0.233774
騎手直近複勝率    0.234248
騎手年齢       0.048453
性          0.023478
齢         -0.079279
馬直近成績     -0.239027
騎手経験値      0.050505
スコア        1.000000
Name: スコア, dtype: float64
全レースでの的中率は以下
0.07338540452319887
0.6460130566565633
正解率：0.7193984611797621
0.32280431432973805
---------------------------------


['../keiba_app/trained_model/keiba_model.pkl']

In [119]:
import pandas as pd

df = pd.read_pickle('../DATA/recent_5_race_df_for_learning.pkl')

print(df.columns)

Index(['着順', '枠番', '馬番', '馬名', '性齢', '斤量', '騎手', 'タイム', '着差', '単勝', '人気',
       '馬体重', '調教師', 'コース長', '天気', 'レース場', '場の状態', '馬id', '騎手id', '調教師id', '性',
       '齢', '天気index', 'レース場index', '場の状態index', '開催年', '騎手直近単勝率', '騎手直近複勝率',
       '騎手経験値', '騎手年齢', '勝敗'],
      dtype='object')


In [147]:
import pandas as pd
from datetime import datetime as dt

df = pd.read_pickle('../DATA/df_for_learning_with_date.pkl')
horse_df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')

# dates = []
# for date in horse_df['日付']:
#     dates.append(dt.strptime(date, '%Y/%m/%d'))
# horse_df['日付'] = dates

for race_id, each_race_df in df.groupby(df.index):
    race_date = each_race_df['開催年月日'].unique()
    print(race_date)

['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-25T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-07-26T00:00:00.000000000']
['2020-08-01T00:00:00.000000000']
['2020-08-01T00:00:00.000000000']
['2020-08-01T00:00:00.000000000']
['2020-08-01T00:00:00.000000000']
['2020-08-01T00:00:00.000000000']
['2020-08-01T0

['2020-11-21T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-22T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-23T00:00:00.000000000']
['2020-11-28T00:00:00.000000000']
['2020-11-28T00:00:00.000000000']
['2020-11-28T00:00:00.000000000']
['2020-11-28T00:00:00.000000000']
['2020-11-28T0

['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-22T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-02-23T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T00:00:00.000000000']
['2020-04-25T0

['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-26T00:00:00.000000000']
['2021-06-26T00:00:00.000000000']
['2021-06-26T00:00:00.000000000']
['2021-06-26T0

['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-13T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-19T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T00:00:00.000000000']
['2021-06-20T0

['2021-09-11T00:00:00.000000000']
['2021-09-11T00:00:00.000000000']
['2021-09-11T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-12T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-18T00:00:00.000000000']
['2021-09-19T00:00:00.000000000']
['2021-09-19T00:00:00.000000000']
['2021-09-19T0

['2021-08-22T00:00:00.000000000']
['2021-08-22T00:00:00.000000000']
['2021-08-22T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-28T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-08-29T00:00:00.000000000']
['2021-09-04T00:00:00.000000000']
['2021-09-04T00:00:00.000000000']
['2021-09-04T0

['2022-02-05T00:00:00.000000000']
['2022-02-05T00:00:00.000000000']
['2022-02-05T00:00:00.000000000']
['2022-02-05T00:00:00.000000000']
['2022-02-05T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-06T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-12T00:00:00.000000000']
['2022-02-13T0

['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-23T00:00:00.000000000']
['2022-01-23T0

['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-15T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-16T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T00:00:00.000000000']
['2022-01-22T0

['2023-08-20T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-26T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-08-27T00:00:00.000000000']
['2023-09-02T00:00:00.000000000']
['2023-09-02T00:00:00.000000000']
['2023-09-02T00:00:00.000000000']
['2023-09-02T00:00:00.000000000']
['2023-09-02T0

['2023-09-16T00:00:00.000000000']
['2023-09-16T00:00:00.000000000']
['2023-09-16T00:00:00.000000000']
['2023-09-16T00:00:00.000000000']
['2023-09-16T00:00:00.000000000']
['2023-09-16T00:00:00.000000000']
['2023-09-16T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-17T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T00:00:00.000000000']
['2023-09-18T0

['2023-04-09T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-15T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-04-16T00:00:00.000000000']
['2023-06-03T00:00:00.000000000']
['2023-06-03T00:00:00.000000000']
['2023-06-03T00:00:00.000000000']
['2023-06-03T00:00:00.000000000']
['2023-06-03T0

['2024-04-28T00:00:00.000000000']
['2024-04-28T00:00:00.000000000']
['2024-04-28T00:00:00.000000000']
['2024-04-28T00:00:00.000000000']
['2024-04-28T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-04T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-05T00:00:00.000000000']
['2024-05-11T0

['2024-03-24T00:00:00.000000000']
['2024-03-24T00:00:00.000000000']
['2024-03-24T00:00:00.000000000']
['2024-03-24T00:00:00.000000000']
['2024-03-24T00:00:00.000000000']
['2024-03-24T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-30T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T00:00:00.000000000']
['2024-03-31T0

In [15]:
import pandas as pd
from statistics import mean
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

# horse_df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')
horse_df = pd.read_pickle('../DATA/horse_result_of_5_years_shin.pkl')
race_df = pd.read_pickle('../DATA/df_for_learning_with_date.pkl')

# dates = []
# ranks = []
# for _, row in horse_df.iterrows():
#     dates.append(dt.strptime(row['日付'], '%Y/%m/%d'))
#     try:
#         ranks.append(int(row['着順']))
#     except ValueError:
#         ranks.append('')
#         continue

# horse_df['日付'] = dates
# horse_df['着順'] = ranks
# horse_df.to_pickle('../DATA/horse_result_of_5_years_shin.pkl')


def horse_recent_victory_ratio(race_df, horse_df):
    '''
    馬の最近半年間の勝率を追加したレースデータが返ってくるよ。
    '''
    recent_horse_scores = []
    for race_id, row in race_df.iterrows():
        count = len(race_df.loc[race_df.index == race_id])

        each_horse_df = horse_df.loc[row['馬id'] == horse_df.index]
        ranks = []
        for _, target in each_horse_df.iterrows():
            if row['開催年月日'] + relativedelta(months=-6) <= target['日付'] < row['開催年月日']:
                if type(target['着順']) == int:
                    # 平均順位をスコアとしてみるよ。
                    ranks.append(target['着順'])
                else:
                    ranks.append(count / 2)
            elif target['日付'] == row['開催年月日']:
                # 工事中
                if 
        recent_horse_scores.append(mean(ranks))
        
    race_df['馬直近成績'] = recent_horse_scores
    return race_df

df = horse_recent_victory_ratio(race_df, horse_df)
# print(df)
df.to_pickle('../DATA/df_for_learning_shin.pkl')

In [44]:
import pandas as pd
import re
from statistics import mean
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

df = pd.read_pickle('../DATA/df_for_learning_shin.pkl')
race_ids = df.index.unique().tolist()

# train_race_ids = re.match(r'(2023|2022|2021|2020)\d+', '20220101010101')
# print(train_race_ids)
train_race_ids = []
test_race_ids = []
for race_id in race_ids:
    if re.match(r'(2023|2022|2021|2020)\d+', race_id):
        train_race_ids.append(re.match(r'(2023|2022|2021|2020)\d+', race_id).group())
    else:
        test_race_ids.append(re.match(r'2024\d+', race_id).group())

print(test_race_ids)

['202402010101', '202402010102', '202402010103', '202402010104', '202402010105', '202402010106', '202402010107', '202402010108', '202402010109', '202402010110', '202402010111', '202402010112', '202402010201', '202402010202', '202402010203', '202402010204', '202402010205', '202402010206', '202402010207', '202402010208', '202402010209', '202402010210', '202402010211', '202402010212', '202402010301', '202402010302', '202402010303', '202402010304', '202402010305', '202402010306', '202402010307', '202402010308', '202402010309', '202402010310', '202402010311', '202402010312', '202402010401', '202402010402', '202402010403', '202402010404', '202402010405', '202402010406', '202402010407', '202402010408', '202402010409', '202402010410', '202402010411', '202402010412', '202402010501', '202402010502', '202402010503', '202402010504', '202402010505', '202402010506', '202402010507', '202402010508', '202402010509', '202402010510', '202402010511', '202402010512', '202402010601', '202402010602', '202402

In [49]:
import pandas as pd
from statistics import mean
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

# horse_df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')
horse_df = pd.read_pickle('../DATA/horse_result_of_5_years_shin.pkl')
race_df = pd.read_pickle('../DATA/df_for_learning_with_date.pkl').head(100)
start_date = dt(2020, 1,1)

def horse_recent_victory_ratio(race_df, horse_df):
    '''
    馬の最近半年間の勝率を追加したレースデータが返ってくるよ。新馬レースやその馬の最初のレースはスコア化しないよ。
    '''
    recent_horse_scores = []
    check_unless_new = []
    for race_id, row in race_df.iterrows():
        count = len(race_df.loc[race_df.index == race_id])

        each_horse_df = horse_df.loc[row['馬id'] == horse_df.index]
        ranks = []
        for _, target in each_horse_df.iterrows():
            if row['開催年月日'] + relativedelta(months=-6) <= target['日付'] < row['開催年月日']:
                if type(target['着順']) == int:
                    # 平均順位をスコアとしてみるよ。
                    ranks.append(target['着順'])
                    check_unless_new.append(True)
                else:
                    # 順位がつかなかったレースは真ん中順位ということにするよ。
                    ranks.append(count / 2)
                    check_unless_new.append(True)
            elif target['日付'] == row['開催年月日']:
                # 通常は加算しないけど、新馬などその馬の最初のレースや久々のレースについては欠損値対策として中間順位とするよ。
                if not ranks:
                    ranks.append(count / 2)
                    check_unless_new.append(False)
                    
        recent_horse_scores.append(mean(ranks))
        
    race_df['馬直近成績'] = recent_horse_scores
    race_df['非新馬レース'] = check_unless_new
    return race_df

df = horse_recent_victory_ratio(race_df, horse_df)
print(df)


ValueError: Length of values (450) does not match length of index (100)

In [9]:
import pandas as pd

# horse_df_shin = pd.read_pickle('../DATA/horse_result_of_5_years_shin.pkl')
# print(horse_df_shin.columns)
# horse_df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')
# print(horse_df)

df = pd.read_pickle('../DATA/df_for_learning_shin.pkl')
print(df.columns)

Index(['着順', '枠番', '馬番', '馬名', '性齢', '斤量', '騎手', 'タイム', '着差', '単勝', '人気',
       '馬体重', '調教師', 'コース長', '天気', 'レース場', '場の状態', '馬id', '騎手id', '調教師id', '性',
       '齢', '天気index', 'レース場index', '場の状態index', '開催年', '騎手直近単勝率', '騎手直近複勝率',
       '騎手経験値', '騎手年齢', '勝敗', '開催年月日', '馬直近成績'],
      dtype='object')


In [1]:
import sklearn
import numpy

print(sklearn.__version__)
print(numpy.__version__)

1.5.1
2.0.1
