In [134]:
import pandas as pd

result = pd.read_pickle('result_new.pickle')
race_infos = pd.read_pickle('race_infos.pickle')
horse_data = pd.read_pickle('horse_data.pickle')

In [138]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43609 entries, 202001010101 to 202010020812
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   着順         43609 non-null  object 
 1   枠番         43609 non-null  int64  
 2   馬番         43609 non-null  int64  
 3   馬名         43609 non-null  object 
 4   性齢         43609 non-null  object 
 5   斤量         43609 non-null  float64
 6   騎手         43609 non-null  object 
 7   タイム        43245 non-null  object 
 8   着差         40123 non-null  object 
 9   単勝         43609 non-null  object 
 10  人気         43471 non-null  float64
 11  馬体重        43609 non-null  object 
 12  調教師        43609 non-null  object 
 13  horse_id   43609 non-null  object 
 14  jockey_id  43609 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 5.3+ MB


In [40]:
def preprocessing(kekka):
    result = kekka.copy()
    #着順の0-9以外の文字を含むデータを除く
    result = result[-result["着順"].astype(str).str.contains("\D")] 
    result["着順"] = result["着順"].astype(int)
    #性齢を性と齢に分ける
    result["性"] = result["性齢"].map(lambda x: str(x)[0])
    result["齢"] = result["性齢"].map(lambda x: str(x)[1:]).astype(int)
    #馬体重を体重と体重増減に分ける
    result["体重"] = result["馬体重"].str.split('(',expand=True)[0].astype(int)
    result["体重増減"] = result["馬体重"].str.split('(',expand=True)[1].str[:-1].astype(int)
    
    result["単勝"] = result["単勝"].astype(float)
    result.drop(['性齢','馬体重','タイム','着差','調教師'],axis=1,inplace=True)
    
    return result

In [48]:
result_p = preprocessing(result)
result_addinfo = result_p.merge(race_infos, left_index=True, right_index=True, how='inner')

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,horse_id,jockey_id,性,齢,体重,体重増減,course_len,weather,race_type,ground_state,date
202001010101,1,6,6,ウインルーア,54.0,横山武史,16.0,3.0,2018101626,2018101626,牝,2,438,4,1800,曇,芝,良,2020年7月25日
202001010101,2,2,2,アークライト,54.0,ルメール,1.9,2.0,2018105193,2018105193,牡,2,510,0,1800,曇,芝,良,2020年7月25日
202001010101,3,3,3,ギャラントウォリア,54.0,池添謙一,1.8,1.0,2018104800,2018104800,牡,2,482,-6,1800,曇,芝,良,2020年7月25日
202001010101,4,1,1,ジュンブーケ,52.0,亀田温心,22.2,4.0,2018102410,2018102410,牝,2,442,0,1800,曇,芝,良,2020年7月25日
202001010101,5,4,4,キタノマンゲツ,54.0,藤岡康太,55.7,5.0,2018100828,2018100828,牡,2,426,-8,1800,曇,芝,良,2020年7月25日


In [140]:
result_addinfo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43239 entries, 202001010101 to 202010020812
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   着順            43239 non-null  int32         
 1   枠番            43239 non-null  int64         
 2   馬番            43239 non-null  int64         
 3   馬名            43239 non-null  object        
 4   斤量            43239 non-null  float64       
 5   騎手            43239 non-null  object        
 6   単勝            43239 non-null  float64       
 7   人気            43239 non-null  float64       
 8   horse_id      43239 non-null  object        
 9   jockey_id     43239 non-null  object        
 10  性             43239 non-null  object        
 11  齢             43239 non-null  int32         
 12  体重            43239 non-null  int32         
 13  体重増減          43239 non-null  int32         
 14  course_len    43239 non-null  object        
 15  weather       43239 non

In [54]:
import datetime
result_addinfo['date'] =  pd.to_datetime(result_addinfo['date'], format='%Y年%m月%d日')

In [57]:
result_addinfo.to_pickle('result_addinfo_p')

In [234]:
from tqdm.notebook import tqdm_notebook as tqdm

class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付','R','賞金']]
        self.preprocessing()
        
        
    def preprocessing(self): #特に引数がない場合はselfのみ
        result = self.horse_results.copy()
        #着順の0-9以外の文字を含むデータを除く
        #result = result[-result["R"].astype(str).str.contains("\D")] 
        result["着順"] = result["R"].astype(float)
        result["賞金"].fillna(0, inplace=True)
        result["賞金"] = result["賞金"].astype(float)
        result["日付"] = pd.to_datetime(result["日付"])
        result.drop(["R"],axis=1, inplace=True)
        
        self.horse_results = result
        
    def average(self, horse_id_list, date, n_samples='all'):
        self.horse_results.rename(columns={'着順':'着順_ave','賞金':'賞金_ave'}, inplace=True)
        target_df = self.horse_results.loc[horse_id_list]
        #平均する対象をレース日付以前のうち直近n_samplesのデータのみに限定する（古いデータを反映させないため）
        if n_samples=='all':
            filtered_df = target_df[target_df['日付'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['日付'] < date].\
                sort_values('日付',ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
        
        return filtered_df.groupby(level=0)[['着順_ave','賞金_ave']].mean()
    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date'] == date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(self.average(horse_id_list, date, n_samples), left_on='horse_id', right_index=True, how='left')
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        #レース結果の同じ日付のかたまりごとに1行ずつhorse_idにひもづくその日付以前の戦績の平均を算出し（average）、
        # レース結果のかたまりにマージする(merge)という意味
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)]) 
        return merged_df

In [235]:
hr = HorseResults(horse_data)

In [236]:
results_5R = hr.merge_all(result_addinfo)

HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))




In [237]:
results_5R

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,horse_id,jockey_id,...,齢,体重,体重増減,course_len,weather,race_type,ground_state,date,着順_ave,賞金_ave
202001010101,1,6,6,ウインルーア,54.0,横山武史,16.0,3.0,2018101626,2018101626,...,2,438,4,1800,曇,芝,良,2020-07-25,6.000000,0.000000
202001010101,2,2,2,アークライト,54.0,ルメール,1.9,2.0,2018105193,2018105193,...,2,510,0,1800,曇,芝,良,2020-07-25,5.000000,280.000000
202001010101,3,3,3,ギャラントウォリア,54.0,池添謙一,1.8,1.0,2018104800,2018104800,...,2,482,-6,1800,曇,芝,良,2020-07-25,5.000000,280.000000
202001010101,4,1,1,ジュンブーケ,52.0,亀田温心,22.2,4.0,2018102410,2018102410,...,2,442,0,1800,曇,芝,良,2020-07-25,5.000000,70.000000
202001010101,5,4,4,キタノマンゲツ,54.0,藤岡康太,55.7,5.0,2018100828,2018100828,...,2,426,-8,1800,曇,芝,良,2020-07-25,5.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202010010412,13,1,1,コチョウジュニア,56.0,中井裕二,114.0,14.0,2016109126,2016109126,...,4,452,-6,1200,曇,芝,重,2020-01-26,8.285714,100.000000
202010010412,14,6,12,エンドゾーンダンス,57.0,黛弘人,110.9,13.0,2015105273,2015105273,...,5,472,-4,1200,曇,芝,重,2020-01-26,4.956522,2.017391
202010010412,15,5,9,バカラクイーン,54.0,横山武史,5.1,2.0,2016101947,2016101947,...,4,478,10,1200,曇,芝,重,2020-01-26,6.200000,152.260000
202010010412,16,8,16,スマートスリロス,54.0,柴田大知,115.3,15.0,2016103652,2016103652,...,4,512,-6,1200,曇,芝,重,2020-01-26,4.333333,108.888889


In [238]:
results_5R["rank"] = results_5R["着順"].map(lambda x:1 if x <4 else 0)
results_5R.drop(["馬名","着順","horse_id"],axis=1, inplace=True)
results_5R.head()

Unnamed: 0,枠番,馬番,斤量,騎手,単勝,人気,jockey_id,性,齢,体重,体重増減,course_len,weather,race_type,ground_state,date,着順_ave,賞金_ave,rank
202001010101,6,6,54.0,横山武史,16.0,3.0,2018101626,牝,2,438,4,1800,曇,芝,良,2020-07-25,6.0,0.0,1
202001010101,2,2,54.0,ルメール,1.9,2.0,2018105193,牡,2,510,0,1800,曇,芝,良,2020-07-25,5.0,280.0,1
202001010101,3,3,54.0,池添謙一,1.8,1.0,2018104800,牡,2,482,-6,1800,曇,芝,良,2020-07-25,5.0,280.0,1
202001010101,1,1,52.0,亀田温心,22.2,4.0,2018102410,牝,2,442,0,1800,曇,芝,良,2020-07-25,5.0,70.0,0
202001010101,4,4,54.0,藤岡康太,55.7,5.0,2018100828,牡,2,426,-8,1800,曇,芝,良,2020-07-25,5.0,0.0,0


In [239]:
results_5R_m =results_5R.drop(["jockey_id"],axis=1, inplace=False)
# results_5R.info()
results_5R_d = pd.get_dummies(results_5R_m)

In [240]:
results_5R_d.head()

Unnamed: 0,枠番,馬番,斤量,単勝,人気,齢,体重,体重増減,date,着順_ave,...,weather_曇,weather_雨,weather_雪,race_type_ダート,race_type_芝,race_type_障害,ground_state_不良,ground_state_稍重,ground_state_良,ground_state_重
202001010101,6,6,54.0,16.0,3.0,2,438,4,2020-07-25,6.0,...,1,0,0,0,1,0,0,0,1,0
202001010101,2,2,54.0,1.9,2.0,2,510,0,2020-07-25,5.0,...,1,0,0,0,1,0,0,0,1,0
202001010101,3,3,54.0,1.8,1.0,2,482,-6,2020-07-25,5.0,...,1,0,0,0,1,0,0,0,1,0
202001010101,1,1,52.0,22.2,4.0,2,442,0,2020-07-25,5.0,...,1,0,0,0,1,0,0,0,1,0
202001010101,4,4,54.0,55.7,5.0,2,426,-8,2020-07-25,5.0,...,1,0,0,0,1,0,0,0,1,0


In [241]:
def split_data(df, test_size=0.3):
    sorted_id_list = df.sort_values('date').index.unique()
    train_id_list = sorted_id_list[:round(len(sorted_id_list)*(1-test_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list)*(1-test_size)):]
    train =df.loc[train_id_list]
    test = df.loc[test_id_list]
    train.drop(['date'],axis=1,inplace=True)
    test.drop(['date'],axis=1,inplace=True)
    return train, test

In [242]:
train, test = split_data(results_5R_d)

In [244]:
X_train = train.drop(['rank'], axis=1)
y_train = train['rank']
X_test = test.drop(['rank'], axis=1)
y_test = test['rank']

In [246]:
#勾配ブースティング木
import lightgbm as lgb
from sklearn.metrics import roc_curve, roc_auc_score

params = {
    'num_leaves':3, #デフォルトは34
    'n_estimators':70,
    'min_data_in_leaf':300,
#     'class_weight':'balanced',
    'random_state':100
}

lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X_train.values, y_train.values) #値に日本語が入っているとエラー

y_pred_train = lgb_clf.predict_proba(X_train)[:,1]
print(roc_auc_score(y_train, y_pred_train))
y_pred = lgb_clf.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred))

0.8064551774212403
0.8102429476058268


In [248]:
importances = pd.DataFrame({'features': X_train.columns, 'importance': lgb_clf.feature_importances_})
importances.sort_values('importance', ascending=False)[:20]

Unnamed: 0,features,importance
3,単勝,95
4,人気,14
9,賞金_ave,9
8,着順_ave,7
2,斤量,5
6,体重,4
0,枠番,2
7,体重増減,1
5,齢,1
112,騎手_横山和生,1


In [None]:
#単勝や人気が効いているのは変わらないが、過去の賞金や着順の平均値も予測に効いている