In [17]:
from tqdm import tqdm
import time
import requests
import pandas as pd
import random

class HorseResult:
    '''
    Scrapes per-horse race histories from db.netkeiba.com.

    Input:
        horse_id_list: iterable of horse ids
            Each id is inserted into https://db.netkeiba.com/horse/<horse_id>/.

    Output:
        horse_results_df: pd.DataFrame
            The scraped result tables concatenated into one frame whose index
            holds the horse id of each row.
    '''
    @staticmethod
    def scrape(horse_id_list, timeout=30):
        '''
        Download and parse the results table for every horse id.

        Args:
            horse_id_list: iterable of horse ids (converted with ``str`` for the URL).
            timeout: seconds passed to ``requests.get`` so that one stalled
                connection cannot hang the whole run.

        Returns:
            pd.DataFrame with all scraped tables stacked and indexed by horse id.
            An empty DataFrame is returned when nothing could be scraped.

        Notes:
            * Pages that do not contain the expected table (IndexError) are skipped.
            * Any other error is printed and stops the loop; results gathered
              so far are still returned.
            * Ctrl-C (KeyboardInterrupt) also stops the loop and returns the
              partial results instead of discarding them.
        '''
        # Local import: StringIO is not among the notebook's top-level imports.
        from io import StringIO

        horse_id_list = list(horse_id_list)
        if not horse_id_list:
            # Nothing to do; avoids pd.concat raising on an empty list below.
            return pd.DataFrame()

        horse_results = {}
        try:
            for horse_id in tqdm(horse_id_list):
                url = 'https://db.netkeiba.com/horse/' + str(horse_id) + '/'
                try:
                    response = requests.get(url, timeout=timeout)
                    response.encoding = 'EUC-JP'

                    # Parse the page once and reuse the table list.
                    tables = pd.read_html(StringIO(response.text))
                    df = tables[3]
                    # When table 3 is the awards table ('受賞歴'), the results
                    # table is the next one.
                    if df.columns[0] == '受賞歴':
                        df = tables[4]
                    df.index = [horse_id] * len(df)
                    horse_results[horse_id] = df
                except IndexError:
                    # Expected table is missing on this page; skip this horse.
                    pass
                except Exception as e:
                    print(e)
                    break

                # Pause after every request, including skipped pages, so the
                # server is never hit back-to-back.
                time.sleep(random.uniform(1, 3))
        except KeyboardInterrupt:
            print('Interrupted; returning the results scraped so far.')

        if not horse_results:
            return pd.DataFrame()
        horse_results_df = pd.concat(list(horse_results.values()))
        return horse_results_df
                
# Load the yearly race-result pickles for 2020 through 2024 and stack them.
race_results = {}
for year in range(2020, 2025):
    race_results[str(year)] = pd.read_pickle(f'../DATA/{year}_race_result.pkl')
race_results_df = pd.concat(list(race_results.values()))

# Unique horse ids; this cell only scrapes the ids from position 10116 onward.
horse_id_list = race_results_df['馬id'].unique().tolist()
horse_id_list_1 = horse_id_list[10116:]

# Scrape those horses and save the result as horse_result_1.pkl.
horse_results = HorseResult.scrape(horse_id_list_1)
horse_results.to_pickle('../DATA/horse_result_1.pkl')


100%|██████████████████████████████████| 17634/17634 [12:48:42<00:00,  2.62s/it]


In [2]:
# Print one '<year>/race_result.pkl' path per year from 2020 through 2024.
for year in range(2020, 2025):
    print(f'{year}/race_result.pkl')

2020/race_result.pkl
2021/race_result.pkl
2022/race_result.pkl
2023/race_result.pkl
2024/race_result.pkl


In [5]:
import pandas as pd

# Read the two partial scrape outputs, stack them (part 0 first), and save the
# combined frame.
df0, df1 = (
    pd.read_pickle(path)
    for path in ('../DATA/horse_result_0.pkl', '../DATA/horse_result_1.pkl')
)
df = pd.concat((df0, df1))
df.to_pickle('../DATA/horse_result_of_5_years.pkl')

In [4]:
import pandas as pd
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

# Load the combined results and parse the '日付' strings (YYYY/MM/DD) into datetimes.
df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')
df['日付'] = df['日付'].map(lambda date_text: dt.strptime(date_text, '%Y/%m/%d'))
# Keep only rows dated within the six months before this cell is run.
# NOTE: the cutoff comes from dt.today(), so the result depends on the run date.
df = df[df['日付'] >= (dt.today() - relativedelta(months=6))]

# Idea: aggregating something like a "finished in the top N%" rate may be better
# than the mean finishing position. Needs evaluation.
def get_order_ave_of_horse_data(each_df):
    '''
    Compute the mean finishing position for one horse.

    Args:
        each_df: DataFrame with a '着順' (finishing position) column.

    Returns:
        A list of length ``len(each_df)`` in which every element is the mean of
        the '着順' values that ``int()`` can convert. Values that cannot be
        converted (non-numeric text, NaN, None) are ignored. If no value is
        convertible, the mean is reported as 0.
    '''
    finishes = []
    # Iterate the single column directly rather than materialising every row.
    for raw_order in each_df['着順']:
        try:
            finishes.append(int(raw_order))
        except (ValueError, TypeError):
            # ValueError: non-numeric text or NaN; TypeError: None.
            continue

    order_ave = sum(finishes) / len(finishes) if finishes else 0

    return [order_ave] * len(each_df)
# Attach each horse's mean finishing position to every one of its rows.
order_ave_dict = {}
for horse_id, each_df in df.groupby(df.index):
    each_df = each_df.assign(order_ave=get_order_ave_of_horse_data(each_df))
    order_ave_dict[horse_id] = each_df

# NOTE: pd.concat raises ValueError if df had no rows (nothing to concatenate).
new_df = pd.concat(list(order_ave_dict.values()))
new_df['horse_id'] = new_df.index

# Keep only the two columns that are saved.
col = ['horse_id', 'order_ave']
horse_df_for_db = new_df.loc[:, col]
horse_df_for_db.to_pickle('../DATA/horse_df_for_db_20240801.pkl')