In [17]:
from tqdm import tqdm
import time
import requests
import pandas as pd
import random

class HorseResult:
    '''
    馬の戦績をスクレイピングするよ。
    
    Input:
        馬idのリスト
        horse_id_list: list
    
    Output:
        馬の戦績のデータフレーム
        horse_results_df: pd.DataFrame
    '''
    @staticmethod
    def scrape(horse_id_list):
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            url = 'https://db.netkeiba.com/horse/' + str(horse_id) + '/'
            try:
                response = requests.get(url)
                response.encoding = 'EUC-JP'
                html = response.text
                
                df = pd.read_html(html)[3]
                if df.columns[0] == '受賞歴':
                    df = pd.read_html(html)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                
                time.sleep(random.uniform(1, 3))

            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break
            
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])
        return horse_results_df
                
race_results = {}
for year in range(2020, 2025, 1):
    race_results[str(year)] = pd.read_pickle(f'../DATA/{year}_race_result.pkl')
race_results_df = pd.concat([race_results[key] for key in race_results])

horse_id_list = race_results_df['馬id'].unique().tolist()
horse_id_list_1 = horse_id_list[10116:]
horse_results = HorseResult.scrape(horse_id_list_1)
horse_results.to_pickle(f'../DATA/horse_result_1.pkl')


100%|██████████████████████████████████| 17634/17634 [12:48:42<00:00,  2.62s/it]


In [2]:
for year in range(2020, 2025, 1):
    print(f'{str(year)}/race_result.pkl')

2020/race_result.pkl
2021/race_result.pkl
2022/race_result.pkl
2023/race_result.pkl
2024/race_result.pkl


In [5]:
import pandas as pd

df0 = pd.read_pickle('../DATA/horse_result_0.pkl')
df1 = pd.read_pickle('../DATA/horse_result_1.pkl')
df = pd.concat([df0, df1])
df.to_pickle('../DATA/horse_result_of_5_years.pkl')

In [30]:
import pandas as pd
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')
race_df = pd.read_pickle('../DATA/race_results_of_5_years.pkl')
df['日付'] = df['日付'].apply(lambda x: dt.strptime(x, '%Y/%m/%d'))
df = df.loc[df['日付'] >= (dt.today() + relativedelta(months=-6))]

# 平均順位より、上位何％とかを集計した方がいいかも？要検討
def get_order_ave_of_horse_data(each_df, date):
    rows = []
    for _, row in each_df.loc[each_df['日付'] <= date].iterrows():
        try:
            rows.append(int(row['着順']) / int(row['頭数']))
        except ValueError:
            continue
    try:
        order_ave = sum(rows) / len(rows)
    except ZeroDivisionError:
        order_ave = 0
    
    return order_ave

new_dfs = []
for horse_id, each_df in df.groupby(df.index):
    each_dates = each_df['日付'].tolist()
    order_ave_dict = {}
    for date in each_dates:
        str_date = str(date)
        order_ave = get_order_ave_of_horse_data(each_df, date)
        order_ave_dict[str_date] = order_ave
    data = {
        'horse_id': horse_id,
        'order_ave': [order_ave_dict]
    }
    new_df = pd.DataFrame(data)
    new_df.index = [horse_id]
    new_dfs.append(new_df)

horse_df_for_db = pd.concat(new_dfs)
horse_df_for_db.to_pickle('../DATA/horse_df_for_db_20240801_kai.pkl')

In [5]:
import pandas as pd

df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')

print(df)

                    日付    開催 天気     R          レース名  映像    頭数   枠番  馬番    オッズ  \
2018101626  2024/01/27  2京都1  曇  10.0  許波多特別(2勝クラス) NaN   9.0  7.0   7   79.4   
2018101626  2024/01/14  1中山5  晴   7.0     4歳以上2勝クラス NaN  15.0  3.0   5  228.7   
2018101626  2023/12/10  5中山4  晴   9.0  チバテレ杯(2勝クラス) NaN  13.0  8.0  12   84.2   
2018101626  2023/11/18  5東京5  晴   9.0   南武特別(2勝クラス) NaN  12.0  5.0   6   80.3   
2018101626  2023/11/04  5東京1  晴  12.0     3歳以上2勝クラス NaN  10.0  6.0   6   11.0   
...                ...   ... ..   ...           ...  ..   ...  ...  ..    ...   
2021106204  2024/06/01    佐賀  晴  11.0    SAGAリベンジャー NaN  11.0  3.0   3   13.3   
2021106204  2024/05/18    佐賀  晴  11.0    SAGAリベンジャー NaN  11.0  8.0  10   51.2   
2021106204  2024/05/04    佐賀  曇  11.0    SAGAリベンジャー NaN  12.0  7.0  10   39.8   
2021106204  2024/04/20    佐賀  雨  11.0    SAGAリベンジャー NaN  12.0  8.0  12   74.5   
2021106204  2024/03/02  2小倉7  曇   1.0         3歳未勝利 NaN  16.0  5.0  10  208.1   

            ...   着差 ﾀｲﾑ指数 

In [56]:
import pandas as pd
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

df = pd.read_pickle('../DATA/horse_result_of_5_years.pkl')
race_df = pd.read_pickle('../DATA/race_results_of_5_years.pkl')
df['日付'] = df['日付'].apply(lambda x: dt.strptime(x, '%Y/%m/%d'))

# 平均順位より、上位何％とかを集計した方がいいかも？要検討
def get_order_ave_of_horse_data(each_df, date):
    rows = []
    for _, row in each_df.loc[each_df['日付'] <= date].iterrows():
        try:
            rows.append(int(row['着順']) / int(row['頭数']))
        except ValueError:
            continue
    try:
        order_ave = sum(rows) / len(rows)
    except ZeroDivisionError:
        order_ave = 0
    
    return order_ave

new_dfs = []
for horse_id, each_df in tqdm(df.groupby(df.index)):
    each_dates = each_df['日付'].tolist()
    order_ave_dict = {}
    for date in each_dates:
        str_date = str(date)
        order_ave = get_order_ave_of_horse_data(each_df, date)
        order_ave_dict[str_date] = order_ave
    data = {
        'horse_id': horse_id,
        'order_ave': [order_ave_dict]
    }
    new_df = pd.DataFrame(data)
    new_df.index = [horse_id]
    new_dfs.append(new_df)

horse_df_for_db = pd.concat(new_dfs)
horse_df_for_db.to_pickle('../DATA/horse_result_of_5_years_with_order_ave.pkl')

100%|█████████████████████████████████████| 27750/27750 [05:12<00:00, 88.69it/s]


In [4]:
import pandas as pd
from datetime import datetime as dt
from datetime import date as d
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

horse_df = pd.read_pickle('../DATA/horse_result_of_5_years_with_order_ave.pkl')
race_df = pd.read_pickle('../DATA/df_for_learning_with_date.pkl')
recent_scores = []
for race_id, each_df in tqdm(race_df.groupby(race_df.index)):
    horse_ids = each_df['馬id'].tolist()
    race_date = each_df['開催年月日'][0]
    for horse_id in horse_ids:
        horse_data = horse_df[horse_df.index == horse_id]
        for _, orders in horse_data['order_ave'].items():
            recent_date = max((key for key in orders if dt.strptime(key, '%Y-%m-%d %H:%M:%S') < race_date), default=None)
            if recent_date:
                recent_order_ave = orders[recent_date]
            else:
                recent_order_ave = None
            recent_scores.append(recent_order_ave)
race_df['馬直近成績'] = recent_scores
# race_df.to_pickle('../DATA/df_for_learning_kai.pkl')
print(race_df['馬直近成績'])

  race_date = each_df['開催年月日'][0]
100%|████████████████████████████████████| 15546/15546 [00:37<00:00, 410.77it/s]


202001010101    1.000000
202001010101    0.333333
202001010101    0.285714
202001010101    0.714286
202001010101    0.857143
                  ...   
202410020812    0.364903
202410020812    0.445835
202410020812    0.521743
202410020812    0.468210
202410020812    0.549447
Name: 馬直近成績, Length: 213090, dtype: float64


In [21]:
import pandas as pd
from datetime import datetime as dt
from datetime import date as d
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

race_df = pd.read_pickle('../DATA/df_for_learning_kai.pkl')
col = ['着順', '開催年月日', '馬直近成績']
right_dfs = []
for race_id, each_df in race_df.groupby(race_df.index):
    if not each_df['馬直近成績'].isnull().any():
        right_dfs.append(each_df)

new_df = pd.concat(right_dfs)
new_df.to_pickle('../DATA/df_for_learning_without_none.pkl')

In [24]:
some_dict = {'6': 280, '15': 780, '3': 110}
str_data = 6

if str_data in some_dict.keys():
    print('いいね')
else:
    print('よくないね')
data = [True] * len(some_dict)
print(data)

よくないね
[True, True, True]


In [32]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO

horse_id = '2019103042'
url = 'https://db.netkeiba.com/horse/' + horse_id + '/'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}
session = requests.Session()
response = session.get(url, headers=header)
response.encoding = 'EUC-JP'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
target_table = soup.find('table', attrs={'class': 'db_h_race_results'})
str_table = str(target_table)
df = pd.read_html(StringIO(str_table))[0]
df = df.rename(columns=lambda x: x.replace(' ', ''))
print(df)

            日付     開催  天気   R         レース名  映像  頭数  枠番  馬番    オッズ  ...   着差  \
0   2024/11/16   3福島5   曇  10  西郷特別(2勝クラス) NaN  15   5   9   24.9  ...  2.7   
1   2024/08/11   2札幌2   晴   9  桑園特別(2勝クラス) NaN  12   8  12   32.9  ...  0.5   
2   2024/07/13   3小倉5   曇   9  雲仙特別(2勝クラス) NaN  14   8  13   36.5  ...  0.6   
3   2024/03/10   1阪神6   晴  12    4歳以上2勝クラス NaN  13   5   7  211.8  ...  1.9   
4   2024/02/03   1小倉7  小雨  10  有田特別(2勝クラス) NaN  14   7  12   27.4  ...  1.7   
5   2023/11/19   3京都6   晴   8    3歳以上2勝クラス NaN  15   7  13   35.2  ...  2.3   
6   2023/08/19   3小倉3   晴   9  雲仙特別(2勝クラス) NaN  14   5   8   37.4  ...  1.3   
7   2023/03/12  1阪神10   晴  12    4歳以上2勝クラス NaN  16   4   7   55.2  ...  2.0   
8   2023/02/04   1小倉7   晴  10  有田特別(2勝クラス) NaN  14   4   6   14.2  ...  1.0   
9   2022/10/23   4新潟4   曇  10  妙高特別(2勝クラス) NaN  15   2   2    5.7  ...  4.9   
10  2022/07/10   3小倉4   晴   7    3歳以上1勝クラス NaN  11   1   1    4.0  ... -0.2   
11  2022/03/21   2中京4   晴   3        3歳未勝利 NaN  16  

In [44]:
from datetime import datetime as dt

list_a = ['20241103', '20241104', '20241105', '20241121']
for element in list_a:
    datetime = dt.strptime(element, '%Y%m%d').day

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


In [45]:
list1 = []
string1 = ','.join(list1)
print(string1)




In [48]:
from datetime import datetime as dt
from dateutil import relativedelta
import re

string = 'ClassTrigger'
trigger = string.split('Trigger')[0]
print(trigger)

Class
