In [21]:
from tqdm import tqdm
import time
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
import random

class RaceResult:
    @staticmethod
    def scrape(race_ids):
        """
        レース結果のスクレイピングを行うよ。
    
        入力input:
        レースIDのリスト
        race_ids : list
    
        出力output:
        結果のデータフレーム
        race_results : pd.DataFrame
        """
        # 辞書型で出力を定義しておく
        race_results = {}
        for race_id in tqdm(race_ids, leave=False):
            try:
                url = "https://db.netkeiba.com/race/" + race_id + "/"

                response = requests.get(url)
                response.encoding = "EUC-JP"

                df = pd.read_html(response.text)[0]
                # 半角スペースがあったら除去するよ〜
                df = df.rename(columns=lambda x: x.replace(' ', ''))
                # 正規表現で天気とレース情報をスクレイピングするよ〜
                soup = BeautifulSoup(response.text, "html.parser")
                text = soup.select("div.data_intro p")[0].text
                words = re.findall(r'\w+', text)
                for info in words:
                    if 'm' in info:
                        df['コース長'] = [int(''.join(re.findall(r'\d+', info)))] * len(df)
                    if info in ['曇', '晴', '雨', '小雨', '小雪', '雪', ]:
                        df['天気'] = [info] * len(df)
                    if info in ['芝', 'ダート', '障']:
                        df['レース場'] = [info] * len(df)
                    if info in ['良', '稍重', '重', '不良']:
                        df['場の状態'] = [info] * len(df)

                # 今度はお馬さんidと騎手さんid、調教師idを取得するよ〜
                horse_id_list = []
                jockey_id_list = []
                trainer_id_list = []

                horse_link_list = soup.find('table', attrs={'summary': 'レース結果'}).find_all('a', attrs={'href': re.compile(r'^/horse/')})
                for horse_link in horse_link_list:
                    horse_id = int(''.join(re.findall(r'\d+', horse_link['href'])))
                    horse_id_list.append(horse_id)

                jockey_link_list = soup.find('table', attrs={'summary': 'レース結果'}).find_all('a', attrs={'href': re.compile(r'^/jockey/result/recent/')})
                for jockey_link in jockey_link_list:
                    jockey_id = int(''.join(re.findall(r'\d+', jockey_link['href'])))
                    jockey_id_list.append(jockey_id)

                trainer_link_list = soup.find('table', attrs={'summary': 'レース結果'}).find_all('a', attrs={'href': re.compile(r'^/trainer/result/recent/')})
                for trainer_link in trainer_link_list:
                    trainer_id = int(''.join(re.findall(r'\d+', trainer_link['href'])))
                    trainer_id_list.append(trainer_id)

                df['馬id'] = horse_id_list
                df['騎手id'] = jockey_id_list
                df['調教師id'] = trainer_id_list

                df.index = [race_id] * len(df)
                race_results[race_id] = df
                
                time.sleep(random.uniform(1, 3))
            
            except IndexError:
                continue
            except AttributeError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break
            
        race_results_df = pd.concat([race_results[key] for key in race_results])
            
        return race_results_df

year = 2020

race_id_list = []
for place in range(1, 11, 1):
    for kai in range(1, 7, 1):
        for day in range(1, 13, 1):
            for r in range(1, 13, 1):
                race_id = str(year) + str(place).zfill(2) + str(kai).zfill(2) + str(day).zfill(2) + str(r).zfill(2)
                race_id_list.append(race_id)

race_results = RaceResult.scrape(race_id_list)
race_results.to_pickle(f'../DATA/{year}_race_result.pkl')


                                                                                

In [19]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime as dt

def scrape_race_data(race_id):
    url = "https://db.netkeiba.com/race/" + race_id + "/"

    response = requests.get(url)
    response.encoding = "EUC-JP"
  # 正規表現で天気とレース情報をスクレイピングするよ〜
    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.select("div.data_intro h1")[0].text
#     words = re.findall(r'\S+', text)
    sub_text = soup.select('p.smalltxt')[0].text
    date_str = re.findall(r'\S+', sub_text)[0]
    date = dt.strptime(date_str, '%Y年%m月%d日')
    return date

print(scrape_race_data('202402010601'))

2024-06-23 00:00:00


In [29]:
from tqdm import tqdm
import time
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime as dt

# このdfは、engineering後のデータフレームだよ。順番前後しちゃってるよ。ごめんね。
df = pd.read_pickle('../DATA/recent_5_race_df_for_learning.pkl')
# 既存のdfに日付データを追加するよ〜
date_data = {}
for race_id, each_df in tqdm(df.groupby(df.index)):
    url = "https://db.netkeiba.com/race/" + race_id + "/"
    response = requests.get(url)
    response.encoding = 'EUC-JP'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.select('p.smalltxt')[0].text
    pattern = '\d+年\d+月\d+日'
    str_date = re.search(pattern, content).group()
    race_date = dt.strptime(str_date, '%Y年%m月%d日')
    date_data[race_id] = pd.DataFrame(([race_date] * len(each_df)), index=([race_id] * len(each_df)))
    
    time.sleep(1)

date_df = pd.concat([date_data[key] for key in date_data])
df_with_date = pd.concat([df, date_df], axis=1)
df_with_date = df_with_date.rename(columns={0: '開催年月日'})
df_with_date.to_pickle('../DATA/df_for_learning_with_date.pkl')
# date_data_df = pd.DataFrame(date_data)
# df['開催年月日'] = date_data_df
# print(df['開催年月日'])

100%|███████████████████████████████████| 15546/15546 [6:23:03<00:00,  1.48s/it]


In [17]:
import pandas as pd
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

half_year_ago = dt.today() + relativedelta(months=-6)
df = pd.read_pickle('../DATA/df_for_learning_with_date.pkl')
df_for_db = df.loc[df['開催年月日'] >= half_year_ago]
race_ids = []
for index, _ in df_for_db.iterrows():
    race_ids.append(index)
df_for_db['レースid'] = race_ids
df_for_db = df_for_db.rename(columns={
    '開催年月日': 'race_date',
    'レースid': 'race_id',
    '着順': 'order',
    '馬名': 'horse_name',
    '馬id': 'horse_id',
    '騎手': 'jockey_name',
    '騎手id': 'jockey_id',
    '単勝': 'odds'
})
df_for_db = df_for_db[['race_date', 'race_id', 'order', 'horse_name', 'horse_id', 'jockey_name', 'jockey_id', 'odds']]

df_for_db.to_pickle('../DATA/df_for_db_20240721.pkl')
# for date, grouped_df in df_for_db.groupby(['race_date', 'race_id']):
    
# df_sorted = df_for_db.sort_values('開催年月日', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_for_db['レースid'] = race_ids


In [15]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

class NewRace:
    @staticmethod
    def scrape_new_race(new_race_id):
        url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + new_race_id
        try:
            response = requests.get(url)
            response.encoding = 'EUC-JP'

            soup = BeautifulSoup(response.text, 'html.parser')
            main_text = soup.select('div.RaceList_Item02 h1')[0].text
            if '新馬' in main_text:
                return
            elif '未出走' in main_text:
                return

            df = pd.read_html(response.text)[0]
            df = df.rename(columns=lambda x: x.replace(' ', ''))

            horse_id_list = []
            jockey_id_list = []
            
            horse_link_list = soup.find('table', attrs={'class': 'Shutuba_Table'}).find_all('a', attrs={'href': re.compile(r'^https://db.netkeiba.com/horse/\d+')})
            for horse_link in horse_link_list:
                horse_id = ''.join(re.findall(r'\d+', horse_link['href']))
                horse_id_list.append(horse_id)
                
            jockey_link_list = soup.find('table', attrs={'class': 'Shutuba_Table'}).find_all('a', attrs={'href': re.compile(r'^https://db.netkeiba.com/jockey/result/recent/\d+')})
            for jockey_link in jockey_link_list:
                jockey_id = ''.join(re.findall(r'\d+', jockey_link['href']))
                jockey_id_list.append(jockey_id)
                
            df['horse_id'] = horse_id_list
            df['jockey_id'] = jockey_id_list
        except Exception as e:
            print(e)
        return df

print(NewRace.scrape_new_race('202401010401'))

    枠  馬番   印         馬名  性齢    斤量    騎手     厩舎   馬体重(増減) Unnamed:9_level_0  \
    枠  馬番   印         馬名  性齢    斤量    騎手     厩舎   馬体重(増減) Unnamed:9_level_1   
0   1   1 NaN   エナジーポケット  牡3  57.0   佐々木   栗東森田   418(-6)             ---.-   
1   2   2 NaN     キーシンガー  牝3  52.0    長浜   美浦青木   442(+6)             ---.-   
2   3   3 NaN  ツキガキレイデスネ  牝3  55.0     黛   美浦小島   462(+6)             ---.-   
3   3   4 NaN    レジーナチェリ  牡3  57.0    丹内  美浦伊藤伸    434(0)             ---.-   
4   4   5 NaN  ブライティアダイヤ  牝3  55.0    永野   美浦稲垣   464(+6)             ---.-   
5   4   6 NaN    スリータイガー  牝3  55.0   西村淳  栗東高橋忠   428(-2)             ---.-   
6   5   7 NaN  スウィートリワード  牝3  55.0    武豊   栗東宮本   454(-2)             ---.-   
7   5   8 NaN    オメガサミット  牡3  56.0   角田河   栗東今野  450(-16)             ---.-   
8   6   9 NaN   インマイポケット  牝3  55.0  ルメール   栗東武幸   428(-2)             ---.-   
9   6  10 NaN        スコア  牡3  54.0    高杉   美浦栗田   458(+4)             ---.-   
10  7  11 NaN  シャーリーゴールド  牝3  55.0    浜中   栗東須貝    4