netkeibaから競馬のデータをスクレイピングするためのコード

class MakeCSV は、過去のレースの URL を取得し、それぞれの URL から Pandas、BeautifulSoup、Selenium を使用して必要なデータをスクレイピングする。最終的に Pandas の DataFrame にまとめて保存する。

class PredictScr は、これから行われるレースのデータをスクレイピングし、DataFrame として保存する。


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Hiragino Sans'
import seaborn as sns
pd.options.display.max_info_columns = 200
pd.set_option('display.max_columns', 50)
import numpy as np
import re
import os
import collections
import tqdm
from tqdm.notebook import trange
from func_timeout import func_timeout, FunctionTimedOut
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import warnings
warnings.simplefilter('ignore')
import gc
from datetime import datetime
from datetime import date



In [2]:
class MakeCSV:
    def __init__(self):
        self.rank_list = ["新馬", "未勝利", "1勝", "500万", "2勝", "1000万", "3勝", "1600万", "OP", "オープン", "L", "G3", "G2", "G1"]
        self.place_list = ["札幌", "函館", "福島", "新潟", "東京", "中山", "中京", "京都", "阪神", "小倉"]
        
    def search(self, place, s_year, f_year, s_month=1, f_month=12, rank="all", gd="grass", dist="all", age="all", fem=False, race_name="None", hyde=False):
        """Run netkeiba's detailed race search and collect the result-page URLs.

        Drives Chrome through the search form at
        ``https://db.netkeiba.com/?pid=race_search_detail``, submits it and
        reads the race links from every result page. The browser is always
        closed before returning, also when a lookup fails.

        Args:
            place: Racecourse name; it must be one of the keys of
                ``place_dict`` below so that it maps to a checkbox id.
            s_year: First year of the search period.
            f_year: Last year of the search period.
            s_month: First month of the search period.
            f_month: Last month of the search period.
            rank: ``"all"`` or an iterable of ``rank_dict`` keys. ``"all"``
                ticks grade boxes 1-7 and 9; box 8 (``"新馬"``) is left out.
            gd: ``"grass"`` ticks ``check_track_1`` only, ``"mix"`` ticks
                ``check_track_1`` and ``check_track_2``, ``"only_dirt"``
                clicks ``check_track_1`` twice and ``check_track_2`` once.
                Any other value behaves like ``"grass"``.
            dist: ``"all"`` or an iterable of suffixes of the
                ``check_kyori_<suffix>`` checkbox ids to tick.
            age: ``"all"`` or an iterable of ``age_dict`` keys.
            fem: When true, ``check_jyoken_1`` is ticked as well.
            race_name: Text typed into the first form row; the literal
                string ``"None"`` means nothing is typed.
            hyde: When true, Chrome is started headless.

        Returns:
            list: The ``href`` of every race link found. The same list is
            stored in ``self.race_urls`` and its length in ``self.race_nums``.
        """
        search_place = place
        place_dict = {"札幌":"check_Jyo_01", "函館":"check_Jyo_02", "福島":"check_Jyo_03", "新潟":"check_Jyo_04",
                      "東京":"check_Jyo_05", "中山":"check_Jyo_06", "中京":"check_Jyo_07", "京都":"check_Jyo_08",
                      "阪神":"check_Jyo_09", "小倉":"check_Jyo_10"}
        rank_dict = {"G1": 1, "G2": 2, "G3": 3, "OP": 4, "3勝":5, "2勝":6, "1勝":7, "新馬":8, "未勝利":9}
        age_dict = {"2歳": 11, "3歳": 12, "3歳以上": 13, "4歳以上": 14}
        # Turn the racecourse name into the id of its checkbox.
        for word, read in place_dict.items():
            search_place = search_place.replace(word, read)

        if hyde:
            options = Options()
            options.add_argument('--headless')
            browser = webdriver.Chrome(options=options)
        else:
            browser = webdriver.Chrome()

        # try/finally so the Chrome process is released even when an element
        # lookup raises; everything kept afterwards is plain strings.
        try:
            browser.get("https://db.netkeiba.com/?pid=race_search_detail")
            browser.maximize_window()

            browser.find_element(By.ID, search_place).click()

            if rank == "all":
                rank_range = [1, 2, 3, 4, 5, 6, 7, 9]
            else:
                rank_range = [rank_dict[r] for r in rank]
            for num in rank_range:
                browser.find_element(By.ID, "check_grade_{}".format(num)).click()

            browser.find_element(By.ID, "check_track_1").click()
            if gd == "mix":
                browser.find_element(By.ID, "check_track_2").click()
            elif gd == "only_dirt":
                # Second click on track_1 (presumably unticking it again),
                # then tick track_2.
                browser.find_element(By.ID, "check_track_1").click()
                browser.find_element(By.ID, "check_track_2").click()

            if dist != "all":
                for d in dist:
                    browser.find_element(By.ID, "check_kyori_{}".format(str(d))).click()

            if age == "all":
                age_range = [11, 12, 13, 14]
            else:
                age_range = [age_dict[a] for a in age]
            for num in age_range:
                browser.find_element(By.ID, "check_barei_{}".format(num)).click()

            if fem:
                browser.find_element(By.ID, "check_jyoken_1").click()

            if race_name != "None":
                browser.find_element(By.CSS_SELECTOR, "form > table > tbody > tr:nth-child(1) > td > input").send_keys(race_name)

            # Period selection. The second <option> of the first <select> is
            # read as the reference year; the offsets below assume one option
            # per year counting backwards from it, and one option per month
            # after a leading placeholder option.
            css_form = "form > table > tbody > tr:nth-child(3) > td > select:nth-child({}) > option:nth-child({})"
            this_year = int(browser.find_element(By.CSS_SELECTOR, css_form.format(1, 2)).text)
            s_year_css = str(2 + (this_year - s_year))
            s_month_css = str(1 + s_month)
            f_year_css = str(2 + (this_year - f_year))
            f_month_css = str(1 + f_month)
            browser.find_element(By.CSS_SELECTOR, css_form.format("1", s_year_css)).click()
            browser.find_element(By.CSS_SELECTOR, css_form.format("2", s_month_css)).click()
            browser.find_element(By.CSS_SELECTOR, css_form.format("3", f_year_css)).click()
            browser.find_element(By.CSS_SELECTOR, css_form.format("4", f_month_css)).click()

            browser.execute_script("window.scrollTo(0, 600);")
            # Third option of the select in form row 11 (presumably the
            # results-per-page setting), then submit.
            browser.find_element(By.CSS_SELECTOR, "form > table > tbody > tr:nth-child(11) > td > select > option:nth-child(3)").click()
            browser.find_element(By.XPATH, "//input[@value='検索']").click()

            url_elements = browser.find_elements(By.CSS_SELECTOR, "td:nth-child(5) > a")
            self.race_urls = [element.get_attribute("href") for element in url_elements]

            # More than 80 <li> elements is used as the sign that a pager is
            # present (heuristic taken over from the original code).
            if len(browser.find_elements(By.CSS_SELECTOR, "li")) > 80:
                # Jump via the 5th pager link (presumably "last page") to
                # read the page count, then go back via the 3rd pager link
                # (presumably "first page").
                browser.execute_script("window.scrollTo(0, 2500);")
                browser.find_element(By.CSS_SELECTOR, "div.common_pager > ul:nth-child(2) > li:nth-child(5) > a").click()
                browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                browser.execute_script("window.scrollBy(0, -1000);")
                page_num = int(browser.find_element(By.CSS_SELECTOR, "div.common_pager > ul:nth-child(1) > li.Page_Active").text)
                browser.find_element(By.CSS_SELECTOR, "div.common_pager > ul:nth-child(2) > li:nth-child(3) > a").click()
                css_next_form = "div.common_pager > ul:nth-child(1) > li:nth-child({}) > a"
                # Position of the link that advances one page: page_num + 4
                # when there are fewer than 10 pages, otherwise 14.
                next_position = str(page_num + 4) if page_num < 10 else str(14)
                for _ in range(page_num - 1):
                    browser.execute_script("window.scrollTo(0, 2500);")
                    browser.find_element(By.CSS_SELECTOR, css_next_form.format(next_position)).click()
                    url_elements = browser.find_elements(By.CSS_SELECTOR, "td:nth-child(5) > a")
                    urls = [element.get_attribute("href") for element in url_elements]
                    self.race_urls = self.race_urls + urls
        finally:
            browser.quit()

        self.race_nums = len(self.race_urls)
        print("レース数:", self.race_nums)
        return self.race_urls
    
    def make_table(self, race_url, no_odds=True):
        """Build the per-runner table for one finished race.

        Reads the first HTML table of ``race_url``, derives per-runner
        columns, then calls ``use_bs`` (race-level columns), ``horse_url``
        (runner page URLs) and ``horse_table`` (previous-race columns) and
        joins everything into ``self.tab``.

        Args:
            race_url: URL of the race result page.
            no_odds: When true the ``単勝`` and ``人 気`` columns are dropped;
                otherwise they are kept and renamed to ``odds``/``popular``.

        Returns:
            pandas.DataFrame: One row per runner; also stored in ``self.tab``.
        """
        self.race_url = race_url
        self.tab = pd.read_html(self.race_url)[0]
        # Drop trailing rows whose first column is not a number (presumably
        # runners without a finishing position).
        while not str(self.tab.iloc[-1, 0]).isdigit():
            self.tab = self.tab.drop(self.tab.index[-1])
        self.head = len(self.tab)
        self.tab["head"] = self.head
        # Second character of the trainer cell (presumably the East/West mark).
        self.tab["horse_EW"] = self.tab["調教師"].str[1]
        # "馬体重" looks like "480(+2)": weight, then change in parentheses.
        weight_parts = self.tab["馬体重"].str.split("(")
        self.tab["weight"] = weight_parts.str.get(0)
        # regex=False: ")" is meant literally; without it the outcome depends
        # on the pandas version's default for ``regex``.
        self.tab["weight_change"] = weight_parts.str.get(1).str.replace(")", "", regex=False)
        self.tab["sex"] = self.tab["性齢"].str[0]
        self.tab["age"] = self.tab["性齢"].str[1:]
        self.tab = self.tab.drop(["着差", "性齢", "馬体重", "調教師"], axis=1)
        if no_odds:
            self.tab = self.tab.drop(["単勝", "人 気"], axis=1)
        self.use_bs()
        self.horse_url()
        self.horse_table()
        # NOTE: if use_selenium() were ever called from make_table, this is
        # where it would go.
        self.tab = pd.concat([self.tab, self.add_tab], axis=1)
        self.tab = self.tab.rename(columns={"着 順":"result", "枠 番":"waku", "馬 番":"num", "斤量":"carry", "単勝":"odds",
                                            "人 気":"popular"})
        # Race key: date digits followed by the start time digits.
        self.tab["race"] = self.tab["year"] + self.tab["month"] + self.tab["day"] + self.clock
        # "class" label by row position: rows 0-2 -> 0, rows 3..div-1 -> 1,
        # rows div.. -> 2. Both slices are label-inclusive, so row ``div`` is
        # first set to 1 and then overwritten with 2.
        self.tab["class"] = 0
        div = int(np.round((self.head - 3) / 2) + 3)  # int: valid label for the integer index
        self.tab.loc[3:div, "class"] = 1
        self.tab.loc[div:, "class"] = 2
        self.tab["reg"] = self.tab["タイム"].apply(self.time_to_float)
        self.tab = self.tab.drop(["タイム"], axis=1)
        self.tab = self.tab.rename(columns={"馬名": "horse_name", "騎手": "jockey"})
        int8_col = ["waku", "num", "head", "weight_change", "age", "month", "day", "kai", "nichi", "class"]
        int16_col = ["year", "weight", "dist"]
        self.tab[int8_col] = self.tab[int8_col].astype("int8")
        self.tab[int16_col] = self.tab[int16_col].astype("int16")
        return self.tab
        
    def use_bs(self):
        """Parse the race header with BeautifulSoup and add race-level columns.

        Downloads ``self.race_url`` through ``get_html`` and reads the
        ``data_intro`` element. Every value found is written to ``self.tab``
        as a constant column (same value on every row). Also stores the first
        ``<table>`` of the page in ``self.table_soup`` and the start-time
        digits in ``self.clock`` for the later steps.

        Raises:
            AttributeError: If an expected element or pattern is missing
                (``find``/``re.search`` returned ``None``).
            IndexError: If the header has fewer fields than expected.
        """
        response = self.get_html(self.race_url)
        soup = BeautifulSoup(response.content, "html.parser")
        intro = soup.find(class_="data_intro")
        self.table_soup = soup.select_one("table")

        # "/"-separated fields of the first <span>; used below as
        # [0] course, [1] weather, [2] going, [3] start time.
        span_text = intro.select_one("span").text
        race_info = span_text.replace("\xa0", "").split("/")
        # Space-separated fields of the "smalltxt" line; used below as
        # [0] date, [1] meeting, [2] race class.
        small_text = intro.find(class_="smalltxt").text
        race_info2 = small_text.replace("\xa0", "").split(" ")
        race_name = intro.select_one("h1").text
        race_div = race_info2[2]

        self.tab["race_rank"] = self.to_race_rank(race_name, race_div)

        course_text = race_info[0]
        self.tab["GD"] = re.search('[障芝ダ]', course_text).group()
        self.tab["turn"] = re.search('[右左直]', course_text).group()
        self.tab["dist"] = re.search(r'\d+', course_text).group()
        # "外" when the course text contains it, otherwise "内".
        outer_match = re.search('外', course_text)
        if outer_match is None:
            self.tab["course"] = "内"
        else:
            self.tab["course"] = outer_match.group()

        self.tab["weather"] = re.search('[晴曇小雨雪]', race_info[1]).group()
        # The start-time field is split on ":"; pieces 1 and 2 are joined.
        time_pieces = race_info[3].split(":")
        self.clock = time_pieces[1].strip() + time_pieces[2].strip()
        self.tab["condition"] = re.search('[良稍重不]', race_info[2]).group()

        date_digits = re.findall(r'\d+', race_info2[0])
        self.tab["year"] = date_digits[0]
        self.tab["month"] = date_digits[1]
        self.tab["day"] = date_digits[2]
        meeting_digits = re.findall(r'\d+', race_info2[1])
        self.tab["kai"] = meeting_digits[0]
        self.tab["nichi"] = meeting_digits[1]
        self.tab["place"] = self.to_place(race_info2[1])
        # Last cell of the first data row of the result table.
        first_data_row = self.table_soup.select("tr")[1]
        self.tab["prize"] = first_data_row.select("td")[-1].text
        
    def horse_url(self):
        self.url_list = []
        tr_list = self.table_soup.select("tr")[1:self.head+1]
        for tr in tr_list:
            href = tr.select("td")[3].find("a").get("href")
            url = "https://db.netkeiba.com" + href
            self.url_list.append(url)
        self.tab["horse_url"] = self.url_list
    
    def horse_table(self):
        """Build ``self.add_tab``: every runner's form before this race.

        For each URL in ``self.url_list`` the horse's race-history table is
        downloaded, the row of the current race is located by date, and the
        up to three rows below it (the previous starts) are flattened into
        one row of 30 ``preN_*`` values each, preceded by five summary values
        (``interval`` and four "first time" flags). Runners with no earlier
        start get ``[0, "y", "y", "y", "y"]`` only; short rows are padded by
        the DataFrame constructor.

        Side effects:
            Sets ``self.ind_list`` (position of the current race in each
            horse's history) and ``self.add_tab``.

        Raises:
            IndexError: If the current race date is not found in a horse's
                history table.
        """
        # History columns that are not carried over (some are replaced by the
        # derived columns added below).
        dropped_columns = ["日付", "開催", "R", "映 像", "オ ッ ズ", "人 気", "騎手", "距離",
                           "馬場 指数", "タイム", "ﾀｲﾑ 指数", "ペース", "馬体重", "厩舎 ｺﾒﾝﾄ",
                           "備考", "勝ち馬 (2着馬)", "賞金"]
        row_list = []
        self.ind_list = []
        for url in self.url_list:
            h_tabs = pd.read_html(url)
            # The history table is normally the 4th table on the page; if
            # that one has fewer than 20 columns the 5th is used instead.
            h_tab = h_tabs[3]
            if len(h_tab.columns) < 20:
                h_tab = h_tabs[4]

            race_year, race_month, race_day = self.tab.loc[self.tab["horse_url"]==url, ["year", "month", "day"]].iloc[0]
            race_date = race_year+"/"+race_month.zfill(2)+"/"+race_day.zfill(2)
            ind = h_tab[h_tab["日付"]==race_date].index[0]
            self.ind_list.append(ind)
            # .copy(): columns are added below, which must not happen on a
            # slice of h_tab (chained assignment).
            use_tab = h_tab.iloc[ind+1:ind+4].copy()
            if use_tab.empty:
                row_list.append([0, "y", "y", "y", "y"])
                continue

            pre_year, pre_month, pre_day = use_tab.iloc[0,0].split("/")
            interval = self.day_interval(int(race_year)-int(pre_year), int(race_month)-int(pre_month), int(race_day)-int(pre_day))
            # The order of the assignments below defines the column order of
            # the flattened row and must match ``base_columns``.
            date_parts = use_tab["日付"].str.split("/")
            use_tab["year"] = date_parts.str.get(0)
            use_tab["month"] = date_parts.str.get(1)
            use_tab["day"] = date_parts.str.get(2)
            use_tab["place"] = use_tab["開催"].apply(self.to_place)
            # First/last character of the meeting text, kept only if a digit.
            use_tab["kai"] = use_tab["開催"].str[0]
            use_tab.loc[use_tab["kai"].str.isdigit()==False, "kai"] = "None"
            use_tab["nichi"] = use_tab["開催"].str[-1]
            use_tab.loc[use_tab["nichi"].str.isdigit()==False, "nichi"] = "None"
            use_tab["race_rank"] = use_tab["レース名"].apply(self.to_race_rank)
            race_jockey = self.tab.loc[self.tab["horse_url"]==url, "騎手"].iloc[0]
            use_tab["same_jockey"] = use_tab["騎手"].apply(lambda x: "y" if x == race_jockey else "n")
            use_tab["GD"] = use_tab["距離"].str[0]
            use_tab["dist"] = use_tab["距離"].str[1:]
            # to_place returns "None" for venues outside self.place_list.
            use_tab["not_central"] = use_tab["place"].apply(lambda x: "y" if x=="None" else "n")
            # A missing gate number is taken as the sign of an overseas race.
            use_tab["is_overseas"] = use_tab["枠 番"].apply(lambda x: "y" if np.isnan(x) else "n")
            use_tab["time"] = use_tab["タイム"].apply(self.time_to_float)
            use_tab["4_corner"] = use_tab["通過"].fillna("None").str.split("-").str.get(-1)
            pace_parts = use_tab["ペース"].str.split("-")
            use_tab["first_3f"] = pace_parts.str.get(0)
            use_tab["last_3f"] = pace_parts.str.get(1)
            weight_parts = use_tab["馬体重"].str.split("(")
            use_tab["weight"] = weight_parts.str.get(0)
            try:
                # regex=False: ")" is a literal, not a regular expression.
                use_tab["weight_change"] = weight_parts.str.get(1).str.replace(")", "", regex=False)
            except AttributeError:
                # No row had a "(change)" part, so .str is unavailable.
                use_tab["weight_change"] = 0
            use_tab["prize"] = use_tab["賞金"].fillna(0)
            use_tab = use_tab.drop(dropped_columns, axis=1)
            row = np.ravel(use_tab.values).tolist()

            # "First time" flags compare this race with *all* earlier starts.
            pre_tab = h_tab[ind+1:]
            first_dist = "y" if not self.tab["dist"][0] in pre_tab["距離"].str[1:].tolist() else "n"
            first_gd = "y" if not self.tab["GD"][0] in pre_tab["距離"].str[0].tolist() else "n"
            first_place = "y" if not self.tab["place"][0] in pre_tab["開催"].apply(self.to_place).tolist() else "n"
            first_jockey = "y" if not race_jockey in pre_tab["騎手"].tolist() else "n"
            row_list.append([interval, first_gd, first_dist, first_place, first_jockey] + row)

        # The constructor needs at least one full-width (95 values) row to
        # accept the 95 column names; pad the first row if there is none.
        if not any(len(row) == 95 for row in row_list):
            row_list[0] = row_list[0] + ["None"]*(95-len(row_list[0]))
        base_columns = ["weather", "race_name", "head", "waku", "num", "result", "carry", "condition", "diff", "corner", "horse_last_3f",
                        "year", "month", "day", "place", "kai", "nichi", "race_rank", "same_jockey", "GD", "dist", "not_central",
                        "is_overseas", "time", "4_corner", "first_3f", "last_3f", "weight", "weight_change", "prize"]
        pre1_columns = ["pre1_" + x for x in base_columns]
        pre2_columns = ["pre2_" + x for x in base_columns]
        pre3_columns = ["pre3_" + x for x in base_columns]
        add_columns = ["interval", "first_gd", "first_dist", "first_place", "first_jockey"] + pre1_columns + pre2_columns + pre3_columns
        self.add_tab = pd.DataFrame(row_list, columns=add_columns)
    
    def use_selenium(self, race_url, horse_url_list, ind_list):
        """Scrape browser-only columns for one race with ``self.browser``.

        Reads the text of the 10th cell of every row on the race page and the
        first cell of ``table.result_table_02``, then visits each horse page
        and reads cells 17 and 20 of the up to three rows after the current
        race. NOTE(review): these cells are presumably only filled in for a
        logged-in session - confirm.

        Args:
            race_url: URL of the race result page.
            horse_url_list: Horse-page URLs, one per runner.
            ind_list: For each horse, the position of the current race in its
                history (as produced by ``horse_table``).

        Returns:
            pandas.DataFrame: One row per runner with the columns
            ``pre1..3_relative_cond``/``pre1..3_relative_time``,
            ``relative_cond`` and ``relative_time``.
        """
        column_names = ["pre1_relative_cond", "pre1_relative_time", "pre2_relative_cond",
                        "pre2_relative_time", "pre3_relative_cond", "pre3_relative_time"]
        driver = self.browser

        driver.get(race_url)
        time_cells = driver.find_elements(By.CSS_SELECTOR, "td:nth-child(10)")
        # Keep at most one value per runner.
        relative_times = [cell.text for cell in time_cells][:len(horse_url_list)]
        cond_cell = driver.find_element(By.CSS_SELECTOR, "table.result_table_02 td")
        relative_cond = cond_cell.text.split("(")[0]

        rows = []
        for horse_url, ind in zip(horse_url_list, ind_list):
            driver.get(horse_url)
            all_times = [cell.text for cell in driver.find_elements(By.CSS_SELECTOR, "td:nth-child(20)")]
            pre_times = all_times[ind+1:ind+4]
            if pre_times:
                all_conds = [cell.text for cell in driver.find_elements(By.CSS_SELECTOR, "td:nth-child(17)")]
                pre_conds = all_conds[ind+1:ind+4]
                # Interleave: cond, time, cond, time, ...
                row = ["None"]*(len(pre_conds)+len(pre_times))
                row[::2] = pre_conds
                row[1::2] = pre_times
            else:
                row = ["None"]
            rows.append(row)

        # At least one six-value row is needed for the six column names.
        if all(len(row) != 6 for row in rows):
            rows[0] = rows[0] + ["None"]*(6-len(rows[0]))
        selenium_tab = pd.DataFrame(rows, columns=column_names)
        selenium_tab["relative_cond"] = relative_cond
        selenium_tab["relative_time"] = relative_times
        return selenium_tab
            
    def make_df(self, race_url_list, hyde=False, no_odds=True):
        """Scrape every race in ``race_url_list`` into one DataFrame.

        Pass 1 calls ``make_table`` for each URL. Pass 2 opens Chrome, logs
        in and calls ``use_selenium`` for each race that survived pass 1.
        Only races for which both passes succeeded end up in the result, so
        the two row blocks that are concatenated side by side always describe
        the same races in the same order.

        Args:
            race_url_list: Race result page URLs.
            hyde: When true, Chrome is started headless.
            no_odds: Forwarded to ``make_table``.

        Returns:
            pandas.DataFrame: All runner rows; also stored in ``self.df``.

        Side effects:
            ``self.pass_list`` / ``self.add_pass_list`` receive the positions
            (in ``race_url_list``) of races skipped in pass 1 / pass 2.
            ``self.tab_list``, ``self.selenium_tab_list``, ``self.rr_list``,
            ``self.gd_list`` and ``self.dist_list`` describe the races that
            were kept. A summary is printed.

        Raises:
            ValueError: If no race could be scraped (nothing to concatenate).
        """
        # Failures that mean "this page could not be parsed"; such races are
        # skipped. IndexError belongs here as well (missing table/field).
        skip_errors = (AttributeError, TypeError, KeyError, ImportError, ValueError, IndexError)

        self.tab_list = []
        self.rr_list = []
        self.gd_list = []
        self.dist_list = []
        self.horse_url_list_list = []
        self.ind_list_list = []
        self.pass_list = []
        self.add_pass_list = []
        self.selenium_tab_list = []

        # Pass 1. Keep the position in race_url_list together with its data
        # so pass 2 never has to translate between index spaces.
        scraped = []
        for i in tqdm.tqdm(range(len(race_url_list))):
            try:
                tab = self.make_table(race_url_list[i], no_odds=no_odds)
            except skip_errors:
                self.pass_list.append(i)
            else:
                scraped.append((i, tab, self.url_list, self.ind_list))
                self.horse_url_list_list.append(self.url_list)
                self.ind_list_list.append(self.ind_list)

        if hyde:
            options = Options()
            options.add_argument('--headless')
            self.browser = webdriver.Chrome(options=options)
        else:
            self.browser = webdriver.Chrome()
        try:
            self.browser.get("https://regist.netkeiba.com/account/?pid=login")
            # Credentials are intentionally blank in the source; fill them in
            # locally before running.
            login_id = ""
            password = ""
            self.browser.find_element(By.NAME, "login_id").send_keys(login_id)
            self.browser.find_element(By.NAME, "pswd").send_keys(password)
            self.browser.find_element(By.CSS_SELECTOR, "div > form > div > div.loginBtn__wrap > input[type=image]").click()

            # Pass 2.
            for i, tab, horse_urls, inds in tqdm.tqdm(scraped):
                try:
                    selenium_tab = self.use_selenium(race_url_list[i], horse_urls, inds)
                except skip_errors:
                    self.add_pass_list.append(i)
                else:
                    self.tab_list.append(tab)
                    self.selenium_tab_list.append(selenium_tab)
                    self.rr_list.append(tab["race_rank"].iloc[0])
                    self.gd_list.append(tab["GD"].iloc[0])
                    self.dist_list.append(tab["dist"].iloc[0])
        finally:
            # Always release the Chrome process.
            self.browser.quit()

        self.df = pd.concat(self.tab_list, ignore_index=True)
        self.add_df = pd.concat(self.selenium_tab_list, ignore_index=True)
        self.df = pd.concat([self.df, self.add_df], axis=1)
        print("number of race:", len(self.tab_list))
        print("race_rank:", collections.Counter(self.rr_list))
        print("gd:", collections.Counter(self.gd_list))
        print("dist:", collections.Counter(self.dist_list))
        return self.df
        
    def df_race_url(self, df, race_urls):
        """Attach each race's URL to its rows as the ``race_url`` column.

        Assumes ``race_urls`` holds one URL per race of ``df`` and that both
        are in the same order (newest first): the i-th distinct ``race`` code,
        in order of first appearance, receives ``race_urls[i]``.

        Args:
            df: Frame with a ``race`` column; it is modified in place.
            race_urls: Sequence of race URLs.

        Returns:
            pandas.DataFrame: The same ``df`` object.

        Raises:
            IndexError: If ``df`` has more distinct races than ``race_urls``
                has entries.
        """
        # Start from an object column (placeholder 0 as before): writing URL
        # strings into an integer column is an incompatible-dtype assignment,
        # deprecated since pandas 2.1.
        df["race_url"] = pd.Series(0, index=df.index, dtype=object)
        for i, code in enumerate(df["race"].unique().tolist()):
            df.loc[df["race"] == code, "race_url"] = race_urls[i]
        return df

    def make_csv(self, df, name):
        """Write ``df`` as CSV to ``/Users/csv_folder/<name>.csv``.

        Args:
            df: Frame to save (``DataFrame.to_csv`` with default options).
            name: File name without the ``.csv`` extension.
        """
        destination = "/Users/csv_folder/" + (name + ".csv")
        df.to_csv(destination)

    def dcd(self, df):
        """Round-trip ``df`` through a CSV file and return the re-read frame.

        The frame is written with ``DataFrame.to_csv`` and read back with
        ``pandas.read_csv``, so the result has freshly inferred dtypes and the
        old index as an extra first column. The scratch file lives in a
        temporary directory that is removed even when reading fails, instead
        of a fixed path that may not be writable.

        Args:
            df: Frame to round-trip.

        Returns:
            pandas.DataFrame: The frame as parsed back from CSV.
        """
        import tempfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = os.path.join(tmp_dir, "df.csv")
            df.to_csv(tmp_path)
            reloaded = pd.read_csv(tmp_path)
        gc.collect()
        return reloaded

    def get_html(self, url, timeout=30):
        """Fetch ``url`` with ``requests.get`` and return the response.

        Args:
            url: Address to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block the scraping run
                indefinitely.

        Returns:
            The response object returned by ``requests.get``.
        """
        return requests.get(url, timeout=timeout)
    
    def to_race_rank(self, name, div="None"):
        ranks = []
        for i in self.rank_list:
            if i in name:
                ranks.append(i)
        if not ranks:
            for j in self.rank_list:
                if j in div:
                    ranks.append(j)
        if not ranks:
            ranks.append("None")
        if ranks[0]=="500万":
            ranks[0]="1勝"
        if ranks[0]=="1000万":
            ranks[0]="2勝"
        if ranks[0]=="1600万":
            ranks[0]="3勝"
        return ranks[0]
        
    def to_place(self, info):
        places = []
        for i in self.place_list:
            if i in info:
                places.append(i)
        if not places:
            places.append("None")
        return places[0]

    def day_interval(self,a,b,c):
        """Approximate a date difference in days, counting 30 days per month.

        Args:
            a: Difference in years.
            b: Difference in months.
            c: Difference in days.

        Returns:
            ``(b + a * 12) * 30 + c``.
        """
        total_months = b + a * 12
        return total_months * 30 + c
            
    def time_to_float(self, time):
        """Convert a ``"minutes:seconds"`` string to seconds.

        Args:
            time: Text such as ``"1:30.5"``. (The parameter name shadows the
                ``time`` module inside this method only.)

        Returns:
            float: ``minutes * 60 + seconds``; ``0.0`` when ``time`` is not a
            string or does not split into exactly two parts on ``":"``.
        """
        try:
            minutes, seconds = time.split(":")
        except (AttributeError, ValueError):
            # Not a string (e.g. NaN) or not of the form "m:s".
            minutes, seconds = 0, 0
        return float(minutes)*60 + float(seconds)


In [16]:
class PredictScr:
    def __init__(self):
        self.rank_list = ["新馬", "未勝利", "1勝", "500万", "2勝", "1000万", "3勝", "1600万", "OP", "オープン", "L", "G3", "G2", "G1", 
                          "１勝", "２勝", "３勝"]
        self.place_list = ["札幌", "函館", "福島", "新潟", "東京", "中山", "中京", "京都", "阪神", "小倉"]

    def search(self, search_url, places=None, hyde=False):
        """Collect the entry-table URLs of the races listed on ``search_url``.

        Opens the page in Chrome, walks the per-venue race lists and returns
        the link of every race whose title contains neither ``新馬`` nor
        ``障害``, with ``"result"`` replaced by ``"shutuba"`` in the URL. The
        browser is always closed before returning.

        Args:
            search_url: URL of the race list page.
            places: Venue names to restrict the search to; each is matched
                against the non-empty ``RaceList_DataTitle`` texts. ``None``
                or an empty sequence means every venue block.
            hyde: When true, Chrome is started headless.

        Returns:
            list: Race URLs in page order (per requested venue when
            ``places`` is given).

        Raises:
            ValueError: If a requested place matches no venue title.
        """
        if hyde:
            options = Options()
            options.add_argument('--headless')
            browser = webdriver.Chrome(options=options)
        else:
            browser = webdriver.Chrome()

        list_css = "#RaceTopRace > div > dl:nth-child({}) > dd > ul > li"

        def wanted_urls(position):
            """Return the race links of the ``position``-th venue block, minus 新馬/障害 races."""
            hrefs = []
            for li in browser.find_elements(By.CSS_SELECTOR, list_css.format(position)):
                title = li.find_element(By.CLASS_NAME, "ItemTitle").text
                if "新馬" not in title and "障害" not in title:
                    hrefs.append(li.find_element(By.CSS_SELECTOR, "a").get_attribute("href"))
            return hrefs

        urls = []
        try:
            browser.get(search_url)
            browser.maximize_window()
            # Venue headers; elements with empty text are ignored.
            title_texts = [title.text for title in browser.find_elements(By.CLASS_NAME, "RaceList_DataTitle")]
            title_texts = [text for text in title_texts if text]
            if not places:
                for position in range(1, len(title_texts) + 1):
                    urls.extend(wanted_urls(position))
            else:
                for place in places:
                    # 1-based position of the first header containing the place.
                    number = [place in text for text in title_texts].index(True) + 1
                    urls.extend(wanted_urls(number))
        finally:
            # Always release the Chrome process; only strings are kept.
            browser.quit()
        return [url.replace("result", "shutuba") for url in urls]

    def make_table(self, race_url):
        """Build the per-runner table for one upcoming race.

        Reads the entry table of ``race_url``, removes scratched runners
        (``印`` == ``取消``), derives per-runner columns, adds race-level
        columns parsed from the page header, and appends the previous-race
        columns produced by ``horse_table``.

        Args:
            race_url: URL of the entry (shutuba) page.

        Returns:
            pandas.DataFrame: One row per remaining runner.

        Side effects:
            Sets ``self.del_index`` (original row positions of scratched
            runners), ``self.odds_tab``, ``self.horse_urls`` and ``self.tab``;
            ``horse_table`` additionally sets ``self.add_tab`` and
            ``self.no_sinba``.
        """
        tab = pd.read_html(race_url)[0]
        # The header has two levels; keep the first.
        tab.columns = [col[0] for col in tab.columns.to_list()]
        self.del_index = tab[tab["印"]=="取消"].index.tolist()
        # Reset the index after removing scratched runners: add_tab is built
        # with a fresh 0..n-1 index and the axis=1 concat below aligns on the
        # index, so gaps here would pair rows with the wrong horse.
        tab = tab[tab["印"]!="取消"].reset_index(drop=True)
        self.odds_tab = tab[["Unnamed: 9_level_0", "人気"]]
        tab = tab.drop(["印", "Unnamed: 9_level_0", "人気", "お気に入り馬"], axis=1)
        tab = tab.rename(columns={"枠": "waku", "馬 番": "num", "馬名": "horse_name", "性齢": "sex_age",
                                  "斤量": "carry", "騎手": "jockey", "厩舎": "kyusya", "馬体重 (増減)": "weight(change)"})
        tab["head"] = len(tab)
        try:
            weight_parts = tab["weight(change)"].str.split("(")
            tab["weight"] = weight_parts.str.get(0)
            # regex=False: ")" is a literal, not a regular expression.
            tab["weight_change"] = weight_parts.str.get(1).str.replace(")", "", regex=False)
        except AttributeError:
            # The column holds no strings (weights presumably not published yet).
            tab["weight"] = np.nan
            tab["weight_change"] = np.nan
        tab.loc[tab["weight_change"]=="前計不", "weight_change"] = np.nan
        tab["sex"] = tab["sex_age"].str[0]
        # Everything after the sex character, so ages of 10 and above are
        # not cut to one digit (same as MakeCSV.make_table).
        tab["age"] = tab["sex_age"].str[1:]
        tab["horse_EW"] = tab["kyusya"].apply(self.to_region)

        html = requests.get(race_url)
        soup = BeautifulSoup(html.content, "html.parser")
        # RaceData01: "/"-separated, used as [0] start time, [1] course,
        # [2] weather, [3] going. RaceData02: list of <span> elements.
        race_info = soup.find(class_="RaceData01").text.replace("\n", "").split("/")
        race_info2 = soup.find(class_="RaceData02").select("span")
        time_pieces = race_info[0].split(":")
        clock = time_pieces[0] + re.search(r'\d+', time_pieces[1]).group()
        try:
            # Number embedded in the second CSS class of the race-name icon.
            rank_num = re.search(r'\d+', soup.find(class_="RaceName").select_one("span")["class"][1]).group()
        except TypeError:
            # No icon <span> present.
            rank_num = 0
        rank_name = race_info2[4].text
        tab["race_rank"] = self.to_race_rank(rank_name, rank_num)
        tab["GD"] = re.search('[障芝ダ]', race_info[1]).group()
        tab["turn"] = re.search('[右左直]', race_info[1]).group()
        tab["dist"] = re.search(r'\d+', race_info[1]).group()
        outer_match = re.search('外', race_info[1])
        tab["course"] = outer_match.group() if outer_match is not None else "内"
        try:
            tab["weather"] = re.search('[晴曇小雨雪]', race_info[2]).group()
            tab["condition"] = re.search('[良稍重不]', race_info[3]).group()
        except IndexError:
            # Weather/going fields are absent from the header.
            tab["weather"] = np.nan
            tab["condition"] = np.nan
        # The race date is taken to be the day the scraper runs. Read the
        # clock once so the three parts cannot straddle midnight.
        today = datetime.now()
        tab["year"] = today.year
        tab["month"] = today.month
        tab["day"] = today.day
        tab["kai"] = re.search(r'\d+', race_info2[0].text).group()
        tab["nichi"] = re.search(r'\d+', race_info2[2].text).group()
        tab["place"] = race_info2[1].text
        tab["prize"] = re.search(r'\d+', race_info2[-1].text).group()
        url_elements = soup.select("td.HorseInfo > div > div > span > a")
        # del_index holds positions in the unfiltered table, which match the
        # positions of the links on the page.
        self.horse_urls = [url_element.get("href") for i, url_element in enumerate(url_elements) if i not in self.del_index]
        tab["horse_url"] = self.horse_urls
        self.tab = tab
        self.horse_table()
        tab = pd.concat([self.tab, self.add_tab], axis=1)
        tab["race"] = tab["year"].astype(str) + tab["month"].astype(str) + tab["day"].astype(str) + clock
        tab = tab.drop(["sex_age", "kyusya", "weight(change)"], axis=1)
        # Despite the list names, both groups are converted to float here.
        int8_col = ["head", "age", "month", "day", "kai", "nichi"]
        int16_col = ["year", "dist"]
        tab[int8_col] = tab[int8_col].astype(float)
        tab[int16_col] = tab[int16_col].astype(float)
        return tab

    def horse_table(self):
        """Build ``self.add_tab``: every runner's most recent form.

        For each URL in ``self.horse_urls`` the horse's race-history table is
        downloaded and its first three rows (the latest starts) are flattened
        into one row of 30 ``preN_*`` values each, preceded by five summary
        values (``interval`` and four "first time" flags). A horse page with
        fewer than four tables is treated as a horse without any start: it
        gets ``[0, "y", "y", "y", "y"]`` and ``self.no_sinba`` becomes False.

        Side effects:
            Sets ``self.no_sinba`` and ``self.add_tab``. ``self.add_tab``
            carries the same index as ``self.tab`` so the two can be joined
            column-wise.
        """
        # History columns that are not carried over (some are replaced by the
        # derived columns added below).
        dropped_columns = ["日付", "開催", "R", "映 像", "オ ッ ズ", "人 気", "騎手", "距離",
                           "馬場 指数", "タイム", "ﾀｲﾑ 指数", "ペース", "馬体重", "厩舎 ｺﾒﾝﾄ",
                           "備考", "勝ち馬 (2着馬)", "賞金"]
        row_list = []
        self.no_sinba = True
        for url in self.horse_urls:
            h_tabs = pd.read_html(url)
            if len(h_tabs) < 4:
                row_list.append([0, "y", "y", "y", "y"])
                self.no_sinba = False
                continue

            # The history table is normally the 4th table on the page; if
            # that one has fewer than 20 columns the 5th is used instead.
            h_tab = h_tabs[3]
            if len(h_tab.columns) < 20:
                h_tab = h_tabs[4]
            # .copy(): columns are added below, which must not happen on a
            # slice of h_tab (chained assignment).
            use_tab = h_tab.iloc[:3].copy()
            race_year, race_month, race_day = self.tab.loc[self.tab["horse_url"]==url, ["year", "month", "day"]].iloc[0]
            pre_year, pre_month, pre_day = use_tab.iloc[0,0].split("/")
            interval = self.day_interval(int(race_year)-int(pre_year), int(race_month)-int(pre_month), int(race_day)-int(pre_day))
            # The order of the assignments below defines the column order of
            # the flattened row and must match ``base_columns``.
            date_parts = use_tab["日付"].str.split("/")
            use_tab["year"] = date_parts.str.get(0)
            use_tab["month"] = date_parts.str.get(1)
            use_tab["day"] = date_parts.str.get(2)
            use_tab["place"] = use_tab["開催"].apply(self.to_place)
            # First/last character of the meeting text, kept only if a digit.
            use_tab["kai"] = use_tab["開催"].str[0]
            use_tab.loc[use_tab["kai"].str.isdigit()==False, "kai"] = "None"
            use_tab["nichi"] = use_tab["開催"].str[-1]
            use_tab.loc[use_tab["nichi"].str.isdigit()==False, "nichi"] = "None"
            use_tab["race_rank"] = use_tab["レース名"].apply(self.to_race_rank)
            race_jockey = self.tab.loc[self.tab["horse_url"]==url, "jockey"].iloc[0]
            use_tab["same_jockey"] = use_tab["騎手"].apply(lambda x: "y" if x == race_jockey else "n")
            use_tab["GD"] = use_tab["距離"].str[0]
            use_tab["dist"] = use_tab["距離"].str[1:]
            use_tab["not_central"] = use_tab["place"].apply(lambda x: "y" if x=="None" else "n")
            # A missing gate number is taken as the sign of an overseas race.
            use_tab["is_overseas"] = use_tab["枠 番"].apply(lambda x: "y" if np.isnan(x) else "n")
            use_tab["time"] = use_tab["タイム"].apply(self.time_to_float)
            use_tab["4_corner"] = use_tab["通過"].fillna("None").str.split("-").str.get(-1)
            pace_parts = use_tab["ペース"].str.split("-")
            use_tab["first_3f"] = pace_parts.str.get(0)
            use_tab["last_3f"] = pace_parts.str.get(1)
            weight_parts = use_tab["馬体重"].str.split("(")
            use_tab["weight"] = weight_parts.str.get(0)
            try:
                # regex=False: ")" is a literal, not a regular expression.
                use_tab["weight_change"] = weight_parts.str.get(1).str.replace(")", "", regex=False)
            except AttributeError:
                # No row had a "(change)" part, so .str is unavailable.
                use_tab["weight_change"] = 0
            use_tab["prize"] = use_tab["賞金"].fillna(0)
            use_tab = use_tab.drop(dropped_columns, axis=1)
            row = np.ravel(use_tab.values).tolist()

            # "First time" flags compare this race with the whole history.
            # .iloc[0]: take the first remaining runner by position; label 0
            # does not exist when the first listed runner was scratched.
            first_dist = "y" if not self.tab["dist"].iloc[0] in h_tab["距離"].str[1:].tolist() else "n"
            first_gd = "y" if not self.tab["GD"].iloc[0] in h_tab["距離"].str[0].tolist() else "n"
            first_place = "y" if not self.tab["place"].iloc[0] in h_tab["開催"].apply(self.to_place).tolist() else "n"
            first_jockey = "y" if not race_jockey in h_tab["騎手"].tolist() else "n"
            row_list.append([interval, first_gd, first_dist, first_place, first_jockey] + row)

        # The constructor needs at least one full-width (95 values) row to
        # accept the 95 column names; pad the first row if there is none.
        if not any(len(row) == 95 for row in row_list):
            row_list[0] = row_list[0] + ["None"]*(95-len(row_list[0]))
        base_columns = ["weather", "race_name", "head", "waku", "num", "result", "carry", "condition", "diff", "corner", "horse_last_3f",
                        "year", "month", "day", "place", "kai", "nichi", "race_rank", "same_jockey", "GD", "dist", "not_central",
                        "is_overseas", "time", "4_corner", "first_3f", "last_3f", "weight", "weight_change", "prize"]
        pre1_columns = ["pre1_" + x for x in base_columns]
        pre2_columns = ["pre2_" + x for x in base_columns]
        pre3_columns = ["pre3_" + x for x in base_columns]
        add_columns = ["interval", "first_gd", "first_dist", "first_place", "first_jockey"] + pre1_columns + pre2_columns + pre3_columns
        # One row per entry of self.horse_urls, i.e. per row of self.tab;
        # share its index so a column-wise concat pairs the right rows.
        self.add_tab = pd.DataFrame(row_list, columns=add_columns, index=self.tab.index)

    def use_selenium(self, race_url, horse_urls, del_index):
        """Scrape odds and browser-only history columns with ``self.browser``.

        Reads the odds and popularity cells of the race page (skipping the
        positions listed in ``del_index``), then visits every horse page and
        reads cells 17 and 20 of its first three history rows.
        NOTE(review): those cells are presumably only filled in for a
        logged-in session - confirm.

        Args:
            race_url: URL of the entry page.
            horse_urls: Horse-page URLs, one per remaining runner.
            del_index: Row positions of scratched runners to leave out.

        Returns:
            pandas.DataFrame: One row per runner with the columns
            ``pre1..3_relative_cond``/``pre1..3_relative_time``, ``odds`` and
            ``popular``.
        """
        column_names = ["pre1_relative_cond", "pre1_relative_time", "pre2_relative_cond",
                        "pre2_relative_time", "pre3_relative_cond", "pre3_relative_time"]
        driver = self.browser

        driver.get(race_url)
        odds_cells = driver.find_elements(By.CSS_SELECTOR, "td.Txt_R.Popular")
        oddss = [cell.text for pos, cell in enumerate(odds_cells) if pos not in del_index]
        pop_cells = driver.find_elements(By.CSS_SELECTOR, "td.Popular.Popular_Ninki.Txt_C")
        pops = [cell.text for pos, cell in enumerate(pop_cells) if pos not in del_index]

        rows = []
        for horse_url in horse_urls:
            driver.get(horse_url)
            time_cells = driver.find_elements(By.CSS_SELECTOR, "td:nth-child(20)")
            if time_cells:
                pre_times = [cell.text for cell in time_cells][:3]
                cond_cells = driver.find_elements(By.CSS_SELECTOR, "td:nth-child(17)")
                pre_conds = [cell.text for cell in cond_cells][:3]
                # Interleave: cond, time, cond, time, ...
                row = ["None"]*(len(pre_conds)+len(pre_times))
                row[::2] = pre_conds
                row[1::2] = pre_times
            else:
                row = ["None"]
            rows.append(row)

        # At least one six-value row is needed for the six column names.
        if all(len(row) != 6 for row in rows):
            rows[0] = rows[0] + ["None"]*(6-len(rows[0]))
        selenium_tab = pd.DataFrame(rows, columns=column_names)
        selenium_tab["odds"] = oddss
        selenium_tab["popular"] = pops
        return selenium_tab
        
    def make_df(self, race_url_list, hyde=False, login_id="", password=""):
        """Build one DataFrame covering every usable race in ``race_url_list``.

        Each race is first parsed with ``self.make_table``. Races for which
        ``self.no_sinba`` is falsy afterwards are skipped and reported at the
        end. The remaining races are then visited with a logged-in Selenium
        browser (``self.use_selenium``) and the two sets of columns are
        concatenated side by side.

        Args:
            race_url_list: Sequence of race page URLs.
            hyde: When true, run Chrome headless.
            login_id: netkeiba login id typed into the login form.
            password: netkeiba password typed into the login form.

        Returns:
            The combined DataFrame (race columns followed by Selenium columns).

        Raises:
            ValueError: From ``pd.concat`` when no race was kept.
        """
        tab_list = []
        kept_races = []  # (race_url, horse_urls, del_index) per kept race
        pass_list = []   # indices into race_url_list of skipped races
        for i, race_url in enumerate(tqdm.tqdm(race_url_list)):
            tab = self.make_table(race_url)
            # make_table is expected to refresh no_sinba / horse_urls /
            # del_index on self for the race it just parsed.
            if self.no_sinba:
                tab_list.append(tab)
                kept_races.append((race_url, self.horse_urls, self.del_index))
            else:
                pass_list.append(i)

        if hyde:
            options = Options()
            options.add_argument('--headless')
            self.browser = webdriver.Chrome(options=options)
        else:
            self.browser = webdriver.Chrome()

        selenium_tab_list = []
        try:
            self.browser.get("https://regist.netkeiba.com/account/?pid=login")
            self.browser.find_element(By.NAME, "login_id").send_keys(login_id)
            self.browser.find_element(By.NAME, "pswd").send_keys(password)
            self.browser.find_element(By.CSS_SELECTOR, "div > form > div > div.loginBtn__wrap > input[type=image]").click()
            for race_url, horse_urls, del_index in tqdm.tqdm(kept_races):
                selenium_tab_list.append(self.use_selenium(race_url, horse_urls, del_index))
        finally:
            # Always shut the driver down so repeated calls (or a failure
            # mid-scrape) do not leave Chrome processes running.
            self.browser.quit()

        race_df = pd.concat(tab_list, ignore_index=True)
        add_df = pd.concat(selenium_tab_list, ignore_index=True)
        df = pd.concat([race_df, add_df], axis=1)
        for index in pass_list:
            print(f'新馬を含むため消したレース: {race_url_list[index]}')
        return df
   
    def make_csv(self, df, name, out_dir="/Users/predict_folder/"):
        """Write ``df`` to ``<out_dir>/<name>.csv``.

        Args:
            df: Object exposing ``to_csv(path)``, normally a DataFrame.
            name: File name without the ``.csv`` extension.
            out_dir: Destination directory. Defaults to the folder that was
                previously hard-coded, so existing calls are unaffected.
        """
        df.to_csv(os.path.join(out_dir, name + ".csv"))
        
    def to_region(self, kyusya):
        """Map a stable description string to a one-character region code.

        The keywords are tested in a fixed order and the first one contained
        in ``kyusya`` decides the result; ``"unknown"`` is returned when none
        of them occurs.
        """
        keyword_to_region = (
            ("栗東", "西"),
            ("美浦", "東"),
            ("地方", "地"),
            ("海外", "外"),
        )
        for keyword, region in keyword_to_region:
            if keyword in kyusya:
                return region
        return "unknown"
        
    def to_race_rank(self, rank_name, rank_num="None"):
        """Return a normalised race-rank label.

        ``rank_num`` takes precedence: the string codes "1", "2", "3", "5"
        and "15" map to G1, G2, G3, オープン and L. Otherwise the first entry
        of ``self.rank_list`` that occurs in ``rank_name`` is used, falling
        back to the string "None". Older or full-width class names are
        finally folded into 1勝 / 2勝 / 3勝.
        """
        grade_codes = (
            ("1", "G1"),
            ("2", "G2"),
            ("3", "G3"),
            ("5", "オープン"),
            ("15", "L"),
        )
        label = None
        for code, grade in grade_codes:
            if rank_num == code:
                label = grade
                break

        if label is None:
            # No grade code given: search the race name, in rank_list order.
            label = next((key for key in self.rank_list if key in rank_name), "None")

        aliases = {
            "500万": "1勝", "１勝": "1勝",
            "1000万": "2勝", "２勝": "2勝",
            "1600万": "3勝", "３勝": "3勝",
        }
        return aliases.get(label, label)
        
    def to_place(self, info):
        """Return the first racecourse from ``self.place_list`` found in ``info``.

        The search follows the order of ``self.place_list``; the string
        "None" is returned when no entry occurs in ``info``.
        """
        return next((venue for venue in self.place_list if venue in info), "None")
        
    def time_to_float(self, time):
        """Convert a race time string to seconds.

        Accepts ``"M:SS.s"`` and, for times under one minute written without
        a colon, plain ``"SS.s"`` (treated as zero minutes). Values without
        string methods, such as NaN or None for a missing time, yield 0.0.

        Args:
            time: Race time as text, or a non-string placeholder.

        Returns:
            The time in seconds as a float.

        Raises:
            ValueError: If a string part cannot be parsed as a number.
        """
        try:
            # Split on the last colon; a seconds-only string has no separator.
            minutes, sep, seconds = time.rpartition(":")
        except AttributeError:
            return 0.0
        if not sep:
            minutes = 0
        return float(minutes) * 60 + float(seconds)
        
    def day_interval(self, a, b, c):
        """Return ``(b + a * 12) * 30 + c``.

        NOTE(review): presumably a, b and c are year, month and day
        differences, giving an approximate day count with 30-day months --
        confirm against the caller.
        """
        total_months = b + (a * 12)
        return total_months * 30 + c

In [17]:
# Create one scraper of each kind: MakeCSV for past races, PredictScr for upcoming races.
mc = MakeCSV()
ps = PredictScr()

In [5]:
# Collect result-page URLs for Nakayama turf races of rank OP/G3/G2/G1 held in December 2023.
urls = mc.search(place="中山", s_year=2023, f_year=2023, s_month=12, f_month=12, rank=["OP", "G3", "G2", "G1"], gd="grass")

レース数: 6


In [6]:
urls

['https://db.netkeiba.com/race/202306050911/',
 'https://db.netkeiba.com/race/202306050811/',
 'https://db.netkeiba.com/race/202306050611/',
 'https://db.netkeiba.com/race/202306050511/',
 'https://db.netkeiba.com/race/202306050211/',
 'https://db.netkeiba.com/race/202306050111/']

In [7]:
# Scrape every collected race URL into a single DataFrame.
# NOTE(review): the no_odds keyword belongs to MakeCSV.make_df, which is defined outside this excerpt.
df = mc.make_df(urls, no_odds=False)

100%|████████████████████████████████████████████████████████████████████| 6/6 [01:32<00:00, 15.45s/it]
100%|███████████████████████████████████████████████████████████████████| 6/6 [11:33<00:00, 115.59s/it]

number of race: 6
race_rank: Counter({'G1': 2, 'L': 2, 'G3': 1, 'G2': 1})
gd: Counter({'芝': 6})
dist: Counter({2000: 1, 2500: 1, 1800: 1, 1600: 1, 1200: 1, 2: 1})





In [8]:
# NOTE(review): mc.dcd is defined outside this excerpt; its effect on df cannot be confirmed from here.
df = mc.dcd(df)

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,result,waku,num,horse_name,carry,jockey,odds,popular,head,horse_EW,weight,weight_change,sex,age,race_rank,GD,turn,dist,course,weather,condition,year,month,day,...,pre3_nichi,pre3_race_rank,pre3_same_jockey,pre3_GD,pre3_dist,pre3_not_central,pre3_is_overseas,pre3_time,pre3_4_corner,pre3_first_3f,pre3_last_3f,pre3_weight,pre3_weight_change,pre3_prize,race,class,reg,pre1_relative_cond,pre1_relative_time,pre2_relative_cond,pre2_relative_time,pre3_relative_cond,pre3_relative_time,relative_cond,relative_time
0,0,1,7,13,レガレイラ,55.0,ルメール,3.1,1.0,15,東,454,-2,牝,2,G1,芝,右,2000,内,晴,良,2023,12,28,...,,,,,,,,,,,,,,,202312281540,0,120.2,-17,81,-16.0,73.0,,,-12,96
1,1,2,3,6,シンエンペラー,56.0,ムルザバ,3.1,2.0,15,西,482,2,牡,2,G1,芝,右,2000,内,晴,良,2023,12,28,...,,,,,,,,,,,,,,,202312281540,0,120.3,-10,94,-20.0,77.0,,,-12,94
2,2,3,3,5,サンライズジパング,56.0,菅原明良,128.7,13.0,15,西,512,-2,牡,2,G1,芝,右,2000,内,晴,良,2023,12,28,...,6.0,未勝利,n,ダ,1800.0,n,n,114.1,3.0,36.5,39.0,514.0,12.0,550.0,202312281540,0,120.6,-7,79,-25.0,74.0,-5.0,83.0,-12,92
3,3,4,2,4,アドミラルシップ,56.0,ドイル,69.2,11.0,15,東,460,2,牡,2,G1,芝,右,2000,内,晴,良,2023,12,28,...,,,,,,,,,,,,,,,202312281540,1,120.7,-6,45,,,,,-12,90
4,4,5,8,18,ミスタージーティー,56.0,坂井瑠星,15.1,7.0,15,西,474,2,牡,2,G1,芝,右,2000,内,晴,良,2023,12,28,...,,,,,,,,,,,,,,,202312281540,1,120.7,-16,70,,,,,-12,91


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 136 columns):
 #    Column              Non-Null Count  Dtype  
---   ------              --------------  -----  
 0    Unnamed: 0          87 non-null     int64  
 1    result              87 non-null     int64  
 2    waku                87 non-null     int64  
 3    num                 87 non-null     int64  
 4    horse_name          87 non-null     object 
 5    carry               87 non-null     float64
 6    jockey              87 non-null     object 
 7    odds                87 non-null     float64
 8    popular             87 non-null     float64
 9    head                87 non-null     int64  
 10   horse_EW            87 non-null     object 
 11   weight              87 non-null     int64  
 12   weight_change       87 non-null     int64  
 13   sex                 87 non-null     object 
 14   age                 87 non-null     int64  
 15   race_rank           87 non-null     obje

In [11]:
# Pass df and the URL list through df_race_url (defined outside this excerpt), then save it as Nakayama_Dec.csv.
df = mc.df_race_url(df, urls)
mc.make_csv(df, "Nakayama_Dec")

In [12]:
# Scrape the entry (shutuba) page of a single upcoming race into a DataFrame.
race_url = "https://race.netkeiba.com/race/shutuba.html?race_id=202406010512"
pdf = ps.make_df([race_url])

100%|████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.90s/it]
100%|████████████████████████████████████████████████████████████████████| 1/1 [00:37<00:00, 37.66s/it]


In [13]:
pdf.head()

Unnamed: 0,waku,num,horse_name,carry,jockey,head,weight,weight_change,sex,age,horse_EW,race_rank,GD,turn,dist,course,weather,condition,year,month,day,kai,nichi,place,prize,...,pre3_place,pre3_kai,pre3_nichi,pre3_race_rank,pre3_same_jockey,pre3_GD,pre3_dist,pre3_not_central,pre3_is_overseas,pre3_time,pre3_4_corner,pre3_first_3f,pre3_last_3f,pre3_weight,pre3_weight_change,pre3_prize,race,pre1_relative_cond,pre1_relative_time,pre2_relative_cond,pre2_relative_time,pre3_relative_cond,pre3_relative_time,odds,popular
0,1,1,エールトゥヘヴン,58.0,横山武,16.0,468,-4,牡,5.0,東,1勝,芝,右,1600.0,外,晴,良,2024.0,1.0,16.0,1.0,5.0,中山,800,...,東京,3,7,1勝,n,芝,1600,n,n,92.9,11,34.0,35.1,466,8,320.0,20241161625,,,-18,71,-25,92,6.5,3
1,1,2,ラコンタール,57.0,原,16.0,482,-2,牡,5.0,東,1勝,芝,右,1600.0,外,晴,良,2024.0,1.0,16.0,1.0,5.0,中山,800,...,東京,1,4,1勝,n,ダ,1300,n,n,80.6,12,29.6,37.3,486,4,0.0,20241161625,,,-17,65,-1,68,204.7,15
2,2,3,ジュドー,57.0,ルメール,16.0,486,6,牡,4.0,東,1勝,芝,右,1600.0,外,晴,良,2024.0,1.0,16.0,1.0,5.0,中山,800,...,東京,4,4,1勝,n,芝,1600,n,n,93.7,5,35.2,34.6,480,0,200.0,20241161625,,,-21,88,-15,94,3.1,1
3,2,4,ロゼル,57.0,ピーヒュレ,16.0,488,6,牡,4.0,東,1勝,芝,右,1600.0,外,晴,良,2024.0,1.0,16.0,1.0,5.0,中山,800,...,東京,4,8,1勝,n,ダ,2100,n,n,134.4,12,31.9,37.2,480,4,0.0,20241161625,,,-18,74,-5,76,18.8,7
4,3,5,ヴァンガーズハート,58.0,三浦,16.0,520,-4,セ,5.0,東,1勝,芝,右,1600.0,外,晴,良,2024.0,1.0,16.0,1.0,5.0,中山,800,...,札幌,1,5,1勝,n,芝,1500,n,n,91.5,13,30.8,35.3,526,-6,0.0,20241161625,,,-18,74,-5,70,14.6,6


In [14]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 130 columns):
 #    Column              Non-Null Count  Dtype  
---   ------              --------------  -----  
 0    waku                16 non-null     int64  
 1    num                 16 non-null     int64  
 2    horse_name          16 non-null     object 
 3    carry               16 non-null     float64
 4    jockey              16 non-null     object 
 5    head                16 non-null     float64
 6    weight              16 non-null     object 
 7    weight_change       16 non-null     object 
 8    sex                 16 non-null     object 
 9    age                 16 non-null     float64
 10   horse_EW            16 non-null     object 
 11   race_rank           16 non-null     object 
 12   GD                  16 non-null     object 
 13   turn                16 non-null     object 
 14   dist                16 non-null     float64
 15   course              16 non-null     obje

In [18]:
# Save the upcoming-race table as predict_race.csv.
ps.make_csv(pdf, "predict_race")