In [23]:
from bs4 import BeautifulSoup
import bs4
import browser_cookie3
import requests
import pandas as pd
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Date, Float, Text

pd.set_option('display.max_columns', 500)

In [94]:
import re
import datetime
Base = declarative_base()


def parse_time(time_str: str) -> float:
    try:
        minutes, seconds = time_str.split(":")
        return float(minutes) * 60 + float(seconds)
    except:
        return None


def to_float(num_str: str) -> float:
    try:
        return float(num_str)
    except:
        return None

class HorsePed(Base):
    __tablename__ = "horse_ped"
    _id = Column(String(15), primary_key=True)
    father = Column(String(15)) # 父
    mother = Column(String(15)) # 母

    
    def from_table_ichiran(self, table: bs4.element.ResultSet) -> None:
        """
        馬のTOPページから父母のIDを取得する
        血統のページでないので注意

        Args:
            table (bs4.element.ResultSet): 馬のTOPページの血統表
        """
        ids =[td.find("a").get("href") for td in table.find_all("td")]
        self.father = ids[0].replace("/horse/ped/", "").replace("/", "")
        self.mother = ids[3].replace("/horse/ped/", "").replace("/", "")
    
    def set_id(self, id: int) -> None:
        self._id = id  
    

class HorseResult(Base):
    __tablename__ = "horse_result"
    
    _id = Column(Integer, primary_key=True)
    race_id = Column(Integer, primary_key=True)
    kishu_id = Column(Integer)
    
    date = Column(Date)
    place = Column(Text)
    weather = Column(String(2))
    R = Column(Integer)
    name = Column(Text)
    movie_link = Column(Text)
    num = Column(Integer)  # 頭数
    wakuban = Column(Integer)  # 枠番
    umaban = Column(Integer)  # 馬番
    odds = Column(Float)  # 単勝オッズ
    ninki = Column(Integer)  # 人気
    chakujun = Column(Integer)  # 着順
    kishu_name = Column(String(30))  # 騎手
    kinryo = Column(Integer)  # 斤量
    track = Column(String(1))  # 芝/ダート/障害
    distance = Column(Integer)  # 距離
    firm = Column(String(1))  # 良/稍重/重/不良
    truck_index =Column(Integer) # 馬場指数
    time = Column(Float)  # タイム
    chakusa = Column(String(10))  # 着差
    time_index = Column(Integer)  # タイム指数
    tsuka_1 = Column(Integer)
    tsuka_2 = Column(Integer)
    tsuka_3 = Column(Integer)
    tsuka_4 = Column(Integer)
    pace = Column(String(25))  # ペース
    nobori = Column(Float)  # 上り
    bataiju = Column(Integer)  # 馬体重(左)
    zougen = Column(Integer)  # 馬体重(右)
    umaya_comment = Column(Text)  # 厩舎コメント
    bikou = Column(String(30))  # 備考
    winner = Column(String(30))  # 勝ち馬
    shokin = Column(Float)  # 賞金
    
    def from_tr(self, tr: bs4.element.ResultSet) -> None:
        tds = tr.find_all("td")

        self.date = datetime.date.fromisoformat(tds[0].get_text(strip=True).replace("/", "-"))
        self.place = re.sub(r"\d", "", tds[1].get_text(strip=True))
        self.weather = tds[2].get_text(strip=True)
        self.R = tds[3].get_text(strip=True)
        self.name = tds[4].get_text(strip=True)
        self.race_id = re.findall(r"\d{5,}", tds[4].find("a").get("href"))[0]
        self.movie_link = (tds[5].find("a") or tds[5]).get(
            "href", None
        )  # 動画がない場合はNone
        self.num = tds[6].get_text(strip=True)
        self.wakuban = tds[7].get_text(strip=True)
        self.umaban = tds[8].get_text(strip=True)
        self.odds = to_float(tds[9].get_text(strip=True))
        self.ninki = (
            tds[10].get_text(strip=True)
            if tds[10].get_text(strip=True).isdecimal()
            else None
        )
        chakujun_num = re.findall(r"\d+", tds[11].get_text(strip=True))
        if chakujun_num == []:
            self.chakujun = None
        else:
            self.chakujun = int(chakujun_num[0])
        self.kishu_name = tds[12].get_text(strip=True)
        self.kishu_id = re.findall(r"\d{5,}", tds[12].find("a").get("href"))[0]
        self.kinryo = tds[13].get_text(strip=True)
        self.track = tds[14].get_text(strip=True)[0]
        self.distance = int(tds[14].get_text(strip=True)[1:])
        self.firm = tds[15].get_text(strip=True)
        self.truck_index = tds[16].get_text(strip=True)
        self.time = parse_time(tds[17].get_text(strip=True))
        self.chakusa = (
            tds[18].get_text(strip=True)
            if tds[18].get_text(strip=True)
            else None
        )
        self.time_index = (
            int(tds[19].get_text(strip=True))
            if tds[19].get_text(strip=True)
            else None
        )
        self.parse_tsuka(tds[20].get_text(strip=True))
        self.pace = tds[21].get_text(strip=True)
        self.nobori = to_float(tds[22].get_text(strip=True))
        self.parse_taiju(tds[23].get_text(strip=True))
        self.umaya_comment = (tds[24].find("a") or tds[24]).get("href", None)
        self.bikou = tds[25].get_text(strip=True)
        self.winner = tds[26].get_text(strip=True)
        self.shokin = to_float(tds[27].get_text(strip=True))
        
    def parse_tsuka(self, tsuka: str) -> None:

        if len(tsuka) == 0:  # 空白の時
            return
        tsukas = tsuka.split("-")
        if len(tsukas) == 0:
            return
        if len(tsukas) >= 1:
            self.tsuka_1 = int(tsukas[0])
        if len(tsukas) >= 2:
            self.tsuka_2 = int(tsukas[1])
        if len(tsukas) >= 3:
            self.tsuka_3 = int(tsukas[2])
        if len(tsukas) >= 4:
            self.tsuka_4 = int(tsukas[3])
    
    def parse_taiju(self, taiju: str) -> None:
        if taiju:
            taiju = taiju.split("(")
            try:
                self.bataiju = int(taiju[0])
            except:
                pass
            try:
                self.zougen = int(taiju[1][:-1])
            except:
                pass
            
    def set_id(self, id: int) -> None:
        self._id = id
        

In [87]:
url = r"https://db.netkeiba.com/horse/2019104462/"
cj = browser_cookie3.firefox()
r = requests.get(url, cookies=cj)

_id = re.findall(r"\d{5,}", url)[0]

In [88]:
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, "html.parser")
table = soup.find_all("table")

if len(table) == 0:
    exit(0) 
res =[td.find("a").get("href") for td in table[0].find_all("td")]

AttributeError: 'NoneType' object has no attribute 'get'

In [95]:
x=soup.find("table",class_="blood_table")
k = HorsePed()
k.set_id(_id)
k.from_table_ichiran(x)
k.__dict__

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x20202489f50>,
 '_id': '2019104462',
 'father': '2001103114',
 'mother': '000a013886'}

In [86]:
[td.find("a").get("href") for td in x.find_all("td")]

['/horse/2001103114/',
 '/horse/000a00033a/',
 '/horse/000a0012bf/',
 '/horse/000a000f2b/',
 '/horse/000a001042/',
 '/horse/000a0078a6/',
 '/horse/000a007459/',
 '/horse/000a0013f0/',
 '/horse/000a0072a0/',
 '/horse/000a008c1e/',
 '/horse/000a0019b6/',
 '/horse/000a000e20/',
 '/horse/000a006c87/',
 '/horse/000a008c1d/',
 '/horse/000a0019ba/',
 '/horse/000a008c1c/',
 '/horse/1988107320/',
 '/horse/000a000258/',
 '/horse/000a000e04/',
 '/horse/000a000f8c/',
 '/horse/000a00702e/',
 '/horse/000a007055/',
 '/horse/000a001383/',
 '/horse/000a00770e/',
 '/horse/000a000768/',
 '/horse/000a000dd2/',
 '/horse/000a00100a/',
 '/horse/000a0078e1/',
 '/horse/000a0069d4/',
 '/horse/000a0013b8/',
 '/horse/000a00773e/',
 '/horse/000a013886/',
 '/horse/000a01198f/',
 '/horse/000a0103cf/',
 '/horse/000a001c5e/',
 '/horse/000a001340/',
 '/horse/000a0085ee/',
 '/horse/000a0000d2/',
 '/horse/000a000e04/',
 '/horse/000a009701/',
 '/horse/000a011994/',
 '/horse/000a01198e/',
 '/horse/000a001ecc/',
 '/horse/00

In [74]:
res.insert(0, "2019104462")

In [None]:
res[]

In [65]:
soup = BeautifulSoup(r.text, "html.parser")
table = soup.find_all("table")
if len(table) == 0:
    exit(0) 

_id = re.findall(r"\d{5,}", url)[0]
tr = table[0].find_all("tr")[2]
hr = HorseResult()
hr.set_id(_id)
hr.from_tr(tr)
hr.__dict__

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x20203c553d0>,
 '_id': '2019104462',
 'date': datetime.date(2022, 11, 20),
 'place': '阪神',
 'weather': '曇',
 'R': '11',
 'name': 'マイルチャンピオンS(G1)',
 'race_id': '202209050611',
 'movie_link': '/race/movie/202209050611',
 'num': '17',
 'wakuban': '5',
 'umaban': '10',
 'odds': 9.2,
 'ninki': '6',
 'chakujun': 1,
 'kishu_name': 'レーン',
 'kishu_id': '05585',
 'kinryo': '56',
 'track': '芝',
 'distance': 1600,
 'firm': '良',
 'truck_index': '-6',
 'time': 92.5,
 'chakusa': '-0.2',
 'time_index': 114,
 'tsuka_1': 14,
 'tsuka_2': 13,
 'pace': '35.1-34.0',
 'nobori': 33.0,
 'bataiju': 486,
 'zougen': -4,
 'umaya_comment': '/?pid=horse_comment&id=2019104462&rid=202209050611',
 'bikou': '',
 'winner': '(ダノンザキッド)',
 'shokin': None}

In [None]:
pd.read_html(r.text)[0]

In [21]:
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, "html.parser")