# In [11]:
import glob
import os
import re
import time
import csv
from typing import Any

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from tqdm.notebook import tqdm
import pandas as pd

# Directories used by the scraper: the first holds the cached HTML pages,
# the second holds the generated CSV tables.
html_dir_path, csv_dir_path = (
    '/Users/eee/python/jra_ml/data/html',
    '/Users/eee/python/jra_ml/data/table',
)

class Scrape:
    """Scraper for db.netkeiba.com race and horse pages, backed by an HTML cache.

    Every page is looked up in the HTML cache directory first (through
    ``self.hfm``); only pages missing from the cache are downloaded, and each
    download is saved for later runs.  The parsed page currently being
    worked on is kept in ``self.bs``.
    """

    # Site root, prepended to the relative hrefs found in calendar pages.
    BASE_URL = 'https://db.netkeiba.com'
    # Default calendar page at which the backwards walk stops
    # (limits scraping to the range a free member can view).
    DEFAULT_STOP_URL = 'https://db.netkeiba.com/?pid=race_top&date=20000506'
    # Seconds to wait for the server; requests has no timeout by default.
    REQUEST_TIMEOUT = 30

    # Direction markers, tested in this order against the course description.
    _AROUND_MARKERS = ('右', '左', '直線')
    # (marker searched in the description, value stored), most specific first.
    _COURSE_POSITIONS = (
        ('外・内', '外・内'),
        ('外-内', '外-内'),
        ('内2周', '内'),
        ('外2周', '外'),
        ('外', '外'),
        ('内', '内'),
    )

    def __init__(self, html_dir_path, csv_dir_path):
        """Create the HTML cache manager and the CSV writer for the given directories."""
        self.hfm = HtmlFileManager(html_dir_path)
        self.cfm = CsvFileManager(csv_dir_path)
        self.url = 'https://db.netkeiba.com/'
        self.response = ''
        self.bs = ''

    def set_bs(self, bs):
        """Replace the parsed page the scrape_* methods read from."""
        self.bs = bs

    def set_url(self, url):
        """Set the URL used by the next download."""
        self.url = url

    def get_response(self) -> None:
        """Download ``self.url`` and parse it into ``self.bs``.

        Raises:
            requests.HTTPError: if the server answers with an error status,
                so that error pages never end up in the HTML cache.
        """
        ua = UserAgent()
        header = {'user-agent': ua.chrome}

        # The second positional argument of requests.get is ``params``; the
        # parser name belongs to BeautifulSoup only, so it is not passed here.
        self.response = requests.get(
            self.url, headers=header, timeout=self.REQUEST_TIMEOUT)
        self.response.raise_for_status()
        self.response.encoding = self.response.apparent_encoding
        self.bs = BeautifulSoup(self.response.content, 'lxml')

        # Pause between consecutive downloads.
        time.sleep(1)

    def _load_page(self, url, parse_cached=True):
        """Make the page at *url* available, preferring the cached HTML file.

        When the page is cached it is parsed into ``self.bs`` unless
        *parse_cached* is false (for callers that only need the file on
        disk).  Otherwise it is downloaded, parsed and written to the cache.
        """
        self.set_url(url)
        self.hfm.convert_html_file_name(self.url)
        if self.hfm.file_exists():
            if parse_cached:
                self.set_bs(self.hfm.load_html())
        else:
            self.get_response()
            self.hfm.save_html(self.bs)

    def scrape_race_schedule(self, url, stop_url=DEFAULT_STOP_URL):
        """Collect the per-day race-list paths from the calendar, walking backwards.

        Starting at *url*, the calendar page is scraped and its "previous"
        (``前へ``) link is followed month by month.  The walk ends when a
        page has no such link or when the link equals *stop_url* (that page
        itself is not visited).

        Args:
            url: calendar page to start from.
            stop_url: previous-month URL at which to stop following links.

        Returns:
            List of ``/race/list/...`` hrefs, newest page first.
        """
        calendar_link = re.compile(r'\?pid=race_top&')
        race_list_link = re.compile('/race/list/')
        starts_with_digit = re.compile('^[0-9]')

        race_list_path = []
        next_url = url
        # Iterative walk: one page per month would otherwise cost one
        # recursion level per month.
        while next_url:
            self._load_page(next_url)

            race_cal_tag = self.bs.find_all(href=calendar_link)
            pre_month_date_url = ''
            if len(race_cal_tag) > 1 and race_cal_tag[1].get('title') == '前へ':
                pre_month_date_url = self.BASE_URL + race_cal_tag[1].get('href')

            for rlt in self.bs.find_all(href=race_list_link):
                # Keep only links whose text starts with a digit.
                label = rlt.contents[0] if rlt.contents else None
                if isinstance(label, str) and starts_with_digit.match(label):
                    race_list_path.append(rlt.get('href'))

            if pre_month_date_url == stop_url:
                break
            next_url = pre_month_date_url

        return race_list_path

    def scrape_race_id(self, url: str) -> list:
        """Return the race hrefs listed on one race-list page.

        Races whose type text contains ``障`` are skipped.
        """
        self._load_page(url)

        race_id_list = []
        for rlt in self.bs.find_all(class_=re.compile('race_top_data_info')):
            race_entry = rlt.contents[3]
            race_id = race_entry.contents[1].get('href')
            race_type = race_entry.contents[4].contents[0]
            if '障' not in race_type:
                race_id_list.append(race_id)

        return race_id_list

    def scrape_race_info(self, url: str, race_id: str):
        """Scrape one race page.

        Returns:
            Tuple ``(course_info_list, horse_id_list, race_result_df)`` where
            ``horse_id_list`` holds ``[horse_id, horse_name]`` pairs and the
            result table has ``race_id`` and ``horse_id`` columns added.
        """
        self._load_page(url)

        # Work through this instance rather than a module-level scraper.
        course_info_list = self.scrape_course_info(race_id)
        horse_id_list = [
            [href.split('/')[2], name] for href, name in self.scrape_horse_id()
        ]
        race_result_df = self.scrape_race_result().assign(
            race_id=race_id,
            horse_id=[pair[0] for pair in horse_id_list],
        )

        return course_info_list, horse_id_list, race_result_df

    def scrape_course_info(self, race_id):
        """Extract the course description of the race page held in ``self.bs``.

        Returns:
            ``[race_id, course_name, course_len, around, course_position,
            weather, race_type, ground_state, race_date, race_class]``.
            ``around`` and ``course_position`` are ``'N/A'`` when the page
            gives no such marker.
        """
        course_info_tag = self.bs.find_all('diary_snap_cut')[0]
        temp_list = str(course_info_tag.contents[1]).split('\xa0')
        course_info_list = [t for t in temp_list if t != '/']
        course_desc = course_info_list[0]

        # ``split_str`` ends up as the last marker found; the course length
        # is whatever follows it in the description.
        if '芝' in course_desc:
            race_type, split_str = '芝', '芝'
        else:
            race_type, split_str = 'ダート', 'ダ'

        # Default keeps ``around`` defined when no direction marker is present.
        around = 'N/A'
        for marker in self._AROUND_MARKERS:
            if marker in course_desc:
                around = split_str = marker
                break

        course_position = 'N/A'
        for marker, position in self._COURSE_POSITIONS:
            if marker in course_desc:
                course_position, split_str = position, marker
                break

        course_name_tag = self.bs.find(href=re.compile('/race/' + race_id))
        course_name = course_name_tag.contents[0]

        course_len = int(course_desc.split(split_str)[1].rstrip('m'))
        weather = course_info_list[1].split(':')[1]
        ground_state = course_info_list[2].split(':')[1]

        # The fifth <p> of the page carries the date and the race class.
        race_date_class = self.bs.find_all('p')[4].contents[0].split(' ')
        race_date = race_date_class[0]
        race_class = race_date_class[2].split('\xa0')[0]

        return [race_id, course_name, course_len, around, course_position,
                weather, race_type, ground_state, race_date, race_class]

    def scrape_horse_id(self):
        """Return ``[href, link text]`` for every link whose href starts with ``/horse``."""
        horse_id_tag = self.bs.find_all(href=re.compile('^/horse'))
        return [[hit.get('href'), hit.contents[0]] for hit in horse_id_tag]

    def scrape_race_result(self):
        """Read the first table of the current cached HTML file as a DataFrame."""
        html_path = os.path.join(self.hfm.dir_path, self.hfm.file_name)
        return pd.read_html(html_path, header=0)[0]

    def scrape_horse_pedigree(self, url: str, horse_id_name: list):
        """Scrape the pedigree page of one horse.

        Args:
            url: pedigree page URL.
            horse_id_name: ``[horse_id, horse_name]``; it is not modified.

        Returns:
            A new list starting with the items of *horse_id_name*, followed
            by one ``name(id)`` string per pedigree link.
        """
        self._load_page(url)

        horse_pedigree_tag = [
            pt for pt in self.bs.find_all(href=re.compile('/horse/[0-9]'))
            if not pt.get('title')
        ]

        # Copy so the caller's list is not extended as a side effect.
        horse_pedigree_list = list(horse_id_name)

        for hpt in horse_pedigree_tag:
            # Strip leftover <span class="red"> markup and newlines.
            temp_con = str(hpt.contents[0])
            temp_con = temp_con.replace('<span class="red">', '')
            temp_con = temp_con.replace('</span', '')
            temp_con = temp_con.replace('\n', '')
            temp_con = temp_con.replace('>', '')

            ancestor_id = hpt.get('href').split('/')[2]
            horse_pedigree_list.append(temp_con + '(' + ancestor_id + ')')

        return horse_pedigree_list

    def scrape_horse_result(self, url: str, horse_id_name: list):
        """Return the first table of a horse's result page with a ``horse_id`` column added."""
        # The table is read straight from the cached file, so a cached page
        # does not need to be parsed into ``self.bs``.
        self._load_page(url, parse_cached=False)

        html_path = os.path.join(self.hfm.dir_path, self.hfm.file_name)
        horse_result_df = pd.read_html(html_path, header=0)[0]
        return horse_result_df.assign(horse_id=horse_id_name[0])

class FileManager:
    """Track a directory and the name of the file currently being handled in it."""

    def __init__(self, dir_path):
        """Remember *dir_path*; no file is selected yet (``file_name`` is empty)."""
        self.dir_path = dir_path
        self.file_name = ''

    def set_dir_path(self, dir_path):
        """Change the directory that file names are resolved against."""
        self.dir_path = dir_path

    # Original (misspelled) name, kept so existing callers keep working.
    set_dir_pat = set_dir_path

    def set_file_name(self, file_name):
        """Select the file that subsequent operations refer to."""
        self.file_name = file_name

    def file_exists(self) -> bool:
        """Return True if the selected file is an existing regular file in ``dir_path``.

        Returns False when no file is selected or when the directory itself
        is missing.  A direct path check is used instead of listing the
        directory, so the cost does not grow with the number of files.
        """
        if not self.file_name:
            return False
        return os.path.isfile(os.path.join(self.dir_path, self.file_name))
            
class HtmlFileManager(FileManager):
    """Read and write cached HTML pages, named after the URL they came from."""

    # File-name prefix -> regex recognising that kind of page in a URL.
    # Checked in this order; the first match wins.
    _PAGE_PATTERNS = {
        'race_cal_': r'\?pid=race_top',
        'race_list_': r'race/list/',
        'race_info_': r'race/[0-9]',
        'horse_result_': r'horse/result/',
        'horse_pedigree_': r'horse/ped/',
    }

    def __init__(self, dir_path):
        super().__init__(dir_path)

    def load_html(self) -> Any:
        """Parse the selected cached file with lxml and return the soup."""
        file_path = os.path.join(self.dir_path, self.file_name)
        # Context manager so the handle is closed once parsing is done.
        with open(file_path, encoding='utf-8') as f:
            return BeautifulSoup(f, 'lxml')

    def save_html(self, bs):
        """Write ``str(bs)`` to the selected cache file as UTF-8."""
        file_path = os.path.join(self.dir_path, self.file_name)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(bs))

    def convert_html_file_name(self, url) -> str:
        """Derive the cache file name for *url* and select it.

        The name is the prefix of the first matching page pattern, followed
        by every digit found in the URL (site root removed), plus ``.html``.

        Returns:
            The file name, which is also stored in ``self.file_name``.

        Raises:
            ValueError: if *url* matches none of the known page patterns;
                the previously selected name is then left untouched rather
                than being silently reused for a different page.
        """
        path = url.replace('https://db.netkeiba.com/', '')
        for prefix, pattern in self._PAGE_PATTERNS.items():
            if re.search(pattern, path):
                self.file_name = prefix + re.sub(r'\D', '', path) + '.html'
                return self.file_name
        raise ValueError(
            f'Cannot derive a cache file name from unrecognised URL: {url!r}')

            
class CsvFileManager(FileManager):
    """Create the selected CSV file and append rows or DataFrames to it."""

    def __init__(self, dir_path):
        super().__init__(dir_path)

    def make_csvfile(self, column):
        """Create (or truncate) the selected file and write *column* as its header row."""
        file_path = os.path.join(self.dir_path, self.file_name)
        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(column)

    def write_list_to_csv(self, data):
        """Append *data* as one row to the selected file."""
        file_path = os.path.join(self.dir_path, self.file_name)
        # newline='' as required by the csv module; UTF-8 to match the
        # encoding pandas' to_csv uses by default for the same tables.
        with open(file_path, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(data)

    def write_df_to_csv(self, data: Any, index_name: str):
        """Append DataFrame *data* without a header, with *index_name* as first column."""
        file_path = os.path.join(self.dir_path, self.file_name)
        data.set_index(index_name).to_csv(file_path, mode='a', header=False)

# In [None]:
if __name__ == '__main__':
    # Script entry point: walk the race calendar backwards from
    # ``race_top_url``, then scrape every race and every horse that ran,
    # writing four CSV tables.
    base_url = 'https://db.netkeiba.com'
    race_top_url = 'https://db.netkeiba.com/?pid=race_top&date=20080105'

    # Header rows of the output tables.
    race_result_column = ['race_id', '着順', '枠番', '馬番', '馬名', '性齢', '斤量', '騎手', 'タイム', '着差', '単勝', '人気', '馬体重', '調教師', 'horse_id']
    course_info_column = ['race_id', 'course_name', 'course_len', 'around', 'course_position', 'weather', 'race_type', 'ground_state', 'date', 'race_class']
    horse_pedigree_column = ['horse_id', 'horse_name',
            'f_1_1', 'f_2_1', 'f_3_1', 'f_4_1', 'f_5_1',
            'm_5_1', 'm_4_1', 'f_5_2', 'm_5_2', 'm_3_1',
            'f_4_2', 'f_5_3', 'm_5_3', 'm_4_2', 'f_5_4',
            'm_5_4', 'm_2_1', 'f_3_2', 'f_4_3', 'f_5_5',
            'm_5_5', 'm_4_3', 'f_5_6', 'm_5_6', 'm_3_2',
            'f_4_4', 'f_5_7', 'm_5_7', 'm_4_4', 'f_5_8',
            'm_5_8', 'm_1_1', 'f_2_2', 'f_3_3', 'f_4_5',
            'f_5_9', 'm_5_9', 'm_4_5', 'f_5_10', 'm_5_10',
            'm_3_3', 'f_4_6', 'f_5_11', 'm_5_11', 'm_4_6',
            'f_5_12', 'm_5_12', 'm_2_2', 'f_3_4', 'f_4_7',
            'f_5_13', 'm_5_13', 'm_4_7', 'f_5_14', 'm_5_14',
            'm_3_4', 'f_4_8', 'f_5_15', 'm_5_15', 'm_4_8',
            'f_5_16', 'm_5_16'
            ]
    horse_result_column = ['horse_id', '日付', '開催', '天気', 'R', 'レース名', '映像',
            '頭数', '枠番', '馬番', 'オッズ', '人気', '着順', '騎手', '斤量',
            '距離', '馬場', '馬場指数',  'タイム', '着差', 'ﾀｲﾑ指数', '通過',
            'ペース', '上り', '馬体重', '厩舎ｺﾒﾝﾄ', '備考', '勝ち馬(2着馬)', '賞金'
            ]

    # Output table file names (created inside the CSV directory).
    course_info_file = 'course_info.csv'
    race_result_file = 'race_result.csv'
    horse_pedigree_file = 'horse_pedigree.csv'
    horse_result_file = 'horse_result.csv'

    s = Scrape(html_dir_path, csv_dir_path)

    # Create each table with its header row (existing files are overwritten).
    for table_file, table_column in (
            (course_info_file, course_info_column),
            (race_result_file, race_result_column),
            (horse_pedigree_file, horse_pedigree_column),
            (horse_result_file, horse_result_column)):
        s.cfm.set_file_name(table_file)
        s.cfm.make_csvfile(table_column)

    print("Start scraping!!!")

    # Get race schedule.
    race_list_path = s.scrape_race_schedule(race_top_url)

    # Get the race ids listed for each day.
    race_id_list = []
    for rlp in race_list_path:
        race_id_list.extend(s.scrape_race_id(base_url + rlp))

    # Horses already written to the horse tables.  Kept across races so a
    # horse that ran several times is scraped and written only once.
    scraped_data = set()

    # Get race detail (course_info, race_result) per race.
    for ril in race_id_list:
        race_id_url = base_url + ril
        race_id = ril.split('/')[2]
        course_info_list, horse_id_list, race_result_df = s.scrape_race_info(race_id_url, race_id)

        s.cfm.set_file_name(course_info_file)
        s.cfm.write_list_to_csv(course_info_list)

        s.cfm.set_file_name(race_result_file)
        s.cfm.write_df_to_csv(race_result_df, 'race_id')

        # Get horse info (horse_pedigree, horse_result) per horse.
        for hil in horse_id_list:
            if hil[0] in scraped_data:
                continue

            horse_pedigree_url = base_url + '/horse/ped/' + hil[0]
            horse_pedigree_list = s.scrape_horse_pedigree(horse_pedigree_url, hil)

            s.cfm.set_file_name(horse_pedigree_file)
            s.cfm.write_list_to_csv(horse_pedigree_list)

            horse_result_url = base_url + '/horse/result/' + hil[0]
            horse_result_df = s.scrape_horse_result(horse_result_url, hil)

            s.cfm.set_file_name(horse_result_file)
            s.cfm.write_df_to_csv(horse_result_df, 'horse_id')

            scraped_data.add(hil[0])

    print("End scraping!!!")

# Output: Start scraping!!!
