In [2]:
# 必要なインポートを実施
# スクレイピング用フレームワーク
from bs4 import BeautifulSoup
# リクエスト取得ライブラリ
import requests
# お馴染みPandas
import pandas as pd
# お馴染みSeriesとDataFrame
from pandas import Series,DataFrame
from datetime import datetime as dt
# 正規表現ライブラリ
import re
# データフレームを綺麗に表示させるためのライブラリ
from IPython.display import display, HTML


In [3]:
# 競馬情報の取得をWebスクレイピングで実施
HOME_URL = 'https://www.nankankeiba.com/'
PLACE = '川崎'

# URLからコンテンツを取得する
def url_to_soup(url):
    req = requests.get(url)
    return BeautifulSoup(req.content, 'html.parser')

# 各馬の過去１０レースリンクを取得(出走表のページを設定)
def horse_page_link(url):
    soup = url_to_soup(url)
    link_list = [HOME_URL + x.get('href') for x in soup.find_all('a', class_='tx-mid tx-low') ]
    return link_list

hors_page_link_list = horse_page_link(HOME_URL + 'race_info/2018082321060403.do')
print(len(hors_page_link_list))

12


In [4]:
# HTMLのタグを排除する正規表現
p = re.compile(r"<[^>]*?>")

tag_to_text = lambda x: p.sub("", x).split('\n') 
split_tr = lambda x: str(x).split('</tr>')

# tableタグを取得し、trタグでsqlitする
def get_previous_race_row(soup):
    race_table = soup.select("table.tb01")[2]
    return [tag_to_text(x)  for x in split_tr(race_table)]

# 各馬の過去10レースを取得し、データフレームに入れ込む
def horse_data(url):
    soup = url_to_soup(url)

    # 過去のレースデータ
    pre_race_data = get_previous_race_row(soup)
    df = pd.DataFrame(pre_race_data)[1:][[2,3,10,11,13,14,15,19,23]].dropna().rename(columns={
        2:'date', 3:'place', 10:'len', 11:'wether', 13:'rank', 14:'popularity', 15:'time',19:'weight',23:'money'})
    horse_name = soup.find('h2', id='tl-prof').get_text()
    return horse_name, df


In [22]:
# 馬場状態のカラム内容を文字列によって変更する
def add_soil_columns(row):
    if row['wether'][-2:] =='/重':
        row['soil'] = 3
    elif row['wether'][-2:] =='稍重':
        row['soil'] = 2
    elif row['wether'][-2:] =='/良':
        row['soil'] = 1
    elif row['wether'][-2:] =='不良':
        row['soil'] = 4
    else :
        row['soil'] = 0
    return row

# 天気のカラム内容を文字列によって変更する
# def add_wether_columns(row):
#     if row['wether'].startswith('晴'):
#         row['wetherNum'] = 1
#     elif row['wether'].startswith('曇'):
#         row['wetherNum'] = 2
#     elif row['wether'].startswith('雨'):
#         row['wetherNum'] = 3
#     else : row['wetherNum'] = 0
#     return row
def add_wether_columns(row):
        row['sunny'] = 1 if row['wether'].startswith('晴') else 0
        row['cloudy'] = 1 if row['wether'].startswith('曇') else 0
        row['rainny'] = 1 if row['wether'].startswith('雨') else 0
        return row
    

# レースデータのカラムを加工
def add_race_data(df):
    df_ =pd.DataFrame()
    for idx, row in df.iterrows():
        if row['popularity'] == '':
            continue

        # 馬場状態
        row = add_soil_columns(row)
        row = add_wether_columns(row)

        row['money']=int(row['money'].replace(',','')) 
        row['horse_cnt'] = int(row['rank'].split('/')[1])
        row['result_rank'] = int(row['rank'].split('/')[0])
        row['len'] = int(row['len'][0:4])
        row['popularity'] = int(row['popularity'])
        row['weight'] = int(row['weight'])

        # 　競馬場の一致
        row['same_place'] = 1 if row['place'].startswith(PLACE)  else 0

        # タイム(秒)
        try:
            time = dt.strptime(row['time'], '%M:%S.%f')
            row['sec'] = time.minute*60 + time.second + time.microsecond/1000000 
        except ValueError:
            time = dt.strptime(row['time'], '%S.%f')
            row['sec'] = time.second + time.microsecond/1000000

        row['sec'] = int(row['sec']) 

        df_ = df_.append(row, ignore_index=True)
    return df_

df_list = []
# 取得した出走馬の過去レースをデータフレームに格納
for url_link in hors_page_link_list:
    name, df = horse_data(url_link)
    df = add_race_data(df)
    display(df)
    df_list.append(df)
    

Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,0.0,川崎,6.0,0.0,7/12,7.0,1.0,57.0,1.0,1.0,57.6,529.0,晴/良
1,0.0,18/7/20,12.0,900.0,135000.0,川崎,8.0,0.0,4/12,4.0,1.0,56.0,1.0,1.0,56.8,515.0,晴/良
2,0.0,18/7/2,12.0,900.0,0.0,川崎☆,11.0,0.0,8/12,8.0,1.0,57.0,1.0,1.0,57.5,503.0,晴/良
3,0.0,18/6/13,10.0,1400.0,0.0,川崎,8.0,0.0,10/10,10.0,1.0,100.0,3.0,1.0,1:40.7,501.0,晴/重
4,1.0,17/10/11,10.0,1400.0,0.0,川崎,2.0,0.0,8/10,8.0,1.0,94.0,2.0,0.0,1:34.6,520.0,曇/稍重
5,1.0,17/8/22,10.0,1500.0,160000.0,川崎,5.0,0.0,3/10,3.0,1.0,99.0,2.0,0.0,1:39.6,523.0,曇/稍重
6,0.0,17/8/1,11.0,1400.0,0.0,川崎,3.0,1.0,8/11,8.0,1.0,97.0,3.0,0.0,1:37.0,521.0,雨/重
7,1.0,17/7/24,10.0,1500.0,0.0,船橋,5.0,0.0,9/10,9.0,0.0,103.0,1.0,0.0,1:43.9,515.0,曇/良
8,1.0,17/7/4,11.0,1400.0,96000.0,川崎,4.0,0.0,5/11,5.0,1.0,94.0,1.0,0.0,1:34.2,523.0,曇/良
9,0.0,17/6/23,9.0,1500.0,280000.0,船橋,7.0,0.0,2/9,2.0,0.0,100.0,3.0,1.0,1:40.8,517.0,晴/重


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,0.0,川崎,11.0,0.0,11/12,11.0,1.0,58.0,1.0,1.0,58.6,492.0,晴/良
1,0.0,18/7/2,12.0,900.0,0.0,川崎☆,12.0,0.0,12/12,12.0,1.0,58.0,1.0,1.0,58.0,499.0,晴/良
2,0.0,18/6/12,11.0,1400.0,0.0,川崎☆,10.0,0.0,11/11,11.0,1.0,99.0,3.0,1.0,1:39.3,502.0,晴/重
3,0.0,18/3/1,11.0,900.0,0.0,川崎,11.0,0.0,11/11,11.0,1.0,58.0,4.0,1.0,58.4,506.0,晴/不良
4,0.0,17/10/10,9.0,1400.0,0.0,川崎☆,7.0,0.0,9/9,9.0,1.0,95.0,2.0,1.0,1:35.8,496.0,晴/稍重
5,1.0,17/9/7,12.0,900.0,0.0,川崎,12.0,0.0,12/12,12.0,1.0,59.0,3.0,0.0,59.1,495.0,曇/重
6,0.0,17/4/3,10.0,900.0,0.0,川崎,9.0,0.0,10/10,10.0,1.0,59.0,2.0,1.0,59.4,473.0,晴/稍重
7,0.0,17/3/1,10.0,900.0,0.0,川崎,10.0,0.0,10/10,10.0,1.0,57.0,1.0,1.0,57.3,468.0,晴/良
8,0.0,17/1/2,12.0,900.0,0.0,川崎,9.0,0.0,10/12,10.0,1.0,57.0,1.0,1.0,57.1,475.0,晴/良
9,0.0,16/12/16,9.0,1500.0,0.0,川崎☆,9.0,0.0,9/9,9.0,1.0,104.0,3.0,1.0,1:44.5,480.0,晴/重


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,0.0,川崎,4.0,0.0,10/12,10.0,1.0,58.0,1.0,1.0,58.3,493.0,晴/良
1,1.0,17/7/5,12.0,1600.0,340000.0,川崎☆,3.0,0.0,2/12,2.0,1.0,107.0,3.0,0.0,1:47.5,492.0,曇/重
2,0.0,17/6/14,12.0,1400.0,1000000.0,川崎,1.0,0.0,1/12,1.0,1.0,93.0,3.0,1.0,1:33.4,491.0,晴/重
3,1.0,17/5/16,12.0,1400.0,360000.0,川崎☆,7.0,0.0,3/12,3.0,1.0,92.0,2.0,0.0,1:32.7,485.0,曇/稍重
4,1.0,17/4/6,9.0,1400.0,200000.0,川崎,4.0,0.0,3/9,3.0,1.0,93.0,1.0,0.0,1:33.6,487.0,曇/良
5,0.0,17/1/4,12.0,1400.0,0.0,川崎,8.0,0.0,11/12,11.0,1.0,93.0,1.0,1.0,1:33.3,481.0,晴/良
6,0.0,16/12/15,11.0,900.0,132000.0,川崎☆,9.0,0.0,4/11,4.0,1.0,56.0,4.0,1.0,56.1,482.0,晴/不良
7,0.0,16/11/3,10.0,1400.0,120000.0,川崎,5.0,0.0,4/10,4.0,1.0,94.0,3.0,1.0,1:34.3,483.0,晴/重
8,1.0,16/10/5,11.0,1400.0,180000.0,川崎☆,8.0,0.0,3/11,3.0,1.0,91.0,2.0,0.0,1:31.7,476.0,曇/稍重
9,0.0,16/9/9,12.0,1400.0,0.0,川崎☆,7.0,0.0,8/12,8.0,1.0,92.0,1.0,1.0,1:32.9,479.0,晴/良


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,0.0,川崎,7.0,0.0,6/12,6.0,1.0,57.0,1.0,1.0,57.0,462.0,晴/良
1,0.0,18/8/11,13.0,1000.0,0.0,船橋,7.0,0.0,7/13,7.0,0.0,64.0,2.0,1.0,1:04.3,453.0,晴/稍重
2,0.0,18/7/20,12.0,900.0,225000.0,川崎,4.0,0.0,3/12,3.0,1.0,56.0,1.0,1.0,56.5,456.0,晴/良
3,0.0,18/7/2,12.0,900.0,342000.0,川崎☆,7.0,0.0,2/12,2.0,1.0,56.0,1.0,1.0,56.2,458.0,晴/良
4,0.0,18/6/12,10.0,900.0,0.0,川崎,6.0,0.0,6/10,6.0,1.0,58.0,3.0,1.0,58.0,455.0,晴/重
5,1.0,18/5/30,11.0,800.0,150000.0,浦和,7.0,0.0,4/11,4.0,0.0,48.0,1.0,0.0,48.9,450.0,曇/良
6,0.0,18/5/16,12.0,900.0,0.0,川崎,5.0,0.0,11/12,11.0,1.0,57.0,1.0,1.0,57.9,457.0,晴/良
7,1.0,18/4/24,11.0,800.0,120000.0,浦和,4.0,0.0,4/11,4.0,0.0,49.0,1.0,0.0,49.2,455.0,曇/良
8,0.0,18/4/3,12.0,900.0,0.0,川崎,6.0,0.0,8/12,8.0,1.0,56.0,1.0,1.0,56.9,456.0,晴/良
9,0.0,18/3/21,11.0,1400.0,80000.0,浦和,4.0,1.0,5/11,5.0,0.0,94.0,4.0,0.0,1:34.2,457.0,雨/不良


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,0.0,川崎,9.0,0.0,8/12,8.0,1.0,57.0,1.0,1.0,57.6,478.0,晴/良
1,0.0,18/7/20,12.0,900.0,0.0,川崎,7.0,0.0,9/12,9.0,1.0,57.0,1.0,1.0,57.2,470.0,晴/良
2,0.0,18/7/2,12.0,900.0,0.0,川崎☆,5.0,0.0,6/12,6.0,1.0,57.0,1.0,1.0,57.1,471.0,晴/良
3,0.0,18/6/12,10.0,900.0,108000.0,川崎,3.0,0.0,5/10,5.0,1.0,57.0,3.0,1.0,57.9,477.0,晴/重
4,0.0,18/5/16,12.0,900.0,0.0,川崎,6.0,0.0,10/12,10.0,1.0,57.0,1.0,1.0,57.5,483.0,晴/良
5,1.0,18/5/3,12.0,1500.0,0.0,船橋,5.0,0.0,11/12,11.0,0.0,101.0,2.0,0.0,1:41.9,485.0,曇/稍重
6,0.0,18/4/3,12.0,900.0,0.0,川崎,3.0,0.0,6/12,6.0,1.0,56.0,1.0,1.0,56.6,486.0,晴/良
7,1.0,17/12/15,12.0,1400.0,340000.0,川崎☆,4.0,0.0,2/12,2.0,1.0,92.0,1.0,0.0,1:32.0,481.0,曇/良
8,0.0,17/11/24,12.0,1500.0,0.0,浦和,8.0,0.0,7/12,7.0,0.0,101.0,4.0,1.0,1:41.6,478.0,晴/不良
9,1.0,17/11/8,11.0,900.0,306000.0,川崎,1.0,0.0,2/11,2.0,1.0,56.0,1.0,0.0,56.5,481.0,曇/良


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,342000.0,川崎,8.0,0.0,2/12,2.0,1.0,56.0,1.0,1.0,56.1,433.0,晴/良
1,0.0,18/7/19,12.0,1400.0,0.0,川崎,9.0,0.0,11/12,11.0,1.0,94.0,1.0,1.0,1:34.7,427.0,晴/良
2,0.0,18/7/2,12.0,900.0,0.0,川崎☆,4.0,0.0,11/12,11.0,1.0,57.0,1.0,1.0,57.9,425.0,晴/良
3,0.0,18/6/12,10.0,900.0,342000.0,川崎,4.0,0.0,2/10,2.0,1.0,57.0,3.0,1.0,57.4,424.0,晴/重
4,0.0,18/5/16,12.0,900.0,108000.0,川崎,9.0,0.0,5/12,5.0,1.0,56.0,1.0,1.0,56.5,421.0,晴/良
5,1.0,18/5/2,12.0,1200.0,0.0,船橋,6.0,0.0,7/12,7.0,0.0,78.0,1.0,0.0,1:18.3,405.0,曇/良
6,0.0,18/4/4,11.0,1400.0,120000.0,川崎,7.0,0.0,4/11,4.0,1.0,94.0,1.0,1.0,1:34.2,418.0,晴/良
7,0.0,18/2/27,12.0,1400.0,0.0,川崎,11.0,0.0,8/12,8.0,1.0,94.0,1.0,1.0,1:34.1,413.0,晴/良
8,0.0,18/1/11,8.0,1400.0,0.0,浦和,6.0,0.0,7/8,7.0,0.0,95.0,2.0,1.0,1:35.1,410.0,晴/稍重
9,0.0,17/12/20,11.0,1400.0,0.0,浦和,4.0,0.0,8/11,8.0,0.0,94.0,2.0,1.0,1:34.5,426.0,晴/稍重


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,0.0,川崎,10.0,0.0,12/12,12.0,1.0,59.0,1.0,1.0,59.0,457.0,晴/良
1,0.0,18/3/16,14.0,1600.0,0.0,船橋☆,9.0,1.0,14/14,14.0,0.0,109.0,3.0,0.0,1:49.7,453.0,雨/重
2,1.0,18/2/26,12.0,900.0,0.0,川崎,1.0,0.0,12/12,12.0,1.0,58.0,1.0,0.0,58.5,452.0,曇/良
3,1.0,18/2/1,12.0,900.0,900000.0,川崎,4.0,0.0,1/12,1.0,1.0,55.0,2.0,0.0,55.9,459.0,曇/稍重
4,0.0,17/12/14,12.0,1400.0,0.0,川崎,7.0,0.0,9/12,9.0,1.0,94.0,1.0,1.0,1:34.6,464.0,晴/良
5,1.0,17/11/8,11.0,1400.0,0.0,川崎,4.0,0.0,8/11,8.0,1.0,94.0,1.0,0.0,1:34.5,473.0,曇/良
6,0.0,17/9/2,18.0,1000.0,0.0,新潟,7.0,0.0,6/18,6.0,0.0,56.0,1.0,1.0,56.3,464.0,晴/良
7,1.0,17/8/12,18.0,1000.0,500000.0,新潟,16.0,0.0,5/18,5.0,0.0,57.0,2.0,0.0,57.0,466.0,曇/稍重
8,1.0,17/4/15,16.0,1200.0,0.0,福島,15.0,0.0,13/16,13.0,0.0,72.0,1.0,0.0,1:12.0,458.0,曇/良
9,0.0,17/1/22,16.0,1600.0,0.0,中山,10.0,0.0,14/16,14.0,0.0,98.0,1.0,1.0,1:38.6,464.0,晴/良


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,225000.0,川崎,3.0,0.0,3/12,3.0,1.0,56.0,1.0,1.0,56.3,439.0,晴/良
1,0.0,18/7/20,12.0,900.0,0.0,川崎,3.0,0.0,6/12,6.0,1.0,56.0,1.0,1.0,56.9,436.0,晴/良
2,0.0,18/7/2,12.0,900.0,225000.0,川崎☆,2.0,0.0,3/12,3.0,1.0,56.0,1.0,1.0,56.5,433.0,晴/良
3,0.0,17/10/11,8.0,1000.0,20000.0,門別,3.0,1.0,4/8,4.0,0.0,63.0,3.0,0.0,1:03.0,438.0,雨/重
4,1.0,17/9/28,11.0,1000.0,30000.0,門別,5.0,0.0,3/11,3.0,0.0,61.0,4.0,0.0,1:01.3,438.0,曇/不良
5,0.0,17/8/31,7.0,1000.0,20000.0,門別,5.0,0.0,4/7,4.0,0.0,61.0,2.0,1.0,1:01.9,436.0,晴/稍重
6,1.0,17/8/17,10.0,1000.0,10000.0,門別,4.0,0.0,5/10,5.0,0.0,62.0,3.0,0.0,1:02.4,442.0,曇/重
7,1.0,17/8/3,7.0,1000.0,10000.0,門別,2.0,0.0,5/7,5.0,0.0,65.0,1.0,0.0,1:05.0,442.0,曇/良
8,1.0,17/7/20,10.0,1000.0,0.0,門別,4.0,0.0,8/10,8.0,0.0,64.0,1.0,0.0,1:04.6,446.0,曇/良
9,0.0,17/7/6,11.0,1000.0,40000.0,門別,2.0,0.0,2/11,2.0,0.0,62.0,2.0,1.0,1:02.7,444.0,晴/稍重


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,0.0,川崎,12.0,0.0,9/12,9.0,1.0,57.0,1.0,1.0,57.6,488.0,晴/良
1,0.0,18/7/20,12.0,900.0,0.0,川崎,12.0,0.0,10/12,10.0,1.0,57.0,1.0,1.0,57.3,484.0,晴/良
2,1.0,18/7/6,8.0,1400.0,0.0,川崎,7.0,0.0,8/8,8.0,1.0,95.0,4.0,0.0,1:35.4,485.0,曇/不良
3,1.0,18/6/14,9.0,1400.0,0.0,川崎☆,9.0,0.0,6/9,6.0,1.0,95.0,2.0,0.0,1:35.1,488.0,曇/稍重
4,0.0,18/5/18,12.0,1400.0,0.0,川崎,8.0,0.0,12/12,12.0,1.0,96.0,1.0,1.0,1:36.1,487.0,晴/良
5,0.0,18/5/4,14.0,1000.0,0.0,船橋,8.0,0.0,7/14,7.0,0.0,64.0,2.0,1.0,1:04.1,482.0,晴/稍重
6,1.0,18/4/27,12.0,1400.0,0.0,浦和,12.0,0.0,9/12,9.0,0.0,93.0,2.0,0.0,1:33.0,489.0,曇/稍重
7,0.0,18/4/6,10.0,1400.0,0.0,川崎,6.0,0.0,6/10,6.0,1.0,95.0,1.0,1.0,1:35.1,493.0,晴/良
8,0.0,18/3/23,11.0,1500.0,80000.0,浦和,11.0,0.0,5/11,5.0,0.0,100.0,3.0,1.0,1:40.9,494.0,晴/重
9,0.0,18/3/2,12.0,1400.0,0.0,川崎,11.0,0.0,11/12,11.0,1.0,95.0,3.0,1.0,1:35.7,495.0,晴/重


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,108000.0,川崎,5.0,0.0,5/12,5.0,1.0,56.0,1.0,1.0,56.7,491.0,晴/良
1,0.0,18/7/20,12.0,900.0,0.0,川崎,6.0,0.0,7/12,7.0,1.0,57.0,1.0,1.0,57.1,483.0,晴/良
2,0.0,18/7/2,12.0,900.0,0.0,川崎☆,8.0,0.0,9/12,9.0,1.0,57.0,1.0,1.0,57.5,495.0,晴/良
3,0.0,16/5/23,12.0,900.0,0.0,川崎☆,5.0,0.0,7/12,7.0,1.0,56.0,1.0,1.0,56.4,483.0,晴/良
4,0.0,16/5/9,11.0,900.0,0.0,川崎☆,1.0,1.0,7/11,7.0,1.0,56.0,2.0,0.0,56.0,487.0,雨/稍重
5,1.0,16/4/1,12.0,900.0,0.0,川崎☆,3.0,0.0,7/12,7.0,1.0,57.0,1.0,0.0,57.1,483.0,曇/良
6,0.0,16/3/3,11.0,900.0,900000.0,川崎,3.0,0.0,1/11,1.0,1.0,55.0,1.0,1.0,55.7,489.0,晴/良
7,0.0,16/1/25,12.0,900.0,0.0,川崎,2.0,0.0,7/12,7.0,1.0,57.0,1.0,1.0,57.0,494.0,晴/良
8,0.0,16/1/4,12.0,900.0,270000.0,川崎,4.0,0.0,2/12,2.0,1.0,55.0,1.0,1.0,55.6,490.0,晴/良
9,0.0,15/10/22,12.0,900.0,90000.0,川崎,4.0,0.0,5/12,5.0,1.0,57.0,1.0,1.0,57.0,480.0,晴/良


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,135000.0,川崎,1.0,0.0,4/12,4.0,1.0,56.0,1.0,1.0,56.4,482.0,晴/良
1,0.0,18/7/20,12.0,900.0,342000.0,川崎,2.0,0.0,2/12,2.0,1.0,56.0,1.0,1.0,56.4,473.0,晴/良
2,1.0,18/2/26,12.0,900.0,900000.0,川崎,2.0,0.0,1/12,1.0,1.0,55.0,1.0,0.0,55.9,479.0,曇/良
3,0.0,18/1/1,12.0,1400.0,112000.0,川崎,1.0,0.0,4/12,4.0,1.0,94.0,1.0,1.0,1:34.6,476.0,晴/良
4,0.0,17/12/14,11.0,1400.0,0.0,川崎,1.0,0.0,6/11,6.0,1.0,93.0,1.0,1.0,1:33.1,483.0,晴/良
5,1.0,17/2/18,16.0,1400.0,0.0,京都,6.0,0.0,9/16,9.0,0.0,86.0,2.0,0.0,1:26.8,468.0,曇/稍重
6,1.0,17/2/5,11.0,1600.0,600000.0,東京,5.0,0.0,5/11,5.0,0.0,101.0,1.0,0.0,1:41.6,476.0,曇/良


Unnamed: 0,cloudy,date,horse_cnt,len,money,place,popularity,rainny,rank,result_rank,same_place,sec,soil,sunny,time,weight,wether
0,0.0,18/8/23,12.0,900.0,900000.0,川崎,2.0,0.0,1/12,1.0,1.0,55.0,1.0,1.0,55.4,436.0,晴/良
1,0.0,18/7/19,12.0,1400.0,200000.0,川崎,4.0,0.0,3/12,3.0,1.0,92.0,1.0,1.0,1:32.1,436.0,晴/良
2,0.0,18/2/1,11.0,1400.0,160000.0,川崎,3.0,1.0,3/11,3.0,1.0,92.0,2.0,0.0,1:32.3,419.0,雨/稍重
3,0.0,18/1/3,11.0,1400.0,112000.0,川崎,4.0,0.0,4/11,4.0,1.0,94.0,1.0,1.0,1:34.6,418.0,晴/良
4,0.0,17/12/11,12.0,1500.0,0.0,川崎,6.0,0.0,6/12,6.0,1.0,100.0,1.0,1.0,1:40.6,427.0,晴/良
5,0.0,17/11/8,12.0,1200.0,40000.0,門別,3.0,1.0,2/12,2.0,0.0,75.0,3.0,0.0,1:15.4,430.0,雨/重
6,0.0,17/11/1,7.0,1200.0,30000.0,門別,4.0,1.0,3/7,3.0,0.0,74.0,4.0,0.0,1:14.5,436.0,雨/不良
7,0.0,17/10/19,9.0,1200.0,30000.0,門別,1.0,0.0,3/9,3.0,0.0,76.0,2.0,1.0,1:16.5,436.0,晴/稍重
8,0.0,17/10/5,10.0,1200.0,40000.0,門別,3.0,0.0,2/10,2.0,0.0,77.0,1.0,1.0,1:17.2,430.0,晴/良
9,1.0,17/9/20,9.0,1200.0,40000.0,門別,3.0,0.0,2/9,2.0,0.0,76.0,3.0,0.0,1:16.3,434.0,曇/重


In [15]:
# 該当のレース結果データを取得
def result_data(url):
    soup = url_to_soup(url)

    # 土の状態
    condition = soup.find(id="race-data02").get_text().replace('\n','').split(';')[1].split('　')[2][0:2]

    # レースの長さ
    race_len = int(soup.find(id="race-data01-a").get_text().replace('\n','').split('　')[3].replace(',','')[1:4])

    # 1位の馬番
    p = re.compile('<td class="al-center">')
    hukusyo_list = []
    hukusyo_list.append(int(p.sub("", str(soup.find_all('tr', class_='bg-1chaku')[0]).split('</td>')[2]).replace('\n','') ))

    # レース日
    race_date_str = soup.find(id="race-data01-a").get_text().replace('\n','').split(';')[0].split('日')[0]
    race_date = dt.strptime(race_date_str, '%Y年%m月%d')
    return hukusyo_list, condition, race_len, race_date

a, b, c, d = result_data('https://www.nankankeiba.com/result/2018082321060403.do')
print(d)
print(a)
print(b)
print(c)
# df = horse_data('https://www.nankankeiba.com/result/2018082321060403.do', d)
# df = add_race_data(df)

2018-08-23 00:00:00
[12]
不良
900


In [16]:
def add_grade(df):
    df_grade =pd.DataFrame()
    for idx, row in df.iterrows():
        if row['rank'] == '':
            continue
        row = add_wether_columns(row)
        horse_cnt = row['popularity'].split('/')[0]
        popularity = row['popularity'].split('/')[1]
        # 出走頭数 + ランク()
        row['grade'] = int(horse_cnt) - (int(row['rank']) - 1) + int(popularity)/4 + int(row[TODAY_WETHER]) * 1
        df_grade = df_grade.append(row, ignore_index=True)
    return df_grade

In [23]:
def horse_index(url):
    soup = url_to_soup(url)
    pre_race_data = get_previous_race_row(soup)
    df = pd.DataFrame(pre_race_data)[1:][[10,11,13,14]].dropna().rename(columns={10:'len', 11:'wether', 13:'rank', 14:'popularity'})
    df = add_grade(df)
    print(df)
    horse_name = soup.find('h2', id='tl-prof').get_text()
    return horse_name, df['grade'].mean()

In [18]:
def ture_data(url):
    soup = url_to_soup(url)
    return p.sub("", str(soup.find_all('tr', class_='bg-1chaku')[0]).split('</td>')[3]).replace('\n','')

In [19]:
def main(race_url):
    links = horse_page_link(race_url)
    for link in links:
        horse_name,  grade = horse_index(link)
        print(horse_name+ ':  ' + str(grade))

In [20]:
def pre_race_analysis(race_page):
    links = horse_page_link(race_page)
    max_grade = 0
    first_horse = ''
    for link in links:
        horse_name,  grade = horse_index(link)
        if grade > max_grade:
            max_grade = grade
            first_horse = horse_name
    
    result = ture_data(race_page.replace('race_info', 'result'))
    
    print('-------------------------------------')
    if first_horse == result:
        print('*********** correct !! ***********')
    print(f'predict -> {first_horse} : {max_grade}')
    print(f'true -> {result}')
    print('-------------------------------------')

In [24]:
TODAY_WETHER = 'rainny'
race_page ='https://www.nankankeiba.com/race_info/2018052320040310.do'
main(race_page)

[['', '', '年月日', '競馬場', 'R', 'レース名', '距離', '天候馬場', '馬番', '人気', '着順', 'タイム', '差/事故', '上3F', 'コーナー通過順', '体重', '騎手', '負担重量', '調教師', '獲得賞金（円）', ''], ['', '', '18/8/12', '大井☆', '9', '', '', '大井のＳＰＡＴ４プレミアムポイント賞 Ｂ２(三)', '', '', '1600', '曇/稍重', '6', '7', '10/11', '1:43.8', '2.8', '41.1', '5-5-6', '479', '的場文', '56.0', '納谷和', '0', ''], ['', '', '18/7/9', '大井☆', '11', '', '', 'オフト後楽園「ラウンジセブン」賞 Ｂ２(三)', '', '', '1800', '晴/良', '15', '5', '15/16', '1:58.2', '3.9', '43.3', '7-9-11', '486', '的場文', '56.0', '納谷和', '0', ''], ['', '', '18/6/27', '大井☆', '12', '', '', '明日もエキサイティングリレー賞 Ｂ２(二)', '', '', '1600', '晴/良', '3', '3', '11/14', '1:43.7', '2.4', '40.6', '5-5-6', '489', '御神訓', '56.0', '納谷和', '0', ''], ['', '', '18/5/23', '大井☆', '10', '', '', '新緑賞 Ｂ３(二)', '', '', '1600', '雨/良', '1', '4', '1/15', '1:41.4', '0.7', '39.1', '1-1-1', '483', '的場文', '56.0', '納谷和', '3,000,000', ''], ['', '', '18/4/19', '大井☆', '12', '', '', '北極星賞 Ｂ３(一)', '', '', '1600', '晴/重', '1', '7', '4/15', '1:43.6', '1.1', '39.5', '7-6-6', '