[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Kaggle-runa/MameLand_vol3/blob/main/src/notebook/04_%E6%96%B0%E8%A6%8F%E3%83%87%E3%83%BC%E3%82%BF%E3%81%A7%E3%81%AE%E4%BA%88%E6%B8%AC.ipynb
)

In [None]:
!pip install pyppeteer nest_asyncio

In [2]:
import asyncio
import re
import numpy as np
import pandas as pd
from pyppeteer import launch
from tqdm.notebook import tqdm
from urllib.request import urlopen
import dataclasses
import nest_asyncio
import joblib
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb

In [3]:
# google driveへのマウント
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


データはGoogle Driveの競馬分析/dataレポジトリにあることを想定しています。  
自分のフォルダ構成に応じてデータのパスを適宜変更して下さい。


- 競馬分析/
  - data/  # 分析に使う生データ
  - feature_data/  # 02_データの前処理.ipynbで作成した生データを加工したデータ
  - simulation_data/ # 05_馬券の購入シミュレーション(ワイド・複勝).ipynbで利用する回収率を計算するためのデータ
  - notebooks/  # 競馬分析を行うnotebook
    - 00_データのスクレイピング.ipynb
    - 01_競馬データ可視化.ipynb
    - 02_データの前処理.ipynb
    - 03_モデルの学習.ipynb
    - 04_新規データでの予測.ipynb
    - 05_馬券の購入シミュレーション(ワイド・複勝).ipynb
  - model/  # 作成したモデルを格納するレポジトリ

## 予測対象レースの出馬表を取得

In [21]:
nest_asyncio.apply()

@dataclasses.dataclass(frozen=True)
class UrlPaths:
    # 出馬表ページ
    RACE_CARD_URL: str = 'https://race.netkeiba.com/race/shutuba.html'

async def extract_horse_jockey_trainer_data(page):
    rows = await page.querySelectorAll('.HorseList')
    data = []

    for row in rows:
        columns = await row.querySelectorAll('td')
        row_data = []

        for column in columns:
            class_name = await page.evaluate('(element) => element.getAttribute("class")', column)
            if class_name in ['HorseInfo']:
                href = await column.querySelectorEval('a', '(element) => element.getAttribute("href")')
                row_data.append(re.findall(r'horse/(\d*)', href)[0])
            elif class_name in ['Jockey']:
                href = await column.querySelectorEval('a', '(element) => element.getAttribute("href")')
                row_data.append(re.findall(r'jockey/result/recent/(\w*)', href)[0])
            elif class_name in ['Trainer']:
                href = await column.querySelectorEval('a', '(element) => element.getAttribute("href")')
                row_data.append(re.findall(r'trainer/result/recent/(\w*)', href)[0])
            row_data.append(await page.evaluate('(element) => element.textContent', column))
        data.append(row_data)
    return data

async def extract_race_info(page):
    race_data = await page.querySelector('.RaceList_Item02')
    race_text = await page.evaluate('(element) => element.textContent', race_data)
    texts = re.findall(r'\w+', race_text)

    text_patterns = {
        '0m': ('course_length', lambda x: int(re.findall(r'\d+', x)[-1])),
        '晴': ('weather', '晴'),
        '曇': ('weather', '曇'),
        '雨': ('weather', '雨'),
        '良': ('ground_condition', '良'),
        '稍重': ('ground_condition', '稍重'),
        '重': ('ground_condition', '重'),
        '不良': ('ground_condition', '不良'),
        '芝': ('race_type', '芝'),
        'ダ': ('race_type', 'ダート'),
        '障': ('race_type', '障害'),
        '右': ('race_turn', '右'),
        '左': ('race_turn', '左'),
        '直線': ('race_turn', '直線'),
        '札幌': ('location', '札幌'),
        '函館': ('location', '函館'),
        '福島': ('location', '福島'),
        '新潟': ('location', '新潟'),
        '東京': ('location', '東京'),
        '中山': ('location', '中山'),
        '中京': ('location', '中京'),
        '京都': ('location', '京都'),
        '阪神': ('location', '阪神'),
        '小倉': ('location', '小倉'),
    }

    race_info = {}
    race_title = texts[0]
    hurdle_race_flg = False

    for text in texts:
        for pattern, (key, value) in text_patterns.items():
            if pattern in text:
                if callable(value):
                    race_info[key] = [value(text)]
                else:
                    race_info[key] = [value]
                if pattern == '障':
                    hurdle_race_flg = True

    return race_info, race_title, hurdle_race_flg

def process_horse_jockey_trainer_data(data):
    df = pd.DataFrame(data)
    df = df[[0, 1, 4, 5, 6, 12, 13, 11, 3, 7, 8, 9, 10]]
    df.columns = ['frame_number', 'horse_number', 'horse_name', 'sex_age', 'carried_weight', 'odds', 'popularity', 'horse_weight', 'horse_id', 'jockey_id', 'jockey', 'trainer_id', 'trainer']
    return df

async def scraping_race_card(race_id, race_date, RACE_CARD_URL):

    # ブラウザ起動
    browser = await launch(
        headless=True,
        args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-zygote',
            '--single-process',
            '--disable-gpu',
        ],
    )
    print('Launched browser.')

    try:
        query = [
            'race_id=' + str(race_id)
        ]
        url = RACE_CARD_URL + '?' + '&'.join(query)
        print(f'scraping: {url}')

        # 出走表Webページへアクセス
        page = await browser.newPage()
        await page.goto(url, {'timeout': 180000})
        await page.waitForSelector('.HorseList', {'visible': True})

        data = await extract_horse_jockey_trainer_data(page)
        race_info, race_title, hurdle_race_flg = await extract_race_info(page)
        print(f'race_info: {race_info}')
        print(f'race_title: {race_title}')
        print(f'hurdle_race_flg: {hurdle_race_flg}')
    except asyncio.TimeoutError:
        print("Timeout error!")
        return None
    except Exception as e:
        print(e)
    finally:
        # ページを閉じる
        if 'page' in locals():  # pageが定義されている場合のみcloseする
            await page.close()

    await browser.close()
    print('Closed browser.')

    return data, race_info, race_title, hurdle_race_flg

In [22]:
# 予測対象レースID
race_id = "202405050101"
# レース日
race_date = "2024-11-02"

async def main():
    data, race_info, race_title, hurdle_race_flg = await scraping_race_card(race_id, race_date, UrlPaths.RACE_CARD_URL)

    df = process_horse_jockey_trainer_data(data)
    df['race_id'] = race_id
    df['race_title'] = race_title
    df['event_date'] = [race_date] * len(df)

    for key, value in race_info.items():
        df[key] = value * len(df)
    if hurdle_race_flg:
        df["race_turn"] = ['障害'] * len(df)

    # 列の並び替え
    new_order = [
        'race_id', 'event_date', 'location', 'race_title', 'race_type', 'race_turn',
        'course_length', 'weather', 'ground_condition', 'frame_number',
        'horse_number', 'horse_id', 'horse_name', 'sex_age', 'carried_weight',
        'jockey_id', 'jockey', 'odds', 'popularity',
        'horse_weight', 'trainer'
    ]
    df = df[new_order]

    # 改行タグを削除
    df['horse_name'] = df['horse_name'].str.replace(r'[\n\t]', '', regex=True)
    df['jockey'] = df['jockey'].str.replace(r'[\n\t]', '', regex=True)
    df['popularity'] = df['popularity'].str.replace(r'[\n\t]', '', regex=True)
    df['horse_weight'] = df['horse_weight'].str.replace(r'[\n\t]', '', regex=True)

    return df

df = asyncio.get_event_loop().run_until_complete(main())
print(df)

Launched browser.
scraping: https://race.netkeiba.com/race/shutuba.html?race_id=202405050101
race_info: {'course_length': [1300], 'race_type': ['ダート'], 'race_turn': ['左'], 'weather': ['雨'], 'ground_condition': ['重'], 'location': ['東京']}
race_title: 2歳未勝利
hurdle_race_flg: False
Closed browser.
         race_id  event_date location race_title race_type race_turn  \
0   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
1   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
2   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
3   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
4   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
5   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
6   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
7   202405050101  2024-11-02       東京      2歳未勝利       ダート         左   
8   202405050101  2024-11-02       東京      2歳未勝利       ダート

## 出馬表をAIモデルが予測できる形に加工
- 02_データの前処理.ipynbで行ったデータの前処理を関数化し、一括して実行します。
- ただし一部処理が異なる部分があるので注意(timeとdifferenceのカラムの削除は不要など)

In [29]:
def preprocess_race_data(race_result, encoder_path="/content/drive/MyDrive/競馬分析/model/ordinal_encoder.pkl"):

    # 性齢
    race_result_sex = race_result['sex_age'].str.extract('([牝牡セ])(\d+)', expand=True)
    race_result['sex'] = race_result_sex.loc[:, 0]
    race_result['age'] = race_result_sex.loc[:, 1].astype(int)
    race_result = race_result.drop(['sex_age'], axis=1)

    # 馬体重(増減)
    race_result_weight = race_result['horse_weight'].str.extract('(\d{3}).([+-0]\d*)', expand=True)
    race_result['horse_weight'] = race_result_weight.loc[:, 0].fillna(0).astype(int)
    race_result['weight_gain_loss'] = race_result_weight.loc[:, 1].str.replace('\+', '', regex=True).fillna(0).astype(int)
    race_result = race_result.drop(['horse_weight'], axis=1)

    # 調教師
    race_result_trainer = race_result['trainer'].str.extract(r'\[(.)\] (.+)', expand=True)
    race_result['trainer_region'] = race_result_trainer.loc[:, 0]
    race_result['trainer_name'] = race_result_trainer.loc[:, 1]
    race_result = race_result.drop(['trainer'], axis=1)

    # コース距離の列名修正
    race_result = race_result.rename(columns={'course_length': 'course_len'})

    # レースの出走頭数を算出
    race_result['horse_count'] = len(race_result)

    # race_titleに基づいてrace_gradeカラムを追加
    grade_list = ['新馬', '未勝利', '1勝', '2勝', '3勝', 'OP', 'L', 'GI', 'GII', 'GIII', 'JGI', 'JGII', 'JGIII', 'オープン']

    def get_race_grade(title):
        for grade in grade_list:
            if grade in title:
                if grade == 'オープン':
                    return '障害オープン'
                return grade
        return None

    race_result['race_grade'] = race_result['race_title'].apply(get_race_grade)

    # オッズの処理
    race_result['odds'] = race_result['odds'].replace('---', 999).astype(float)

    # 余計な列の削除
    race_result = race_result.drop(['horse_name', 'jockey'], axis=1)

    # 日付の処理
    race_result[['year', 'month', 'day']] = race_result['event_date'].str.split('-', expand=True).astype(int)
    race_result['event_date'] = pd.to_datetime(race_result['event_date'])

    # 指定された列をint型・float型に変換
    int_columns = ['race_id', 'frame_number', 'horse_number', 'horse_id', 'jockey_id']
    float_columns = ['carried_weight', 'popularity']
    race_result[int_columns] = race_result[int_columns].astype(int)
    race_result[float_columns] = race_result[float_columns].astype(float)

    # 残りのobject型をOrdinalEncoderにかける
    categorical_columns = list(race_result.select_dtypes(include=object).columns)
    race_result[categorical_columns] = race_result[categorical_columns].astype(str)

    ordinal_encoder = joblib.load(encoder_path)
    race_result[categorical_columns] = ordinal_encoder.transform(race_result[categorical_columns])

    race_result = race_result.sort_values(by=['event_date', 'race_id', 'horse_number'], ascending=[True, True, True])

    return race_result

In [30]:
# 前処理の実行
preprocess_df = preprocess_race_data(df)

In [31]:
preprocess_df.columns

Index(['race_id', 'event_date', 'location', 'race_title', 'race_type',
       'race_turn', 'course_len', 'weather', 'ground_condition',
       'frame_number', 'horse_number', 'horse_id', 'carried_weight',
       'jockey_id', 'odds', 'popularity', 'sex', 'age', 'weight_gain_loss',
       'trainer_region', 'trainer_name', 'horse_count', 'race_grade', 'year',
       'month', 'day'],
      dtype='object')

## モデルの予測

In [32]:
# モデルのロード
loaded_model = lgb.Booster(model_file='/content/drive/MyDrive/競馬分析/model/final_model.txt')

# トレーニング時の特徴量リストを取得
train_features = loaded_model.feature_name()

# 予測データのカラムをトレーニング時の特徴量に合わせて調整
target_data = preprocess_df[train_features]

# ロードしたモデルで予測を実行
y_pred_loaded = loaded_model.predict(target_data, num_iteration=loaded_model.best_iteration)

# 予測結果を二値クラスに変換
pred_labels = (y_pred_loaded >= np.sort(y_pred_loaded)[-3]).astype(int)
target_data['y_pred_loaded'] = y_pred_loaded
target_data['pred_labels'] = pred_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data['y_pred_loaded'] = y_pred_loaded


In [34]:
# モデルが推奨する馬を確認
# pred_labelsが1の馬がAIモデルが３着以内に入ると推定しているものとなる

data = pd.concat([df, target_data], axis=1)
data[['horse_name', 'pred_labels']]

Unnamed: 0,horse_name,pred_labels
0,ハピハピハッピー,0
1,ショウナンラリー,1
2,パーティーガール,0
3,ヤスエ,0
4,シュガープリンセス,0
5,ハコダテサンバ,0
6,セイウングレイ,0
7,ミスターヨッシャー,0
8,コンスピラシー,0
9,フリッカージャブ,1
