# 説明

- _start_dateと_end_dateで指定した期間のレースを取得します。
- 次に、レース出走馬のプロフィール、過去戦績、血統情報を取得します。
- 最後に、レースに含まれる騎手と調教師のプロフィールを取得します。

GUIで処理を行いたいときは`python st.py`を実行してください。

In [None]:
%reload_ext autoreload
%autoreload 2
import app
import datetime
from tqdm import tqdm
import pickle

In [2]:
# Complete date range of races to collect; both dates are passed to get_race_ids().
_start_date = datetime.date(2020, 1, 1)
_end_date = datetime.date(2020, 12, 31)

In [3]:
def save_cache(data, name):
    """Pickle ``data`` into ``cache/<name>.pkl``, replacing any existing file.

    The ``cache`` directory is not created here, so it must already exist
    relative to the current working directory.
    """
    target_path = f'cache/{name}.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(data, sink)

def load_cache(name):
    """Return the object unpickled from ``cache/<name>.pkl``.

    Any error raised by ``open`` (e.g. the file does not exist) propagates.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at cache files written by this project itself.
    """
    source_path = f'cache/{name}.pkl'
    with open(source_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [4]:
def cache_data(func):
    """Decorator that memoizes ``func`` in an in-memory dict.

    Results are keyed on both the positional and the keyword arguments, so
    two calls that differ only in a keyword argument are cached separately.
    All arguments must be hashable; an unhashable argument raises
    ``TypeError`` on lookup. The cache lives as long as the returned wrapper
    and is never evicted.

    Args:
        func: The callable to memoize.

    Returns:
        A wrapper with the same call signature (and metadata) as ``func``.
    """
    import functools  # local import keeps this cell independent of the import cell

    cache = {}

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Keyword names are unique, so sorting the items only ever compares
        # the names and gives an order-independent, hashable key.
        key = (args, tuple(sorted(kwargs.items())))
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return wrapper

# Get all 1R ids

In [5]:
@cache_data
def get_race_ids(start_date, end_date):
    """Memoized pass-through to ``app.get_race_ids`` for the given date range."""
    fetched_ids = app.get_race_ids(start_date, end_date)
    return fetched_ids

In [6]:
# Fetch the race ids for the configured date range (get_race_ids is wrapped by cache_data).
race_ids = get_race_ids(_start_date, _end_date)

# Get all race ids

In [7]:
cached_race_id_list = {}
def cache_id_list(cache_id_list: dict, race_ids: list):
    """
    Expand every id in ``race_ids`` into its local race ids, memoized in ``cache_id_list``.

    For each id that is not yet a key of ``cache_id_list``,
    ``app.get_local_race_ids`` is called and its result is stored in the dict.
    The dict is mutated in place, so results gathered before an error are kept
    and can still be saved by the caller.

    Args:
        cache_id_list: Mapping of race id -> list of local race ids; updated in place.
        race_ids: Race ids to expand.

    Returns:
        A flat list with the local race ids of every entry in ``race_ids``,
        in input order.
    """
    driver = app.get_driver()
    race_id_list = []
    try:
        for race_id in tqdm(race_ids):
            if race_id not in cache_id_list:
                # get_local_race_ids returns a driver as well; keep using the
                # one it hands back for the following requests.
                local_race_ids, driver = app.get_local_race_ids(driver, race_id)
                cache_id_list[race_id] = local_race_ids
            race_id_list.extend(cache_id_list[race_id])
    finally:
        # Release the browser even when a request fails part-way through.
        driver.close()
        driver.quit()
    return race_id_list

In [8]:
# Restore the previously saved mapping from cache/cached_race_id_list.pkl.
cached_race_id_list = load_cache("cached_race_id_list")

In [None]:
# Build the flat list of race ids; ids missing from cached_race_id_list are fetched and added to it.
race_id_list = cache_id_list(cached_race_id_list, race_ids)

In [10]:
# Persist the (possibly extended) mapping to cache/cached_race_id_list.pkl.
save_cache(cached_race_id_list, "cached_race_id_list")

# Update database
### pre race & shutuba

In [11]:
cached_exists_race_id_list = []
def cache_upsert_race(cached_data: list, race_id_list: list):
    """
    Upsert pre-race and shutuba data for every race id not already recorded in ``cached_data``.

    Each id that is processed successfully is appended to ``cached_data``
    (the list passed in, mutated in place), so an interrupted run can be
    resumed by passing the same list again.

    Args:
        cached_data: Race ids that were already processed; extended in place.
        race_id_list: Race ids to process.
    """
    driver = app.get_driver()
    mongo = app.get_mongo_client()
    # Set copy for O(1) membership tests; kept in sync with cached_data below.
    already_done = set(cached_data)
    try:
        for race_id in tqdm(race_id_list):
            if race_id in already_done:
                continue
            driver = app.upsert_pre_race_shutuba(driver, mongo, race_id, force=True)
            # Record on the list the caller passed in, not on a module global.
            cached_data.append(race_id)
            already_done.add(race_id)
    finally:
        # Release the browser and the DB connection even if an upsert raised.
        try:
            driver.close()
            driver.quit()
        finally:
            mongo.close()

In [12]:
# Restore the list of already-processed race ids from cache/cached_exists_race_id_list.pkl.
cached_exists_race_id_list = load_cache("cached_exists_race_id_list")

In [None]:
# TODO: fill these in later

# 202148051910 was skipped because a trainer ID is missing
# https://nar.netkeiba.com/race/shutuba.html?race_id=202148051910
# https://db.netkeiba.com/race/202148051910/

# 202148040711 was skipped because a trainer ID is missing
# https://nar.netkeiba.com/race/shutuba.html?race_id=202148040711

# 202042102210 was skipped because a trainer ID is missing
# https://nar.netkeiba.com/race/shutuba.html?race_id=202042102210

# 202035092809 was skipped because a trainer ID is missing
# https://nar.netkeiba.com/race/shutuba.html?race_id=202035092809

# 202050031808 was skipped because a trainer ID is missing
# https://nar.netkeiba.com/race/shutuba.html?race_id=202050031808

In [20]:
# update pre race & shutuba
# The chained slices compose: [3162:][1458:][6460:] selects the same items as [11080:],
# so only ids from index 11080 onward are processed
# (presumably offsets accumulated from earlier partial runs -- TODO confirm).
cache_upsert_race(cached_exists_race_id_list, race_id_list[3162:][1458:][6460:])

100%|██████████| 2266/2266 [1:22:14<00:00,  2.18s/it]


In [21]:
# Persist the list of processed race ids to cache/cached_exists_race_id_list.pkl.
save_cache(cached_exists_race_id_list, "cached_exists_race_id_list")

### get horse_id_list

In [16]:
cached_race_id_for_horse_id_list = {}
def cache_horse_id_list_by_race_id(race_id_list: list, cached_data: dict = None):
    """
    Collect the distinct horse ids of the given races, memoizing lookups per race id.

    For each race id missing from ``cached_data``, ``app.find_horse_ids`` is
    called and its result stored in the dict (mutated in place).

    Args:
        race_id_list: Race ids whose horse ids are wanted.
        cached_data: Mapping of race id -> horse ids. When omitted, the
            module-level ``cached_race_id_for_horse_id_list`` is used.

    Returns:
        A list of unique horse ids; the order is not defined.
    """
    if cached_data is None:
        # Look the global up at call time. A default bound in the signature is
        # evaluated once at definition time and would keep pointing at the
        # original empty dict after the global is rebound to a loaded cache.
        cached_data = cached_race_id_for_horse_id_list
    unique_horse_ids = set()
    for race_id in tqdm(race_id_list):
        if race_id not in cached_data:
            cached_data[race_id] = app.find_horse_ids(race_id)
        unique_horse_ids.update(cached_data[race_id])
    return list(unique_horse_ids)

In [17]:
# Restore the race id -> horse ids mapping from cache/cached_race_id_for_horse_id_list.pkl.
# Note that this rebinds the global name to a new object.
cached_race_id_for_horse_id_list = load_cache("cached_race_id_for_horse_id_list")

In [18]:
# race_id -> horse_id
# NOTE(review): cached_data is not passed, so this call relies on the function's default
# cache; verify that it is the same dict object that is saved on the next line.
horse_id_list = cache_horse_id_list_by_race_id(race_id_list)
save_cache(cached_race_id_for_horse_id_list, "cached_race_id_for_horse_id_list")

100%|██████████| 12842/12842 [05:33<00:00, 38.50it/s]


In [3]:
# all horse_id
# Alternative source for horse_id_list: take every horse id via FindData.get_all_horse_ids().
# This overwrites any horse_id_list computed from race ids earlier in the session,
# and rebinds the module-level name `mongo` to a new client that is not closed in this cell.
from modules.database import FindData
mongo = app.get_mongo_client()
horse_id_list = FindData(mongo).get_all_horse_ids()

In [14]:
# update horse data
ttl_horse_id_dict = {}
def cache_ttl_horse_id_dict(
    horse_id_list: list,
    get_type: list,
    cached_data: dict = None,
    ttl: datetime.timedelta = datetime.timedelta(days=7),
):
    """
    Upsert horse data for every horse whose last refresh is older than ``ttl``.

    ``cached_data`` maps horse id -> datetime of its last successful refresh
    and is mutated in place. A horse is skipped while
    ``last refresh + ttl >= now``, where ``now`` is sampled once at the start
    of the call and used as the timestamp for every horse refreshed in it.

    Args:
        horse_id_list: Horse ids to refresh.
        get_type: Passed through unchanged to ``app.upsert_horse_data``.
        cached_data: Refresh timestamps per horse id. When omitted, the
            module-level ``ttl_horse_id_dict`` is used.
        ttl: How long a refresh stays valid.
    """
    if cached_data is None:
        # Look the global up at call time. A default bound in the signature is
        # evaluated once at definition time and would keep pointing at the
        # original empty dict after the global is rebound to a loaded cache.
        cached_data = ttl_horse_id_dict
    driver = app.get_driver()
    mongo = app.get_mongo_client()
    cache_date = datetime.datetime.now()
    try:
        for horse_id in tqdm(horse_id_list):
            last_cache_date = cached_data.get(horse_id)
            if last_cache_date is not None and last_cache_date + ttl >= cache_date:
                continue  # still fresh
            driver = app.upsert_horse_data(driver, mongo, horse_id, get_type)
            # Stamp only after the upsert returned, so a failed horse is
            # retried on the next run instead of being treated as fresh.
            cached_data[horse_id] = cache_date
    finally:
        # Release the browser and the DB connection even if an upsert raised.
        try:
            if driver:
                driver.close()
                driver.quit()
        finally:
            mongo.close()

In [5]:
def pop_last_dict_item(d: dict):
    """
    Remove the most recently inserted item from ``d`` in place and return ``d``.

    An empty dict is returned unchanged instead of raising.

    Args:
        d: The dict to trim; it is mutated.

    Returns:
        The same dict object that was passed in.
    """
    if d:
        # dicts keep insertion order and popitem() removes the last pair (LIFO).
        d.popitem()
    return d

In [16]:
# Restore the horse id -> timestamp mapping from cache/ttl_horse_id_dict.pkl.
# Note that this rebinds the global name to a new object.
ttl_horse_id_dict = load_cache("ttl_horse_id_dict")

In [None]:
# Kinds of horse data to fetch; this list is passed to cache_ttl_horse_id_dict.
get_type = [
    "profile",
    # "pedigree",
    "result"
    ]
# Drop the last entry of the loaded mapping before running
# (presumably so the horse being processed when the previous run stopped is fetched again -- confirm).
ttl_horse_id_dict = pop_last_dict_item(ttl_horse_id_dict)
# NOTE(review): cached_data is not passed, so this call relies on the function's default
# cache; verify that it is the same dict object that is saved on the next line.
cache_ttl_horse_id_dict(horse_id_list, get_type)
save_cache(ttl_horse_id_dict, "ttl_horse_id_dict")

In [None]:
import app
# NOTE(review): `type` shadows the builtin of the same name for the rest of the session;
# later cells read this global, so renaming it must be done across all of them together.
type = "jockey"
# Ids returned by app.find_human_ids_for_db for the "jockey" kind.
jockey_id_list = app.find_human_ids_for_db(type)

In [None]:
# update jockey data
# Upsert every id in jockey_id_list using the kind currently stored in the global `type`.
driver = app.get_driver()
mongo = app.get_mongo_client()
try:
    for jockey_id in tqdm(jockey_id_list):
        # upsert_human_data returns a driver; keep using the one it hands back.
        driver = app.upsert_human_data(driver, mongo, jockey_id, type)
finally:
    # Release the browser and the DB connection even if an upsert raised.
    try:
        if driver:
            driver.close()
            driver.quit()
    finally:
        mongo.close()

In [None]:
import app
# NOTE(review): `type` shadows the builtin of the same name for the rest of the session.
type = "trainer"
# NOTE(review): despite its name, jockey_id_list holds the ids returned for the "trainer"
# kind from here on; the following cell reads it under this name.
jockey_id_list = app.find_human_ids_for_db(type)

In [None]:
# update trainer data
# Upsert every id in jockey_id_list using the kind currently stored in the global `type`
# (the list is expected to hold trainer ids at this point, despite its name).
driver = app.get_driver()
mongo = app.get_mongo_client()
try:
    for jockey_id in tqdm(jockey_id_list):
        # upsert_human_data returns a driver; keep using the one it hands back.
        driver = app.upsert_human_data(driver, mongo, jockey_id, type)
finally:
    # Release the browser and the DB connection even if an upsert raised.
    try:
        if driver:
            driver.close()
            driver.quit()
    finally:
        mongo.close()

# Preprocess for training

In [50]:
# NOTE(review): `type` shadows the builtin of the same name for the rest of the session.
type = "horse"
# Options handed to app.Preprocess; only "course" is active, the commented-out keys are
# other options that are currently disabled.
# Presumably these act as filters on the data and "ダ" selects dirt courses -- TODO confirm in app.Preprocess.
include = {
    # "date": {"start_date": "2010-01-01", "end_date": "2023-12-31"},
    # "local": "名古屋",
    "course": "ダ",
    # "distance": "1600",
    # "ground_state": "良",
}
preprocess = app.Preprocess(type, include)

In [51]:
# Take the formatted result table from the Preprocess object (presumably a pandas DataFrame)
# and print its shape as a quick size check.
df = preprocess.formatted_result_df
print(df.shape)

(402849, 21)


In [None]:
# Build an app.Model from the preprocessed table and the current `type`, then train and save it.
model = app.Model(df, type)
model.train()
model.save_model()

In [None]:
# Show the summary of the wrapped model object (presumably a Keras model -- confirm in app.Model).
model.model.summary()