In [1]:
import json
import os
import time

import numpy as np
import polars as pl
import requests

In [2]:
# --- Scraper configuration -------------------------------------------------
COMP = "lux-ai-season-3"  # key into COMPS selecting the competition to scrape
MAX_CALLS_PER_DAY = 3600  # Kaggle says don't do more than 3600 per day and 1 per second
LOWEST_SCORE_THRESH = 1400  # submissions must score above this to be scraped

# Input (Meta Kaggle CSVs) and output (replay / info JSON) directories.
META_DIR = "../data/meta/"
REPLAY_DIR = "../data/match/replay/"
INFO_DIR = "../data/match/info/"
for d in [META_DIR, REPLAY_DIR, INFO_DIR]:
    # exist_ok avoids the check-then-create race and keeps the cell re-runnable.
    os.makedirs(d, exist_ok=True)

base_url = "https://www.kaggle.com/api/i/competitions.EpisodeService/"
get_url = base_url + "GetEpisodeReplay"

TIME_BUFFER = 1  # seconds
# Competition slug -> numeric id compared against Episodes.csv "CompetitionId".
COMPS = {
    "lux-ai-season-3": 86411,
    "lux-ai-2022": 45040,
    "kore-2022": 34419,
    "lux-ai-2021": 30067,
    "hungry-geese": 25401,
    "rock-paper-scissors": 22838,
    "santa-2020": 24539,
    "halite": 18011,
    "google-football": 21723,
}

### Loading

In [None]:
# Read the episode table and keep only the rows of the selected competition.
competition_id = COMPS[COMP]
episodes_df = pl.read_csv(META_DIR + "Episodes.csv").filter(
    pl.col("CompetitionId") == competition_id
)
print(f"Episodes.csv: {len(episodes_df)} rows after filtering for {COMP}.")

In [None]:
# Ids of the episodes that survived the competition filter.
unique_comp_episode_ids = episodes_df["Id"].unique().to_list()

# Per-agent rows, restricted to those episodes; Reward is forced to Float32 on read.
epagents_df = pl.read_csv(
    META_DIR + "EpisodeAgents.csv",
    schema_overrides={"Reward": pl.Float32},
).filter(pl.col("EpisodeId").is_in(unique_comp_episode_ids))

print(f"EpisodeAgents.csv: {len(epagents_df)} rows after filtering for {COMP}.")

- Idはゲームで対戦したエージェントのユニークなID
- EpisodeIdはゲーム毎のID
- SubmissionIdは提出されたエージェントのID

In [None]:
# Preview the first five rows of the filtered per-agent table.
epagents_df.head(5)

### Preprocess

In [None]:
# Missing-value check: per-column null counts for both tables.
display(episodes_df.null_count(), epagents_df.null_count())

In [7]:
# Parse the timestamp strings into Datetime columns.
episodes_df = episodes_df.with_columns(
    pl.col("CreateTime").str.to_datetime("%m/%d/%Y %H:%M:%S"),
    pl.col("EndTime").str.to_datetime("%m/%d/%Y %H:%M:%S"),
)

# Normalise the rating columns to Float32.
epagents_df = epagents_df.with_columns(
    pl.col("InitialConfidence").replace("", np.nan).cast(pl.Float32),
    pl.col("InitialScore").replace("", np.nan).cast(pl.Float32),
    pl.col("UpdatedConfidence").replace("", np.nan).cast(pl.Float32),
    pl.col("UpdatedScore").replace("", np.nan).cast(pl.Float32),
)

# polars keeps NaN and null distinct: fill_nan leaves nulls untouched, and a
# null would later reach float(None) when the info JSON is built. Fill both
# for every float column that is serialised.
float_columns = ["InitialConfidence", "InitialScore", "UpdatedConfidence", "UpdatedScore", "Reward"]
epagents_df = epagents_df.fill_nan(0.0).with_columns(pl.col(float_columns).fill_null(0.0))
epagents_df = epagents_df.sort("Id", descending=True)

### Filtering

In [None]:
# For each submission take the row of its most recent episode (highest
# EpisodeId), then keep only submissions whose score there beats the threshold.
newest_first = epagents_df.sort("EpisodeId", descending=True)
max_df = newest_first.group_by("SubmissionId").agg(
    pl.col("EpisodeId").first(),
    pl.col("UpdatedScore").first(),
)
max_df = max_df.filter(pl.col("UpdatedScore") > LOWEST_SCORE_THRESH)

# SubmissionId -> latest UpdatedScore lookup used by the scraping cells.
sub_to_score_dict = dict(max_df.select("SubmissionId", "UpdatedScore").iter_rows())
print(f"{len(sub_to_score_dict)} submissions with score over {LOWEST_SCORE_THRESH}")

In [None]:
# Get episodes for these submissions.
# A single is_in filter replaces one full-table filter (plus a sort whose order
# was discarded by the set) per submission; the resulting set is the same.
if sub_to_score_dict:
    candidate_episodes = set(
        epagents_df.filter(pl.col("SubmissionId").is_in(list(sub_to_score_dict)))["EpisodeId"]
    )
else:
    # No submission passed the threshold, so there is nothing to fetch.
    candidate_episodes = set()

print(f"{len(candidate_episodes)} episodes for these {len(sub_to_score_dict)} submissions")

In [None]:
# Collect every file name found under REPLAY_DIR (sub-directories included).
all_files = []
for root, dirs, files in os.walk(REPLAY_DIR):
    all_files.extend(files)

# An episode counts as saved only when a file named "<digits>....json" exists.
# endswith() rejects leftovers such as "123.json.bak" that the former substring
# test accepted, and isdecimal() only admits characters that int() can parse.
seen_episodes = {
    int(file.split(".")[0])
    for file in all_files
    if file.endswith(".json") and file.split(".")[0].isdecimal()
}
unseen_episodes = candidate_episodes - seen_episodes

print(f"{len(unseen_episodes)} episodes out of the {len(candidate_episodes)} candidate episodes not yet saved")
print(f"Total episodes saved: {len(seen_episodes)}")

### Scraping

In [13]:
def get_and_save_replay(episode_id, timeout=60):
    """Download one episode replay and store it as ``REPLAY_DIR/<episode_id>.json``.

    Args:
        episode_id: Id of the episode to fetch; sent to the API as an int.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot hang the scraping loop indefinitely.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status. Nothing
            is written in that case, so an error payload is never saved as if
            it were a replay.
    """
    response = requests.post(get_url, json={"episodeId": int(episode_id)}, timeout=timeout)
    response.raise_for_status()
    # Parse before opening the file so a bad body does not leave an empty file.
    replay = response.json()

    with open(REPLAY_DIR + f"{episode_id}.json", "w") as f:
        json.dump(replay, f)


def save_replay_info(temp_episodes_df, temp_epagents_df, episode_id=None):
    """Write the metadata JSON of one episode to ``INFO_DIR/<episode_id>.json``.

    Args:
        temp_episodes_df: ``episodes_df`` filtered down to a single row. Its
            ``CreateTime`` and ``EndTime`` columns must already be Datetime.
        temp_epagents_df: Rows of ``epagents_df`` for that episode; one entry
            is written to ``agents`` per row, in the order given.
        episode_id: Optional explicit episode id. When omitted it is read from
            the ``Id`` column of ``temp_episodes_df``, so the function does not
            depend on a notebook-level ``episode_id`` variable.

    Raises:
        ValueError: Raised by ``Series.item()`` when ``temp_episodes_df`` does
            not hold exactly one row.
    """
    if episode_id is None:
        episode_id = temp_episodes_df["Id"].item()

    # dt.epoch("s") returns whole seconds whatever time unit the column uses.
    # The previous Float32 cast divided by 1e9 assumed nanoseconds and rounded
    # the timestamp to Float32 precision.
    create_seconds = int(temp_episodes_df["CreateTime"].dt.epoch("s").item())
    end_seconds = int(temp_episodes_df["EndTime"].dt.epoch("s").item())

    # One plain-Python dict per agent row so json.dump can serialise it.
    agents = [
        {
            "id": int(row["Id"]),
            "state": int(row["State"]),
            "submissionId": int(row["SubmissionId"]),
            "reward": float(row["Reward"]),
            "index": int(row["Index"]),
            "initialScore": float(row["InitialScore"]),
            "initialConfidence": float(row["InitialConfidence"]),
            "updatedScore": float(row["UpdatedScore"]),
            "updatedConfidence": float(row["UpdatedConfidence"]),
        }
        for row in temp_epagents_df.rows(named=True)
    ]

    info = {
        "id": int(episode_id),
        "competitionId": int(COMPS[COMP]),
        "createTime": create_seconds,
        "endTime": end_seconds,
        "agents": agents,
    }

    with open(INFO_DIR + f"{episode_id}.json", "w") as f:
        json.dump(info, f)

In [None]:
# Download replays for the selected submissions, best score first, newest
# episode first, until everything is saved or the daily budget is spent.
start_time = time.time()  # wall-clock start of this run
num_episodes_saved = 0

num_api_calls_today = 0
# Never exceed 3600 calls even if MAX_CALLS_PER_DAY is configured higher.
call_budget = min(3600, MAX_CALLS_PER_DAY)

for key, _ in sorted(sub_to_score_dict.items(), key=lambda kv: kv[1], reverse=True):
    episodes_for_sub = sorted(epagents_df.filter(pl.col("SubmissionId") == key)["EpisodeId"], reverse=True)

    for episode_id in episodes_for_sub:
        # Checked before the request with >= so the budget is never overshot.
        if num_api_calls_today >= call_budget:
            print("API call limit reached")
            break

        if episode_id in seen_episodes:
            continue

        # Time each request on its own. Comparing against the run-wide
        # start_time stopped throttling after the first second.
        request_started = time.monotonic()

        temp_episodes_df = episodes_df.filter(pl.col("Id") == episode_id)
        temp_epagents_df = epagents_df.filter(pl.col("EpisodeId") == episode_id).sort("Index", descending=True)

        get_and_save_replay(episode_id)
        save_replay_info(temp_episodes_df, temp_epagents_df)

        if not (
            os.path.exists(REPLAY_DIR + f"{episode_id}.json") and os.path.exists(INFO_DIR + f"{episode_id}.json")
        ):
            raise Exception(f"Episode {episode_id} not saved")

        print(str(num_api_calls_today) + f": saved episode #{episode_id}")
        seen_episodes.add(episode_id)
        num_api_calls_today += 1
        # Counted only after both files are confirmed on disk.
        num_episodes_saved += 1

        # Keep at least TIME_BUFFER seconds between consecutive requests.
        remaining = TIME_BUFFER - (time.monotonic() - request_started)
        if remaining > 0:
            time.sleep(remaining)
    else:
        # Inner loop finished without hitting the budget: next submission.
        continue
    break


print("")
print(f"Episodes saved: {num_episodes_saved}")