In [1]:
import os
import glob
import pandas as pd
from utils.match_prediction import PREPARED_DATA_DIR

# Get the first training file
train_files = glob.glob(os.path.join(PREPARED_DATA_DIR, "train", "*.parquet"))
first_file = sorted(train_files)[0] # Sort to ensure consistent file selection

# Load the file
df = pd.read_parquet(first_file)

# Print basic information
print(f"Loading file: {os.path.basename(first_file)}")
print(f"\nDataFrame shape: {df.shape}")
print("\nColumn dtypes:")
print(df.dtypes)
print("\nFirst few rows:")
df.head()

Loading file: train_0000.parquet

DataFrame shape: (50319, 396)

Column dtypes:
elo                                     int64
queue_type                              int64
patch                                   int64
champion_ids                           object
win_prediction                        float64
                                       ...   
team_200_dragonKills_at_1800000       float64
team_100_baronKills_at_1800000        float64
team_200_baronKills_at_1800000        float64
team_100_inhibitorKills_at_1800000    float64
team_200_inhibitorKills_at_1800000    float64
Length: 396, dtype: object

First few rows:


Unnamed: 0,elo,queue_type,patch,champion_ids,win_prediction,gameDuration,team_100_TOP_kills_at_900000,team_100_TOP_kills_at_1200000,team_100_TOP_kills_at_1500000,team_100_TOP_kills_at_1800000,...,team_100_riftHeraldKills_at_1500000,team_200_riftHeraldKills_at_1500000,team_100_towerKills_at_1800000,team_200_towerKills_at_1800000,team_100_dragonKills_at_1800000,team_200_dragonKills_at_1800000,team_100_baronKills_at_1800000,team_200_baronKills_at_1800000,team_100_inhibitorKills_at_1800000,team_200_inhibitorKills_at_1800000
0,3,0,8,"[57, 20, 4, 76, 16, 95, 121, 110, 33, 50]",0.0,-0.169852,-0.107682,-0.085874,-0.379072,-0.494136,...,0.976788,-0.809435,-0.788232,0.588181,-0.452755,0.165908,-0.545695,1.50004,-0.477136,0.795462
1,3,0,2,"[46, 13, 63, 56, 107, 150, 60, 110, 126, 119]",1.0,0.140692,-1.211029,-0.923673,-1.069158,-1.112064,...,-1.023764,1.23543,-0.069771,0.588181,1.281862,-1.502151,1.583909,-0.566738,-0.477136,-0.500477
2,3,0,7,"[134, 9, 169, 33, 143, 135, 137, 24, 62, 50]",0.0,0.370339,-0.107682,-0.085874,-0.379072,0.432758,...,0.976788,-0.809435,1.726382,-0.849802,-1.320064,2.667995,1.583909,1.50004,6.258865,-0.500477
3,3,0,5,"[89, 152, 64, 51, 61, 150, 60, 23, 53, 155]",0.0,-0.313381,-0.659355,-0.504773,-0.724115,-0.8031,...,0.976788,-0.809435,-0.429001,-0.849802,-1.320064,1.833966,-0.545695,1.50004,-0.477136,-0.500477
4,3,0,9,"[29, 118, 161, 51, 39, 1, 66, 138, 56, 50]",0.0,0.054575,-1.211029,-1.342572,-0.724115,-0.8031,...,-1.023764,1.23543,1.367151,-1.209298,-1.320064,0.999937,-0.545695,-0.566738,-0.477136,-0.500477


In [2]:
unique_values_elo = sorted(df["elo"].unique())
unique_values_queue_type = sorted(df["queue_type"].unique())
unique_values_patch = sorted(df["patch"].unique())
unique_values_champion_ids = sorted(df["champion_ids"].explode().unique())

print(f"Unique values for elo: {unique_values_elo}")
print(f"Unique values for queue_type: {unique_values_queue_type}")
print(f"Unique values for patch: {unique_values_patch}")
print(f"Unique values for champion_ids: {unique_values_champion_ids}")

Unique values for elo: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]
Unique values for queue_type: [np.int64(0)]
Unique values for patch: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19)]
Unique values for champion_ids: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), 

In [3]:
from utils.match_prediction import PATCH_MAPPING_PATH, CHAMPION_ID_ENCODER_PATH
import pickle

with open(CHAMPION_ID_ENCODER_PATH, "rb") as f:
    champion_mapping = pickle.load(f)["mapping"]

with open(PATCH_MAPPING_PATH, "rb") as f:
    patch_mapping = pickle.load(f)["mapping"]

# TODO: one is dict, the other is encoder, maybe would be better if both had same type
print(f"Patch mapping: {patch_mapping}")
print(f"Champion mapping: {champion_mapping}")

Patch mapping: {'14.12': 0, '14.13': 1, '14.14': 2, '14.15': 3, '14.16': 4, '14.17': 5, '14.18': 6, '14.19': 7, '14.20': 8, '14.21': 9, '14.22': 10, '14.23': 11, '14.24': 12, '15.01': 13, '15.02': 14, '15.03': 15, '15.04': 16, '15.05': 17, '15.06': 18, '15.07': 19}
Champion mapping: LabelEncoder()


In [4]:
from utils.match_prediction import TASK_STATS_PATH

with open(TASK_STATS_PATH, "rb") as f:
    task_stats = pickle.load(f)

print(f"Task stats: {task_stats}")


Task stats: {'means': {'gameDuration': np.float64(1693.0869970997278), 'team_100_TOP_kills_at_900000': np.float64(2.195191482355468), 'team_100_TOP_kills_at_1200000': np.float64(3.20499851003822), 'team_100_TOP_kills_at_1500000': np.float64(4.098621219142459), 'team_100_TOP_kills_at_1800000': np.float64(4.599328155747957), 'team_200_TOP_kills_at_900000': np.float64(2.148420296969359), 'team_200_TOP_kills_at_1200000': np.float64(3.144496391645933), 'team_200_TOP_kills_at_1500000': np.float64(4.044983867442006), 'team_200_TOP_kills_at_1800000': np.float64(4.558975893665334), 'team_100_JUNGLE_kills_at_900000': np.float64(3.2156324601470216), 'team_100_JUNGLE_kills_at_1200000': np.float64(4.505613249691892), 'team_100_JUNGLE_kills_at_1500000': np.float64(5.543368448589702), 'team_100_JUNGLE_kills_at_1800000': np.float64(6.08942302725116), 'team_200_JUNGLE_kills_at_900000': np.float64(3.171492946793223), 'team_200_JUNGLE_kills_at_1200000': np.float64(4.4546881119372275), 'team_200_JUNGLE_ki

In [5]:
# first 10 values of gold_diff_at_20 denormalized
print(df["gold_diff_at_20"].head(10))

mean = task_stats["means"]["gold_diff_at_20"]
std = task_stats["stds"]["gold_diff_at_20"]

print(df["gold_diff_at_20"].head(10) * std + mean)

print(df["blue_has_gold_lead_at_20"].head(10))
print(df["red_has_gold_lead_at_20"].head(10))


KeyError: 'gold_diff_at_20'