In [1]:
import pandas as pd
import numpy as np

import os
import json
import time
import re

import pyspark.sql.types as T
import pyspark.sql.functions as F

from pyspark import SparkContext, SparkConf, SQLContext

from dateutil.relativedelta import relativedelta
from pyspark.sql.functions import pandas_udf
from pyspark.sql.window import Window

from datetime import date, datetime, timedelta, timezone

In [2]:
# Spark application settings: the app name and a local master using 10 worker threads.
appName = "PySpark TFT puuids"
master = "local[10]"
# Build the Spark configuration. The session time zone is pinned to UTC.
# NOTE(review): executor memory/overhead, dynamic allocation and speculation are
# cluster-oriented settings; presumably they have little or no effect under a
# local master — confirm before relying on them.
# NOTE(review): spark.local.dir is an absolute, user-specific path; the startup
# log below also warns that a cluster manager may override it.
conf = SparkConf() \
    .setAppName(appName) \
    .setMaster(master) \
    .set("spark.executor.memory", "40g") \
    .set("spark.driver.memory", "40g") \
    .set("spark.executor.memoryOverhead", "8g") \
    .set("spark.local.dir", "/home/mai/spark-temp") \
    .set("spark.sql.session.timeZone", "UTC") \
    .set("spark.dynamicAllocation.enabled", "true") \
    .set("spark.dynamicAllocation.minExecutors", "2") \
    .set("spark.dynamicAllocation.maxExecutors", "50") \
    .set("spark.speculation", "true") 
   
# Reuse a running SparkContext if there is one, otherwise create it from `conf`;
# the SparkSession used by the rest of the notebook is taken from the SQLContext.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

25/03/26 22:30:04 WARN Utils: Your hostname, LAPTOP-4O0SI9BK resolves to a loopback address: 127.0.1.1; using 172.30.55.29 instead (on interface eth0)
25/03/26 22:30:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/26 22:30:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/26 22:30:07 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Remove pandas' display limits so every column and every row of a frame is rendered.
# NOTE(review): with max_rows=None, displaying a large frame prints all of its rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Relative path of the match data directory (presumably the cleaned match dumps
# read later in the notebook — confirm against the loading cells).
MATCH_DIR = './data/tft_match_data_cleaned/'

## 0. Load reference data

### Units

In [5]:
# Directory holding the Set 13 reference JSON files (units.json, items.json,
# traits.json) that the following cells load.
REFERENCE_DATA_DIR = './data/data_dragon/set13/'

In [6]:
# units
# Read the unit reference table. JSON text is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform's default
# locale encoding.
with open(os.path.join(REFERENCE_DATA_DIR, 'units.json'), 'r', encoding='utf-8') as file:
    SET13_UNITS = json.load(file)

In [7]:
# alphabetical list of the display name of every unit in the reference table
sorted(unit['name'] for unit in SET13_UNITS.values())

['Akali',
 'Ambessa',
 'Amumu',
 'Blitzcrank',
 'Caitlyn',
 'Camille',
 'Cassiopeia',
 'Corki',
 'Darius',
 'Dr. Mundo',
 'Draven',
 'Ekko',
 'Elise',
 'Ezreal',
 'Gangplank',
 'Garen',
 'Heimerdinger',
 'Illaoi',
 'Irelia',
 'Jayce',
 'Jinx',
 "Kog'Maw",
 'LeBlanc',
 'Leona',
 'Loris',
 'Lux',
 'Maddie',
 'Malzahar',
 'Mel',
 'Mordekaiser',
 'Morgana',
 'Nami',
 'Nocturne',
 'Nunu & Willump',
 'Powder',
 'Rell',
 'Renata Glasc',
 'Renni',
 'Rumble',
 'Scar',
 'Sett',
 'Sevika',
 'Silco',
 'Singed',
 'Smeech',
 'Steb',
 'Swain',
 'Tristana',
 'Trundle',
 'Twisted Fate',
 'Twitch',
 'Urgot',
 'Vander',
 'Vex',
 'Vi',
 'Viktor',
 'Violet',
 'Vladimir',
 'Warwick',
 'Zeri',
 'Ziggs',
 'Zoe',
 'Zyra']

In [8]:
# classify units
# Every name below must match a SET13_UNITS[...]['name'] value exactly: the
# lists are used with Series.isin(), so a misspelt name silently never matches.

# units whose damage is treated as ability power (magic) damage
AP_UNITS = [
    'Akali',
    'Amumu',
    'Blitzcrank',
    'Cassiopeia',
    'Dr. Mundo',
    'Ekko',
    'Elise',
    'Heimerdinger',
    'Illaoi',
    'Irelia',
    "Kog'Maw",        # was "Kog' Maw", which matched no unit and left Kog'Maw in AD_UNITS
    'LeBlanc',
    'Leona',
    'Loris',
    'Lux',
    'Malzahar',
    'Mel',
    'Mordekaiser',
    'Morgana',
    'Nami',
    'Nunu & Willump',
    'Powder',
    'Rell',
    'Renata Glasc',
    'Rumble',
    'Scar',
    'Sett',
    'Silco',
    'Singed',
    'Steb',
    'Swain',
    'Twisted Fate',
    'Vex',
    'Viktor',
    'Vladimir',
    'Ziggs',
    'Zoe',
    'Zyra',
]

# every remaining unit of the set is treated as attack damage (order follows SET13_UNITS)
AD_UNITS = [unit['name'] for unit in SET13_UNITS.values() if unit['name'] not in AP_UNITS]

CARRY_UNITS = [
    'Twitch',
    "Kog'Maw",
    'Tristana',
    'Zeri',
    'Draven',
    'Maddie',
]

# units marked "ranged" also appear in another class list with a "melee" marker
CASTER_UNITS = [
    'Mel',
    'Viktor',
    'Caitlyn',
    'Jayce',          # ranged
    'Jinx',
    'LeBlanc',
    'Malzahar',
    'Corki',
    'Elise',          # ranged
    'Heimerdinger',
    'Silco',
    'Zoe',
    'Cassiopeia',
    'Ezreal',
    'Gangplank',      # ranged
    'Nami',
    'Renata Glasc',
    'Swain',          # ranged
    'Ziggs',
    'Lux',
    'Morgana',
    'Powder',
    'Vex',
    'Zyra',
    'Twisted Fate',
]

REAPER_UNITS = [
    'Smeech',
    'Akali',
    'Nocturne',
]

TANK_UNITS = [
    'Dr. Mundo',
    'Elise',             # melee
    'Garen',
    'Illaoi',
    'Blitzcrank',
    'Jayce',             # melee
    'Loris',
    'Nunu & Willump',
    'Renni',
    'Scar',
    'Swain',             # melee
    'Leona',
    'Rell',
    'Sett',
    'Vander',
    'Vladimir',
    'Amumu',
    'Irelia',
    'Singed',
    'Steb',
    'Trundle',
]

# NOTE: the name is singular (unlike the other *_UNITS lists); it is kept as-is
# because extract_unit_characteristics refers to it by this name.
FIGHTER_UNIT = [
    'Warwick',
    'Mordekaiser',
    'Rumble',
    'Sevika',
    'Ambessa',
    'Ekko',
    'Vi',
    'Gangplank',         # melee
    'Camille',
    'Urgot',
    'Darius',
    'Violet',
]

In [9]:
# inspect one unit record to see which fields the reference table provides
SET13_UNITS['TFT13_Swain']

{'id': 'TFT13_Swain',
 'name': 'Swain',
 'tier': 3,
 'image': {'full': 'TFT13_Swain.TFT_Set13.png',
  'sprite': 'tft-champion5.png',
  'group': 'tft-champion',
  'x': 48,
  'y': 0,
  'w': 48,
  'h': 48}}

### Items

In [10]:
# items
# Read the item reference table. JSON text is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform's default
# locale encoding.
with open(os.path.join(REFERENCE_DATA_DIR, 'items.json'), 'r', encoding='utf-8') as file:
    SET13_ITEMS = json.load(file)

In [11]:
# (display name, item id) pairs for every item whose key contains '_Ornn'
item_list = [(item['name'], item['id']) for key, item in SET13_ITEMS.items() if '_Ornn' in key]

# show the pairs ordered by display name
sorted(item_list, key=lambda pair: pair[0])

[('Anima Visage', 'TFT4_Item_OrnnAnimaVisage'),
 ("Blacksmith's Gloves", 'TFT9_Item_OrnnPrototypeForge'),
 ("Choncc's Prowler's Claw", 'TFT9_Item_OrnnDuskbladeOfDraktharr'),
 ("Choncc's Rocket-Propelled Fist", 'TFT4_Item_OrnnRocketPropelledFist'),
 ("Death's Defiance", 'TFT4_Item_OrnnDeathsDefiance'),
 ('Deathfire Grasp', 'TFT9_Item_OrnnDeathfireGrasp'),
 ('Eternal Winter', 'TFT4_Item_OrnnEternalWinter'),
 ('Gold Collector', 'TFT4_Item_OrnnTheCollector'),
 ('Hullcrusher', 'TFT9_Item_OrnnHullbreaker'),
 ('Infinity Force', 'TFT4_Item_OrnnInfinityForce'),
 ('Manazane', 'TFT4_Item_OrnnMuramana'),
 ('Obsidian Cleaver', 'TFT4_Item_OrnnObsidianCleaver'),
 ("Randuin's Omen", 'TFT4_Item_OrnnRanduinsSanctum'),
 ("Sniper's Focus", 'TFT9_Item_OrnnHorizonFocus'),
 ("Trickster's Glass", 'TFT9_Item_OrnnTrickstersGlass'),
 ("Zhonya's Paradox", 'TFT4_Item_OrnnZhonyasParadox')]

In [12]:
# Item categories used to count what each unit is holding.
# The hand-written lists contain item ids; trailing comments give the in-game
# name where the id does not make it obvious.

# support / utility items
UTILITY_ITEMS = [
    'TFT_Item_AegisOfTheLegion',
    'TFT_Item_ZekesHerald',
    'TFT_Item_Zephyr',
    'TFT_Item_TitanicHydra',     # Zz'Rot Portal
    'TFT_Item_Shroud',           # Shroud of Stillness
    'TFT_Item_BansheesVeil',
    'TFT_Item_Chalice',
    'TFT_Item_Moonstone',
    'TFT_Item_SupportKnightsVow',
    'TFT_Item_LocketOfTheIronSolari',
    'TFT_Item_SentinelSwarm',
    'TFT_Item_Spite',
    'TFT_Item_EternalFlame',
    'TFT_Item_RadiantVirtue',   # virtue of the martyr
]


# basic components
COMPONENT_ITEMS = [
    'TFT_Item_BFSword',
    'TFT_Item_ChainVest',
    'TFT_Item_SparringGloves',
    'TFT_Item_GiantsBelt',
    'TFT_Item_Spatula',
    'TFT_Item_NegatronCloak',
    'TFT_Item_RecurveBow',
    'TFT_Item_FryingPan',
    'TFT_Item_TearOfTheGoddess',
    'TFT_Item_NeedlesslyLargeRod',
]

# completed (combined) items
COMBINED_ITEMS = [
    'TFT_Item_AdaptiveHelm',
    'TFT_Item_ArchangelsStaff',
    'TFT_Item_Bloodthirster',
    'TFT_Item_BlueBuff',
    'TFT_Item_BrambleVest',
    'TFT_Item_Crownguard',
    'TFT_Item_Deathblade',
    'TFT_Item_DragonsClaw',
    'TFT_Item_SpectralGauntlet',    # evenshroud
    'TFT_Item_GargoyleStoneplate',
    'TFT_Item_PowerGauntlet',       # guardbreaker
    'TFT_Item_GuinsoosRageblade',
    'TFT_Item_GuardianAngel',       # edge of night
    'TFT_Item_UnstableConcoction',  # hands of justice
    'TFT_Item_HextechGunblade',
    'TFT_Item_InfinityEdge',
    'TFT_Item_IonicSpark',
    'TFT_Item_JeweledGauntlet',
    'TFT_Item_LastWhisper',
    'TFT_Item_Morellonomicon',
    'TFT_Item_Leviathan',          # nashor's tooth
    'TFT_Item_FrozenHeart',        # protector's vow
    'TFT_Item_Quicksilver',
    'TFT_Item_RabadonsDeathcap',
    'TFT_Item_RapidFireCannon',    # red buff
    'TFT_Item_Redemption',
    'TFT_Item_RunaansHurricane',
    'TFT_Item_SpearOfShojin',
    'TFT_Item_StatikkShiv',
    'TFT_Item_SteraksGage',
    'TFT_Item_ThiefsGloves',
    'TFT_Item_TitansResolve',
    'TFT_Item_WarmogsArmor',
    'TFT_Item_RedBuff',            # sunfire cape
]

TACTICIAN_CROWN_ITEMS = [
    'TFT_Item_TacticiansRing',   # tactician's cape
    'TFT_Item_ForceOfNature',    # tactician's crown
    'TFT_Item_TacticiansScepter', # tactician's shield
]

# The remaining categories are derived from SET13_ITEMS by matching on the
# dictionary key (substring or prefix) and collecting each entry's 'id'.
RADIANT_ITEMS = [item['id'] for key, item in SET13_ITEMS.items() if 'RadiantItems' in key]

EMBLEM_ITEMS = [item['id'] for key, item in SET13_ITEMS.items() if key.startswith('TFT13_EmblemItems')]

ARTIFACT_ITEMS = [item['id'] for key, item in SET13_ITEMS.items() if key.startswith('TFT_Item_Artifact_')]

ORNN_ITEMS = [item['id'] for key, item in SET13_ITEMS.items() if '_Ornn' in key]

CHEMBARON_ITEMS = [item['id'] for key, item in SET13_ITEMS.items() if key.startswith('TFT13_Crime_Illegal_Items')]

JUNKERKING_UPGRADE_ITEMS = [item['id'] for key, item in SET13_ITEMS.items() if 'JunkerKingUpgrade' in key]

SHIMMERSCALE_ITEMS = [item['id'] for key, item in SET13_ITEMS.items() if 'Shimmerscale' in key]

### Traits

In [13]:
# traits
# Read the trait reference table. JSON text is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform's default
# locale encoding.
with open(os.path.join(REFERENCE_DATA_DIR, 'traits.json'), 'r', encoding='utf-8') as file:
    SET13_TRAITS = json.load(file)

In [14]:
# Trait categories, all expressed as trait ids (the keys of SET13_TRAITS).
# extract_trait_characteristics tests each trait id against these lists to
# derive the is_*_trait flags. A trait may appear in more than one list
# (e.g. 'TFT13_Warband' is both offensive and econ).

# team-up traits: every SET13_TRAITS key containing 'TFT13_Teamup_'
TEAMUP_TRAITS = [trait for trait in SET13_TRAITS if 'TFT13_Teamup_' in trait]

# single-unit traits; the comments give the in-game trait name and its unit
UNIQUE_TRAITS = [
    'TFT13_MissMageTrait',   # banished mage (Mel)
    'TFT13_MachineHerald',   # machine herald (Viktor)
    'TFT13_BloodHunter',     # blood hunter (Warwick)
    'TFT13_HighRoller',      # high roller (Sevika)
    'TFT13_JunkerKing',      # junker king (Rumble)
]

# vertical traits - focus on providing team-wide bonuses
# (comments give the in-game name where it differs from the id)
VERTICAL_TRAITS = [
    'TFT13_Academy',
    'TFT13_Hextech',         # automata
    'TFT13_Cabal',           # black rose
    'TFT13_Crime',           # chem baron
    'TFT13_Squad',           # enforcer
    'TFT13_Rebel',           
    'TFT13_Warband',         # conqueror
    'TFT13_Ambassador',      # emissary
    'TFT13_Experiment',
    'TFT13_Family',
    'TFT13_Hoverboard',      # firelight
    'TFT13_Scrap',
]

# horizontal traits - offer specific or situational benefits
# (comments give the in-game name where it differs from the id)
HORIZONTAL_TRAITS = [
    'TFT13_Ambusher',
    'TFT13_Martialist',      # artillerist
    'TFT13_Bruiser',
    'TFT13_Infused',         # dominator
    'TFT13_FormSwapper',
    'TFT13_Pugilist',        # pit fighter
    'TFT13_Challenger',      # quickstriker
    'TFT13_Titan',           # sentinel
    'TFT13_Sniper',
    'TFT13_Sorcerer',
    'TFT13_Invoker',         # visionary 
    'TFT13_Watcher',
]

# defensive traits
DEFENSIVE_TRAITS = [
    'TFT13_Titan',
    'TFT13_Watcher',
    'TFT13_Bruiser',
    'TFT13_Hextech',
]

# offensive traits
OFFENSIVE_TRAITS = [
    'TFT13_Ambusher',
    'TFT13_Martialist',
    'TFT13_Infused',
    'TFT13_Pugilist',
    'TFT13_Challenger',
    'TFT13_Sniper',
    'TFT13_Sorcerer',
    'TFT13_Squad',
    'TFT13_Rebel',
    'TFT13_Warband',
]

# utility traits (sustain, mana, items, etc.)
UTILITY_TRAITS = [
    'TFT13_Hoverboard',
    'TFT13_Ambassador',
    'TFT13_Experiment',
    'TFT13_FormSwapper',
    'TFT13_Academy',
    'TFT13_Cabal',
    'TFT13_Family',
    'TFT13_Invoker',
    'TFT13_Scrap',
]

# econ traits
ECON_TRAITS = [
    'TFT13_Crime',
    'TFT13_Warband',
]

In [15]:
# # anomalies
# with open(os.path.join(REFERENCE_DATA_DIR, 'anomalies.json'), 'r') as file:
#     SET13_ANOMALIES = json.load(file)

## 00. Utility functions

In [16]:
# output locations for the three feature tables
MATCH_STATS_DIR = './data/tft_match_stats'
TRAIT_FEATURES_DIR = './data/tft_trait_features'
UNIT_ITEM_FEATURES_DIR = './data/tft_unit_item_features'

# create each output directory if it does not exist yet
for output_dir in (MATCH_STATS_DIR, TRAIT_FEATURES_DIR, UNIT_ITEM_FEATURES_DIR):
    os.makedirs(output_dir, exist_ok=True)

### Separate data

In [17]:
# separate into 3 tables
# 1: individual player stats for each game
# 2: individual player traits for each game
# 3: individual player units & items for each game

def separate_data(df):
    """Split a per-player match frame into stats, traits and units tables.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (match, player). Must contain 'metadata_match_id', 'puuid',
        the game-stat columns listed below, and the list-valued columns
        'traits' and 'units'.

    Returns
    -------
    (df_stats, df_traits, df_units) : tuple of pandas.DataFrame
        df_stats  - identifier + game-stat columns, one row per input row.
        df_traits - identifier columns + 'traits', one row per list element.
        df_units  - identifier columns + 'units', one row per list element.
        The exploded tables keep the input's index labels (repeated once per
        list element); an empty list yields a single row holding NaN.

    The input frame is not modified. df_stats is an independent copy, so
    callers can add columns to it without SettingWithCopyWarning.
    """
    # identifier columns: match id & player puuid
    id_cols = ['metadata_match_id', 'puuid']
    # game stats columns
    stats_cols = [
        'info_game_length',
        'gold_left',
        'last_round',
        'level',
        'placement',
        'players_eliminated',
        'total_damage_to_players',
        'win',
    ]

    # Selecting columns and exploding both build new frames, so a deep copy of
    # the whole input (including the heavy list columns) is not needed.
    df_stats = df[id_cols + stats_cols].copy()
    df_traits = df[id_cols + ['traits']].explode(column='traits')
    df_units = df[id_cols + ['units']].explode(column='units')

    return df_stats, df_traits, df_units
    

### Match stats

In [18]:
# function for creating match statistics
def create_match_stats(df_stats):
    """Add per-match aggregate features to the player stats table.

    For each of 'level', 'players_eliminated', 'total_damage_to_players' and
    'gold_left' the following columns are added (X = the stat name):

    - max_X_by_match / min_X_by_match / avg_X_by_match: the max, min and mean
      of X over all rows sharing the same 'metadata_match_id';
    - diff_max_X: match maximum minus the row's own value;
    - diff_min_X: the row's own value minus the match minimum.

    Parameters
    ----------
    df_stats : pandas.DataFrame
        Must contain 'metadata_match_id' and the four stat columns above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are added in place.
    """
    tracked_stats = ('level', 'players_eliminated', 'total_damage_to_players', 'gold_left')

    for stat in tracked_stats:
        per_match = df_stats.groupby('metadata_match_id')[stat]

        df_stats[f'max_{stat}_by_match'] = per_match.transform('max')
        df_stats[f'min_{stat}_by_match'] = per_match.transform('min')
        df_stats[f'avg_{stat}_by_match'] = per_match.transform('mean')
        # Both operands come from df_stats. The original level difference read
        # the column from an unrelated global `df`, which is undefined or
        # misaligned inside this function.
        df_stats[f'diff_max_{stat}'] = df_stats[f'max_{stat}_by_match'] - df_stats[stat]
        df_stats[f'diff_min_{stat}'] = df_stats[stat] - df_stats[f'min_{stat}_by_match']

    return df_stats

### Trait features

In [19]:
# function for extracting trait characteristics
def extract_trait_characteristics(df_traits):
    """Flatten the raw trait records and add trait-category and contest features.

    Parameters
    ----------
    df_traits : pandas.DataFrame
        One row per (match, player, trait) with columns 'metadata_match_id',
        'puuid' and 'traits'. 'traits' holds a dict with the keys 'name',
        'num_units', 'style', 'tier_current' and 'tier_total', or a non-dict
        placeholder when the player has no trait.

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of a left merge) with the descriptive columns,
        the 0/1 category flags, and 'players_contesting_trait' /
        'players_contesting_trait_higher'. The descriptive and flag columns are
        also added to the frame passed in.
    """
    def _is_record(entry):
        return isinstance(entry, dict)

    def _field(key):
        # value of `key` for real trait records, NaN for placeholders
        return df_traits['traits'].apply(lambda entry: entry.get(key) if _is_record(entry) else np.nan)

    # display name looked up in the reference table
    df_traits['trait_name'] = df_traits['traits'].apply(
        lambda entry: SET13_TRAITS[entry.get('name')]['name'] if _is_record(entry) else None
    )
    raw_fields = (
        ('num_trait_units', 'num_units'),
        ('trait_style', 'style'),
        ('trait_tier_current', 'tier_current'),
        ('trait_tier_total', 'tier_total'),
    )
    for column, key in raw_fields:
        df_traits[column] = _field(key)

    # a trait counts as active when it is a real record with a style above 0
    df_traits['is_trait'] = df_traits['traits'].apply(_is_record).astype(int)
    df_traits['is_trait_active'] = (
        (df_traits['is_trait'] == 1) & (df_traits['trait_style'] > 0)
    ).astype(int)

    # category flags, set only for active traits
    category_members = (
        ('is_teamup_trait', TEAMUP_TRAITS),
        ('is_unique_trait', UNIQUE_TRAITS),
        ('is_vertical_trait', VERTICAL_TRAITS),
        ('is_horizontal_trait', HORIZONTAL_TRAITS),
        ('is_defensive_trait', DEFENSIVE_TRAITS),
        ('is_offensive_trait', OFFENSIVE_TRAITS),
        ('is_utility_trait', UTILITY_TRAITS),
        ('is_econ_trait', ECON_TRAITS),
    )
    for flag, members in category_members:
        in_category = df_traits['traits'].apply(
            lambda entry, members=members: _is_record(entry) and entry['name'] in members
        )
        df_traits[flag] = in_category.astype(int) & df_traits['is_trait_active']

    # trait contest features: pair every active trait with the same active
    # trait held by the other players of the match
    active = df_traits[df_traits['is_trait_active'] == 1]
    rivals = active[['metadata_match_id', 'puuid', 'trait_name', 'trait_style']].copy()
    rivals = rivals.rename(columns={'trait_style': 'trait_style_other', 'puuid': 'puuid_other'})
    pairs = active.merge(rivals, on=['metadata_match_id', 'trait_name'], how='left')
    pairs = pairs[pairs['puuid'] != pairs['puuid_other']]
    # does the other player hold the trait at a higher or equal style?
    pairs['contesting_trait_higher'] = (pairs['trait_style_other'] >= pairs['trait_style']).astype(int)

    contest_keys = ['metadata_match_id', 'puuid', 'trait_name']
    trait_contest_features = pairs.groupby(contest_keys).agg(
        players_contesting_trait=('puuid_other', 'count'),
        players_contesting_trait_higher=('contesting_trait_higher', 'sum')
    ).reset_index()

    # uncontested traits (and placeholder rows) get 0 for both counts
    contest_columns = ['players_contesting_trait', 'players_contesting_trait_higher']
    df_traits = df_traits.merge(trait_contest_features, on=contest_keys, how='left')
    df_traits[contest_columns] = df_traits[contest_columns].fillna(0)

    return df_traits

In [20]:
# function for creating trait features
def create_trait_features(df_traits):
    """Aggregate the per-trait table into one feature row per (match, player).

    The pandas frame (without its raw 'traits' column) is converted to a Spark
    DataFrame through the notebook-level `spark` session and grouped by
    'metadata_match_id' and 'puuid'. The result contains:

    - active_traits: list of structs describing each active trait;
    - total_* counts per trait category and per trait style
      (style values 1, 2, 4, 5 are labelled bronze, silver, gold, prismatic);
    - total_<style>_<category>_traits counts for the defensive, offensive,
      econ, vertical and horizontal categories;
    - highest_*_trait_style maxima.

    Null aggregates are replaced by 0 via fillna(0).

    Returns
    -------
    pyspark.sql.DataFrame
    """
    traits_sdf = spark.createDataFrame(df_traits.drop('traits', axis=1))

    categories = ['teamup', 'unique', 'vertical', 'horizontal',
                  'defensive', 'offensive', 'utility', 'econ']
    style_values = [('bronze', 1), ('silver', 2), ('gold', 4), ('prismatic', 5)]

    def flag(category):
        return F.col('is_{}_trait'.format(category))

    def count_when(condition):
        # sum of 1s over the rows matching `condition` (null when none match)
        return F.sum(F.when(condition, 1))

    # struct collected for every active trait
    struct_fields = ['trait_name', 'trait_style',
                     'players_contesting_trait', 'players_contesting_trait_higher']
    struct_fields += ['is_{}_trait'.format(category) for category in categories]
    active_struct = F.struct(*[F.col(field) for field in struct_fields])

    aggregations = [
        F.collect_list(F.when(F.col('is_trait_active') == 1, active_struct)).alias('active_traits'),
        F.sum(F.col('is_trait_active')).alias('total_active_traits'),
    ]

    # number of active traits per category
    for category in categories:
        aggregations.append(F.sum(flag(category)).alias('total_{}_traits'.format(category)))

    # number of traits per style
    for style_name, style_value in style_values:
        aggregations.append(
            count_when(F.col('trait_style') == style_value)
            .alias('total_{}_style_traits'.format(style_name))
        )

    # number of traits per (category, style), highest style first
    for category in ['defensive', 'offensive', 'econ', 'vertical', 'horizontal']:
        for style_name, style_value in reversed(style_values):
            aggregations.append(
                count_when((F.col('trait_style') == style_value) & (flag(category) == 1))
                .alias('total_{}_{}_traits'.format(style_name, category))
            )

    # highest style reached overall and per category
    aggregations.append(
        F.max(F.when(F.col('is_trait_active') == 1, F.col('trait_style')))
        .alias('highest_active_trait_style')
    )
    for category in ['vertical', 'horizontal', 'defensive', 'offensive', 'utility', 'econ']:
        aggregations.append(
            F.max(F.when(flag(category) == 1, F.col('trait_style')))
            .alias('highest_{}_trait_style'.format(category))
        )

    trait_feats = traits_sdf.groupBy('metadata_match_id', 'puuid').agg(*aggregations).fillna(0)

    return trait_feats

### Unit & Item features

In [21]:
# Character ids accepted in lower-cased form, mapped to their SET13_UNITS key.
_UNIT_NAME_KEY_BY_LOWERCASE_ID = {
    'tft13_swain': 'TFT13_Swain',
    'tft13_elise': 'TFT13_Elise',
    'tft13_gangplank': 'TFT13_Gangplank',
    'tft13_jayce': 'TFT13_Jayce',
    'tft13_jinx': 'TFT13_Jinx',
}

# Character ids that are answered with a fixed display name instead of a
# SET13_UNITS lookup.
_FIXED_UNIT_NAMES = {
    'TFT13_Sion': 'Black Rose Sion',
    'TFT13_JayceSummon': "Jayce's summons",
}


def get_unit_name(d):
    """Return the display name for a raw unit record.

    `d` is expected to be a dict with a 'character_id' entry; any non-dict
    value returns None. Ids without a fixed name are looked up in SET13_UNITS,
    and an id missing there raises KeyError.
    """
    if not isinstance(d, dict):
        return None

    character_id = d.get('character_id')
    if character_id in _FIXED_UNIT_NAMES:
        return _FIXED_UNIT_NAMES[character_id]

    reference_key = _UNIT_NAME_KEY_BY_LOWERCASE_ID.get(character_id, character_id)
    return SET13_UNITS[reference_key]['name']


# Character ids accepted in lower-cased form, mapped to their SET13_UNITS key.
_UNIT_COST_KEY_BY_LOWERCASE_ID = {
    'tft13_swain': 'TFT13_Swain',
    'tft13_elise': 'TFT13_Elise',
    'tft13_gangplank': 'TFT13_Gangplank',
    'tft13_jayce': 'TFT13_Jayce',
    'tft13_jinx': 'TFT13_Jinx',
}

# Character ids that are given a cost of 0 instead of a SET13_UNITS lookup.
_ZERO_COST_UNIT_IDS = ('TFT13_Sion', 'TFT13_JayceSummon')


def get_unit_cost(d):
    """Return the shop cost (the reference table's 'tier') for a raw unit record.

    `d` is expected to be a dict with a 'character_id' entry; any non-dict
    value returns NaN. Ids that are not zero-cost are looked up in SET13_UNITS,
    and an id missing there raises KeyError.
    """
    if not isinstance(d, dict):
        return np.nan

    character_id = d.get('character_id')
    if character_id in _ZERO_COST_UNIT_IDS:
        return 0

    reference_key = _UNIT_COST_KEY_BY_LOWERCASE_ID.get(character_id, character_id)
    return SET13_UNITS[reference_key]['tier']
        

In [22]:
# function to extract unit characteristics
def extract_unit_characteristics(df_units):
    """Flatten the raw unit records and add class, item-count and contest features.

    Parameters
    ----------
    df_units : pandas.DataFrame
        One row per (match, player, unit) with columns 'metadata_match_id',
        'puuid' and 'units'. 'units' holds a dict with the keys 'character_id',
        'tier' and 'itemNames', or a non-dict placeholder when there is no unit.

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of a left merge) with the descriptive columns,
        0/1 class flags, per-category item counts and the three
        'players_contesting_unit*' columns. The descriptive, flag and count
        columns are also added to the frame passed in.
    """
    df_units['unit_name'] = df_units['units'].apply(get_unit_name)
    df_units['unit_items'] = df_units['units'].apply(
        lambda unit: unit.get('itemNames') if isinstance(unit, dict) else []
    )
    df_units['unit_item_count'] = df_units['unit_items'].apply(len)

    # unit cost to purchase in shop; star level; total cost = cost * 3^(star level - 1)
    df_units['unit_cost'] = df_units['units'].apply(get_unit_cost)
    df_units['unit_tier'] = df_units['units'].apply(
        lambda unit: unit.get('tier') if isinstance(unit, dict) else np.nan
    )
    df_units['unit_total_cost'] = df_units['unit_cost'] * 3**(df_units['unit_tier'] - 1)

    # damage type and unit class flags
    roster_flags = (
        ('is_ap_damage', AP_UNITS),
        ('is_ad_damage', AD_UNITS),
        ('is_carry', CARRY_UNITS),
        ('is_caster', CASTER_UNITS),
        ('is_tank', TANK_UNITS),
        ('is_reaper', REAPER_UNITS),
        ('is_fighter', FIGHTER_UNIT),
    )
    for column, roster in roster_flags:
        df_units[column] = df_units['unit_name'].isin(roster).astype(int)

    def _count_items(pool, prefix=None):
        # per-row number of held items found in `pool` (and starting with `prefix`, if given)
        def _count(items):
            return sum(
                1 for item in items
                if item in pool and (prefix is None or item.startswith(prefix))
            )
        return df_units['unit_items'].apply(_count)

    # item counts per category
    item_count_columns = (
        ('unit_component_item_count', COMPONENT_ITEMS, None),
        ('unit_combined_item_count', COMBINED_ITEMS, None),
        ('unit_radiant_item_count', RADIANT_ITEMS, None),
        ('unit_tactician_crown_item_count', TACTICIAN_CROWN_ITEMS, None),
        ('unit_utility_item_count', UTILITY_ITEMS, None),
        ('unit_artifact_item_count', ARTIFACT_ITEMS, None),
        ('unit_chembaron_item_count', CHEMBARON_ITEMS, None),
        ('unit_chembaron_bronze_item_count', CHEMBARON_ITEMS, 'TFT13_Crime_Bronze_'),
        ('unit_chembaron_silver_item_count', CHEMBARON_ITEMS, 'TFT13_Crime_Silver_'),
        ('unit_chembaron_gold_item_count', CHEMBARON_ITEMS, 'TFT13_Crime_Gold_'),
        ('unit_chembaron_prismatic_item_count', CHEMBARON_ITEMS, 'TFT13_Crime_Prismatic_'),
        ('unit_emblem_item_count', EMBLEM_ITEMS, None),
        ('unit_junkerking_upgrade_item_count', JUNKERKING_UPGRADE_ITEMS, None),
        ('unit_shimmerscale_item_count', SHIMMERSCALE_ITEMS, None),
        ('unit_ornn_item_count', ORNN_ITEMS, None),
    )
    for column, pool, prefix in item_count_columns:
        df_units[column] = _count_items(pool, prefix)

    # contesting unit features: pair every named unit with the same unit held
    # by the other players of the match
    named = df_units[df_units['unit_name'].notnull()]
    rivals = named[['metadata_match_id', 'puuid', 'unit_name', 'unit_cost', 'unit_tier']].copy()
    rivals = rivals.rename(columns={
        'unit_cost': 'unit_cost_other',
        'unit_tier': 'unit_tier_other',
        'puuid': 'puuid_other',
    })

    pairs = named.merge(rivals, on=['metadata_match_id', 'unit_name'], how='left')
    pairs = pairs[pairs['puuid'] != pairs['puuid_other']]
    # does the other player hold the unit at an equal / a higher star level?
    pairs['contesting_unit_tier_equal'] = (pairs['unit_tier_other'] == pairs['unit_tier']).astype(int)
    pairs['contesting_unit_tier_higher'] = (pairs['unit_tier_other'] > pairs['unit_tier']).astype(int)

    contest_keys = ['metadata_match_id', 'puuid', 'unit_name']
    unit_contest_features = pairs.groupby(contest_keys).agg(
        players_contesting_unit=('puuid_other', 'count'),
        players_contesting_unit_tier_equal=('contesting_unit_tier_equal', 'sum'),
        players_contesting_unit_tier_higher=('contesting_unit_tier_higher', 'sum'),
    ).reset_index()

    # uncontested units (and placeholder rows) get 0 for all three counts
    contest_columns = [
        'players_contesting_unit',
        'players_contesting_unit_tier_equal',
        'players_contesting_unit_tier_higher',
    ]
    df_units = df_units.merge(unit_contest_features, on=contest_keys, how='left')
    df_units[contest_columns] = df_units[contest_columns].fillna(0)
    return df_units

In [23]:
def create_unit_features(df_units):
    """Collapse per-unit rows into one Spark feature row per (match, player).

    Parameters
    ----------
    df_units : pandas.DataFrame
        One row per fielded unit. The ``units`` and ``unit_items`` columns are
        dropped before the frame is handed to Spark; every other column
        referenced below must be present.

    Returns
    -------
    pyspark.sql.DataFrame
        Grouped by ``metadata_match_id`` and ``puuid``. Contains a ``units``
        array of structs with the per-unit attributes, followed by board-level
        totals (unit counts by tier/cost, role counts, item-category counts).

    Notes
    -----
    Relies on the notebook-global ``spark`` session.
    """
    # Per-unit attributes kept inside the collected `units` array (order matters
    # because it defines the struct schema).
    struct_fields = [
        'unit_name',
        'unit_cost',
        'unit_tier',
        'unit_item_count',
        'players_contesting_unit',
        'players_contesting_unit_tier_equal',
        'players_contesting_unit_tier_higher',
        'is_ap_damage',
        'is_ad_damage',
        'is_carry',
        'is_caster',
        'is_tank',
        'is_reaper',
        'is_fighter',
        'unit_radiant_item_count',
        'unit_artifact_item_count',
        'unit_utility_item_count',
        'unit_emblem_item_count',
        'unit_ornn_item_count',
        'unit_chembaron_item_count',
        'unit_chembaron_gold_item_count',
        'unit_chembaron_prismatic_item_count',
    ]

    # (source flag column, output alias) for the role totals.
    role_totals = [
        ('is_ap_damage', 'total_ap_units'),
        ('is_ad_damage', 'total_ad_units'),
        ('is_carry', 'total_carry_units'),
        ('is_caster', 'total_caster_units'),
        ('is_tank', 'total_tank_units'),
        ('is_reaper', 'total_reaper_units'),
        ('is_fighter', 'total_fighter_units'),
    ]

    # Item categories: `unit_<category>_item_count` is summed into `total_<category>_items`.
    item_categories = [
        'component',
        'combined',
        'radiant',
        'tactician_crown',
        'utility',
        'artifact',
        'chembaron',
        'chembaron_bronze',
        'chembaron_silver',
        'chembaron_gold',
        'chembaron_prismatic',
        'emblem',
        'junkerking_upgrade',
        'shimmerscale',
        'ornn',
    ]

    def count_where(condition, alias):
        # Number of rows in the group for which `condition` holds.
        return F.sum(F.when(condition, 1).otherwise(0)).alias(alias)

    tier = F.col('unit_tier')
    cost = F.col('unit_cost')

    # Board-wide summary statistics.
    aggregations = [
        F.collect_list(F.struct(*[F.col(name) for name in struct_fields])).alias('units'),
        F.count('unit_name').alias('total_units'),
        F.sum('unit_item_count').alias('total_items'),
        F.max('unit_cost').alias('highest_unit_cost'),
        F.avg('unit_cost').alias('average_unit_cost'),
        F.sum('unit_total_cost').alias('total_board_cost'),
    ]

    # Unit counts per star level.
    aggregations += [count_where(tier == t, f'total_tier_{t}_units') for t in (4, 3, 2)]

    # 3-star units broken down by cost 1..6.
    aggregations += [
        count_where((tier == 3) & (cost == c), f'total_tier_3_{c}_cost_units')
        for c in range(1, 7)
    ]

    # 2-star units: costs 1 and 2 share one bucket, costs 3..6 get their own.
    aggregations.append(
        count_where((tier == 2) & cost.isin([1, 2]), 'total_tier_2_1_and_2_cost_units')
    )
    aggregations += [
        count_where((tier == 2) & (cost == c), f'total_tier_2_{c}_cost_units')
        for c in range(3, 7)
    ]

    # 1-star units are only tracked for costs 5 and 6.
    aggregations += [
        count_where((tier == 1) & (cost == c), f'total_tier_1_{c}_cost_units')
        for c in (5, 6)
    ]

    # Role and item-category totals.
    aggregations += [F.sum(source).alias(alias) for source, alias in role_totals]
    aggregations += [
        F.sum(f'unit_{category}_item_count').alias(f'total_{category}_items')
        for category in item_categories
    ]

    units_sdf = spark.createDataFrame(df_units.drop(columns=['units', 'unit_items']))
    return units_sdf.groupBy('metadata_match_id', 'puuid').agg(*aggregations)

## 1. Create features from data

In [24]:
# Slice of the match data to process. These three values are interpolated into
# the input file name and into every output file name written by this section.
REGION = 'kr'
TIER = 'GOLD'
DIVISION = 'I'

In [25]:
# Input parquet for the selected region/tier/division, located under MATCH_DIR.
path = os.path.join(MATCH_DIR, f'matches_{REGION}_{TIER}_{DIVISION}.parquet')

In [26]:
# Load the match file into pandas. In the preview, rows sharing a
# metadata_match_id differ by puuid, and `traits` / `units` hold lists of dicts.
df = pd.read_parquet(path)

df.head()

Unnamed: 0,metadata_data_version,metadata_match_id,info_endOfGameResult,info_gameCreation,info_game_datetime,info_game_length,info_game_version,info_mapId,info_queueId,info_tft_game_type,info_tft_set_core_name,info_tft_set_number,gold_left,last_round,level,partner_group_id,placement,players_eliminated,puuid,riotIdGameName,riotIdTagline,skill_tree,time_eliminated,total_damage_to_players,traits,units,win
0,6,KR_7491534598,GameComplete,1737915614000,1737917917153,2248.516357,Linux Version 15.2.652.1536 (Jan 23 2025/09:50...,22,1100,standard,TFTSet13,13,0,35,8,,4,2,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,토라미노,KR1,,2089.960205,110,"[{'name': 'TFT13_Bruiser', 'num_units': 1, 'st...","[{'character_id': 'TFT13_Draven', 'itemNames':...",True
1,6,KR_7491534598,GameComplete,1737915614000,1737917917153,2248.516357,Linux Version 15.2.652.1536 (Jan 23 2025/09:50...,22,1100,standard,TFTSet13,13,1,35,8,,3,1,pK2H19GGyoV-UcICU9AoGpNfc3eTm4aZ_oRs6DAcDGEv5s...,와 키,KR1,,2096.769287,120,"[{'name': 'TFT13_Ambassador', 'num_units': 1, ...","[{'character_id': 'TFT13_Amumu', 'itemNames': ...",True
2,6,KR_7491534598,GameComplete,1737915614000,1737917917153,2248.516357,Linux Version 15.2.652.1536 (Jan 23 2025/09:50...,22,1100,standard,TFTSet13,13,32,38,9,,1,2,lTYm9Wn5ub-OEAkzCERLKDr3b1az4RCVJurOlaujDAureG...,풍자이국주박나래화사,KR1,,2235.349121,199,"[{'name': 'TFT13_Ambassador', 'num_units': 1, ...","[{'character_id': 'TFT13_Camille', 'itemNames'...",True
3,6,KR_7491534598,GameComplete,1737915614000,1737917917153,2248.516357,Linux Version 15.2.652.1536 (Jan 23 2025/09:50...,22,1100,standard,TFTSet13,13,0,30,7,,6,0,8c4Y48OU3vWU6TgPNqxhHCnmPnZx9xkIAMsMaT2eOaddft...,넌패배자야,KR1,,1793.041138,71,"[{'name': 'TFT13_Academy', 'num_units': 1, 'st...","[{'character_id': 'TFT13_Morgana', 'itemNames'...",False
4,6,KR_7491534598,GameComplete,1737915614000,1737917917153,2248.516357,Linux Version 15.2.652.1536 (Jan 23 2025/09:50...,22,1100,standard,TFTSet13,13,1,26,8,,8,0,c5C9pDtuarWL2BYhp8QEk4mUHyWWSfKU8MZ0VQDCZEkjlk...,tjcodnjs,9876,,1567.391846,28,"[{'name': 'TFT13_Academy', 'num_units': 2, 'st...","[{'character_id': 'TFT13_Irelia', 'itemNames':...",False


In [27]:
# Split the loaded frame into three frames: match stats, traits and units.
# separate_data is defined outside this section of the notebook.
df_stats, df_traits, df_units = separate_data(df)

In [28]:
df_stats.shape, df_traits.shape, df_units.shape

((84688, 10), (931823, 3), (725108, 3))

#### Match stats

In [29]:
df_stats.head(8)

Unnamed: 0,metadata_match_id,puuid,info_game_length,gold_left,last_round,level,placement,players_eliminated,total_damage_to_players,win
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,2248.516357,0,35,8,4,2,110,True
1,KR_7491534598,pK2H19GGyoV-UcICU9AoGpNfc3eTm4aZ_oRs6DAcDGEv5s...,2248.516357,1,35,8,3,1,120,True
2,KR_7491534598,lTYm9Wn5ub-OEAkzCERLKDr3b1az4RCVJurOlaujDAureG...,2248.516357,32,38,9,1,2,199,True
3,KR_7491534598,8c4Y48OU3vWU6TgPNqxhHCnmPnZx9xkIAMsMaT2eOaddft...,2248.516357,0,30,7,6,0,71,False
4,KR_7491534598,c5C9pDtuarWL2BYhp8QEk4mUHyWWSfKU8MZ0VQDCZEkjlk...,2248.516357,1,26,8,8,0,28,False
5,KR_7491534598,nZ5aEmlWcnzwHLRoBd-tbVPBTCdTVoyCr6sZsMOJH08MP2...,2248.516357,1,38,8,2,3,176,True
6,KR_7491534598,CwDmfkLvZv919GdH43ZAf3BLLo7p25DxLcl-hQ8BIlqW-Y...,2248.516357,0,27,7,7,0,38,False
7,KR_7491534598,PSkXZLpJMIh9ru9Yc0WL6kO5d3NwPfQNMpnMnhO72GgYwX...,2248.516357,0,33,8,5,0,79,False


In [30]:
# Add per-match max/min/avg and per-player difference features for level,
# players eliminated, total damage to players, and gold left.
# NOTE(review): df_stats is overwritten with its own transform, so re-running
# this cell feeds the already-augmented frame back in — confirm that is safe.
df_stats = create_match_stats(df_stats)

In [31]:
df_stats.head(8)

Unnamed: 0,metadata_match_id,puuid,info_game_length,gold_left,last_round,level,placement,players_eliminated,total_damage_to_players,win,max_level_by_match,min_level_by_match,avg_level_by_match,diff_max_level,diff_min_level,max_players_eliminated_by_match,min_players_eliminated_by_match,avg_players_eliminated_by_match,diff_max_players_eliminated,diff_min_players_eliminated,max_total_damage_to_players_by_match,min_total_damage_to_players_by_match,avg_total_damage_to_players_by_match,diff_max_total_damage_to_players,diff_min_total_damage_to_players,max_gold_left_by_match,min_gold_left_by_match,avg_gold_left_by_match,diff_max_gold_left,diff_min_gold_left
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,2248.516357,0,35,8,4,2,110,True,9,7,7.875,1,1,3,0,1.0,1,2,199,28,102.625,89,82,32,0,4.375,32,0
1,KR_7491534598,pK2H19GGyoV-UcICU9AoGpNfc3eTm4aZ_oRs6DAcDGEv5s...,2248.516357,1,35,8,3,1,120,True,9,7,7.875,1,1,3,0,1.0,2,1,199,28,102.625,79,92,32,0,4.375,31,1
2,KR_7491534598,lTYm9Wn5ub-OEAkzCERLKDr3b1az4RCVJurOlaujDAureG...,2248.516357,32,38,9,1,2,199,True,9,7,7.875,0,2,3,0,1.0,1,2,199,28,102.625,0,171,32,0,4.375,0,32
3,KR_7491534598,8c4Y48OU3vWU6TgPNqxhHCnmPnZx9xkIAMsMaT2eOaddft...,2248.516357,0,30,7,6,0,71,False,9,7,7.875,2,0,3,0,1.0,3,0,199,28,102.625,128,43,32,0,4.375,32,0
4,KR_7491534598,c5C9pDtuarWL2BYhp8QEk4mUHyWWSfKU8MZ0VQDCZEkjlk...,2248.516357,1,26,8,8,0,28,False,9,7,7.875,1,1,3,0,1.0,3,0,199,28,102.625,171,0,32,0,4.375,31,1
5,KR_7491534598,nZ5aEmlWcnzwHLRoBd-tbVPBTCdTVoyCr6sZsMOJH08MP2...,2248.516357,1,38,8,2,3,176,True,9,7,7.875,1,1,3,0,1.0,0,3,199,28,102.625,23,148,32,0,4.375,31,1
6,KR_7491534598,CwDmfkLvZv919GdH43ZAf3BLLo7p25DxLcl-hQ8BIlqW-Y...,2248.516357,0,27,7,7,0,38,False,9,7,7.875,2,0,3,0,1.0,3,0,199,28,102.625,161,10,32,0,4.375,32,0
7,KR_7491534598,PSkXZLpJMIh9ru9Yc0WL6kO5d3NwPfQNMpnMnhO72GgYwX...,2248.516357,0,33,8,5,0,79,False,9,7,7.875,1,1,3,0,1.0,3,0,199,28,102.625,120,51,32,0,4.375,32,0


In [32]:
df_stats.shape

(84688, 30)

In [33]:
# Save match stats as a single parquet file via pandas.
# NOTE(review): MATCH_STATS_DIR is not defined in this section — confirm it is
# set earlier in the notebook and that the directory exists.
df_stats.to_parquet(os.path.join(MATCH_STATS_DIR, f'match_stats_{REGION}_{TIER}_{DIVISION}.parquet'))

#### Traits

In [29]:
df_traits.head()

Unnamed: 0,metadata_match_id,puuid,traits
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'name': 'TFT13_Bruiser', 'num_units': 1, 'sty..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'name': 'TFT13_Cabal', 'num_units': 1, 'style..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'name': 'TFT13_Crime', 'num_units': 1, 'style..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'name': 'TFT13_Experiment', 'num_units': 1, '..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'name': 'TFT13_FormSwapper', 'num_units': 2, ..."


In [30]:
df_traits.shape

(931823, 3)

In [31]:
# Expand each trait row into explicit columns (the frame goes from 3 to 20 columns).
# NOTE(review): df_traits is overwritten with its own transform, so re-running
# this cell feeds the expanded frame back in — confirm that is safe.
df_traits = extract_trait_characteristics(df_traits)

In [32]:
df_traits.shape

(931823, 20)

In [33]:
# Build the trait feature table; the result is a Spark DataFrame (it is saved
# through the Spark writer API). create_trait_features is defined outside this section.
trait_features = create_trait_features(df_traits)

In [34]:
# Persist trait features with Spark, replacing any previous output at this path.
# NOTE(review): TRAIT_FEATURES_DIR is not defined in this section — confirm it
# is set earlier in the notebook.
trait_features.write.mode('overwrite').parquet(os.path.join(TRAIT_FEATURES_DIR, f'trait_features_{REGION}_{TIER}_{DIVISION}.parquet'))

25/03/26 22:31:37 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/03/26 22:31:38 WARN TaskSetManager: Stage 0 contains a task of very large size (8565 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

#### Units & Items

In [42]:
df_units.head()

Unnamed: 0,metadata_match_id,puuid,units
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Draven', 'itemNames': ..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Urgot', 'itemNames': [..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Gangplank', 'itemNames..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'tft13_elise', 'itemNames': [..."
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Vi', 'itemNames': [], ..."


In [43]:
# Expand each unit row into name, cost, tier, role flags, item-category counts
# and contesting-player counts.
# NOTE(review): df_units is overwritten with its own transform, so re-running
# this cell feeds the expanded frame back in — confirm that is safe.
df_units = extract_unit_characteristics(df_units)

In [44]:
df_units.head()

Unnamed: 0,metadata_match_id,puuid,units,unit_name,unit_items,unit_item_count,unit_cost,unit_tier,unit_total_cost,is_ap_damage,is_ad_damage,is_carry,is_caster,is_tank,is_reaper,is_fighter,unit_component_item_count,unit_combined_item_count,unit_radiant_item_count,unit_tactician_crown_item_count,unit_utility_item_count,unit_artifact_item_count,unit_chembaron_item_count,unit_chembaron_bronze_item_count,unit_chembaron_silver_item_count,unit_chembaron_gold_item_count,unit_chembaron_prismatic_item_count,unit_emblem_item_count,unit_junkerking_upgrade_item_count,unit_shimmerscale_item_count,unit_ornn_item_count,players_contesting_unit,players_contesting_unit_tier_equal,players_contesting_unit_tier_higher
0,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Draven', 'itemNames': ...",Draven,[],0,1,2,3,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
1,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Urgot', 'itemNames': [...",Urgot,[],0,2,2,6,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Gangplank', 'itemNames...",Gangplank,"[TFT_Item_ZekesHerald, TFT_Item_LastWhisper, T...",3,3,2,9,0,1,0,1,0,0,1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
3,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'tft13_elise', 'itemNames': [...",Elise,"[TFT_Item_Redemption, TFT_Item_BrambleVest, TF...",3,4,1,4,1,0,0,1,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0
4,KR_7491534598,20nK8YPS-5BL-ca5mgsX3HWeGpsWLdtYzWUs9MRWSLGr0K...,"{'character_id': 'TFT13_Vi', 'itemNames': [], ...",Vi,[],0,4,2,12,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0


In [45]:
# Aggregate the per-unit rows into one Spark row per (metadata_match_id, puuid).
unit_features = create_unit_features(df_units)

In [46]:
# Sanity check: the row count should equal the number of rows in df_stats.
# NOTE(review): count() runs the whole aggregation and unit_features is not
# cached, so the parquet write that follows recomputes it.
unit_features.count()

25/03/17 17:23:25 WARN TaskSetManager: Stage 9 contains a task of very large size (7090 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

84688

In [47]:
# Persist unit/item features with Spark, replacing any previous output at this path.
# NOTE(review): UNIT_ITEM_FEATURES_DIR is not defined in this section — confirm
# it is set earlier in the notebook.
unit_features.write.mode('overwrite').parquet(os.path.join(UNIT_ITEM_FEATURES_DIR, f'unit_item_features_{REGION}_{TIER}_{DIVISION}.parquet'))

25/03/17 17:23:32 WARN TaskSetManager: Stage 15 contains a task of very large size (7090 KiB). The maximum recommended task size is 1000 KiB.
                                                                                