# Dataset Preprocessing

## Importing libraries

In [53]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

from neuralol.constants import Role, StatsCols, NewFeature
from neuralol.data import get_model_players_game_features

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data

In [54]:
# Read the raw per-player, per-game statistics (path is relative to the notebook).
raw_stats_csv = '../data/raw/game_players_stats.csv'
df_players_game_stats = pd.read_csv(raw_stats_csv)

# Quick look at the size and the first rows of the raw table.
print(f"{df_players_game_stats.shape[0]} rows and {df_players_game_stats.shape[1]} columns")
display(df_players_game_stats.head(10))

# Summary statistics of the numeric columns, including tail percentiles
# that help spot outliers.
summary_percentiles = [.01, .05, .25, .5, .75, .95, .99]
df_players_game_stats.describe(percentiles=summary_percentiles)

374554 rows and 28 columns


Unnamed: 0,game_id,player_id,player_name,team_id,team_name,team_acronym,role,win,game_length,champion_name,team_kills,tower_kills,inhibitor_kills,dragon_kills,herald_kills,baron_kills,player_kills,player_deaths,player_assists,total_minions_killed,gold_earned,level,total_damage_dealt,total_damage_dealt_to_champions,total_damage_taken,wards_placed,largest_killing_spree,largest_multi_kill
0,15,0,shanji,0,Legend Esport Gaming,LEG,Top,False,1415,Irelia,7,3,0,0,0,0,2,5,1,179,8530,12,99007,7923,15326,8,0,1
1,15,1,Demon,1,Bilibili Gaming Junior,BLGJ,Top,True,1415,Vladimir,17,8,1,3,1,1,2,3,6,174,8565,14,100342,10857,16475,11,2,1
2,15,2,yanxuan,0,Legend Esport Gaming,LEG,Bot,False,1415,Kai'Sa,7,3,0,0,0,0,2,2,1,227,9613,12,116407,7011,5788,9,0,1
3,15,3,Viod,0,Legend Esport Gaming,LEG,Support,False,1415,Lux,7,3,0,0,0,0,0,4,2,19,5442,10,23555,4932,6151,25,0,0
4,15,4,HanXuan,1,Bilibili Gaming Junior,BLGJ,Mid,True,1415,Aatrox,17,8,1,3,1,1,4,2,4,188,10125,14,125022,10749,15481,10,3,2
5,15,5,Yui,1,Bilibili Gaming Junior,BLGJ,Support,True,1415,Alistar,17,8,1,3,1,1,1,1,11,45,7615,12,27332,2889,8318,25,0,1
6,15,6,Chieftain,1,Bilibili Gaming Junior,BLGJ,Jungle,True,1415,Gragas,17,8,1,3,1,1,5,0,5,129,9686,13,121081,7263,18191,19,5,1
7,15,7,Virus,1,Bilibili Gaming Junior,BLGJ,Bot,True,1415,Xayah,17,8,1,3,1,1,5,1,3,249,12122,14,147097,9059,8442,9,3,2
8,15,8,Assassin,0,Legend Esport Gaming,LEG,Mid,False,1415,Akali,7,3,0,0,0,0,3,5,0,193,9434,13,114025,11373,18390,6,2,1
9,15,9,CatJug,0,Legend Esport Gaming,LEG,Jungle,False,1415,Sejuani,7,3,0,0,0,0,0,1,4,167,8511,13,128180,5912,16508,6,0,0


Unnamed: 0,game_id,player_id,team_id,game_length,team_kills,tower_kills,inhibitor_kills,dragon_kills,herald_kills,baron_kills,player_kills,player_deaths,player_assists,total_minions_killed,gold_earned,level,total_damage_dealt,total_damage_dealt_to_champions,total_damage_taken,wards_placed,largest_killing_spree,largest_multi_kill
count,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0,374554.0
mean,18729.401448,1549.469631,283.521236,2376.872,13.915163,6.018486,0.895227,2.262723,0.885904,0.696305,2.783038,2.788671,6.393858,204.271643,11472.20518,14.789699,155852.709826,13792.472418,19392.803756,20.362722,1.762427,1.060029
std,10813.836519,1234.016945,222.844447,32027.94,7.267931,3.61885,1.065168,1.400929,0.759774,0.741653,2.683247,1.947362,4.446742,105.84433,3389.038142,2.1846,92223.609967,8750.449083,9394.270675,18.232569,2.250181,0.833039
min,0.0,0.0,0.0,806.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,374.0,33.0,5.0,1299.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,5316.0,10.0,11445.53,2087.0,4978.0,0.0,0.0,0.0
5%,1872.0,120.0,25.0,1435.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,28.0,6472.0,11.0,15928.0,3327.65,7587.0,6.0,0.0,0.0
25%,9363.0,480.0,86.0,1682.0,8.0,3.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,147.0,8911.0,13.0,102675.75,7145.0,12421.0,10.0,0.0,1.0
50%,18731.0,1221.0,221.0,1873.0,14.0,7.0,1.0,2.0,1.0,1.0,2.0,3.0,6.0,220.0,11243.0,15.0,158449.0,12180.0,17574.0,14.0,0.0,1.0
75%,28094.0,2409.0,449.0,2111.0,19.0,9.0,1.0,3.0,1.0,1.0,4.0,4.0,9.0,277.0,13635.0,16.0,209655.5,18340.0,24689.0,21.0,3.0,1.0
95%,35586.0,3965.0,699.0,2531.0,26.0,11.0,3.0,4.0,2.0,2.0,8.0,6.0,15.0,360.0,17422.0,18.0,309147.1,30181.35,36953.0,61.0,6.0,3.0


In [55]:
# Cardinality of the main identifier columns
print(f"Number of different games: {df_players_game_stats[StatsCols.GAME_ID].nunique()}")
print(f"Number of different players: {df_players_game_stats[StatsCols.PLAYER_ID].nunique()}")
print(f"Number of different teams: {df_players_game_stats[StatsCols.TEAM_ID].nunique()}")

# Null count per column; only columns that actually contain nulls are printed
print("\nMissing values in players_game_stats:")
missing_values = df_players_game_stats.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

# Same information expressed as a percentage of the total number of rows
print("\nPercentage of missing values:")
percent_missing = missing_values.div(len(df_players_game_stats)).mul(100)
print(percent_missing.loc[percent_missing.gt(0)])

Number of different games: 37459
Number of different players: 4953
Number of different teams: 844

Missing values in players_game_stats:
team_acronym    290
dtype: int64

Percentage of missing values:
team_acronym    0.077425
dtype: float64


## Missing values & outliers

In [56]:
# team_acronym is the only column with missing values and does not provide
# any useful information, so it is dropped rather than imputed.
processed_players_game_stats = df_players_game_stats.drop(
    [StatsCols.TEAM_ACRONYM],
    axis="columns",
)

In [57]:
# Outlier thresholds: rows above either bound are discarded.
max_game_length_seconds = 3600  # 1 hour
max_wards_placed = 120

# Row-wise masks; a row is kept only when it satisfies both bounds.
# NOTE(review): the wards filter acts on individual player rows, so a game may
# end up with only part of its players — confirm downstream code tolerates this.
within_game_length = processed_players_game_stats[StatsCols.GAME_LENGTH].le(max_game_length_seconds)
within_wards_placed = processed_players_game_stats[StatsCols.WARDS_PLACED].le(max_wards_placed)
processed_players_game_stats = processed_players_game_stats.loc[within_game_length & within_wards_placed]

# Re-check the distribution of the two filtered columns.
processed_players_game_stats[[StatsCols.GAME_LENGTH, StatsCols.WARDS_PLACED]].describe(
    percentiles=[.01, .05, .25, .5, .75, .95, .99]
)

Unnamed: 0,game_length,wards_placed
count,374079.0,374079.0
mean,1914.158151,20.259432
std,334.942078,17.916448
min,806.0,0.0
1%,1299.0,0.0
5%,1435.0,6.0
25%,1681.0,10.0
50%,1872.0,13.0
75%,2110.0,21.0
95%,2527.0,60.0


## One-hot encoding

The `object`/`str` columns (player name, champion name, ...) are metadata rather than information that helps determine whether the player won the game, so no encoding is needed.

## Feature creation & selection

In [58]:
# Work on a copy so the filtered frame from the previous section stays untouched.
df_processed = processed_players_game_stats.copy()

# KLA ratio: (kills + assists) / (deaths + 1); the +1 keeps the ratio finite
# when a player has no deaths.
kills_plus_assists = df_processed[StatsCols.PLAYER_KILLS] + df_processed[StatsCols.PLAYER_ASSISTS]
df_processed[NewFeature.KLA] = kills_plus_assists / (df_processed[StatsCols.PLAYER_DEATHS] + 1)

# Objective control score: dragons and heralds weigh 10 each, barons weigh 50.
df_processed[NewFeature.OBJECTIVES_CONTROL] = (
    df_processed[StatsCols.DRAGON_KILLS] * 10
    + df_processed[StatsCols.HERALD_KILLS] * 10
    + df_processed[StatsCols.BARON_KILLS] * 50
)

# Tower control score: towers weigh 5, inhibitors weigh 20.
df_processed[NewFeature.TOWER_CONTROL] = (
    df_processed[StatsCols.TOWER_KILLS] * 5
    + df_processed[StatsCols.INHIBITOR_KILLS] * 20
)

# Per-minute rates: every raw total below is divided by the game duration in
# minutes (game_length / 60), computed once and reused.
game_minutes = df_processed[StatsCols.GAME_LENGTH] / 60
per_minute_features = [
    (NewFeature.GOLD_PER_MIN, StatsCols.GOLD_EARNED),
    (NewFeature.LEVEL_PER_MIN, StatsCols.LEVEL),
    (NewFeature.CS_PER_MIN, StatsCols.TOTAL_MINIONS_KILLED),
    (NewFeature.TEAM_KILLS_PER_MIN, StatsCols.TEAM_KILLS),
    (NewFeature.TOTAL_DAMAGE_PER_MIN, StatsCols.TOTAL_DAMAGE_DEALT),
    (NewFeature.DAMAGE_TO_CHAMPIONS_PER_MIN, StatsCols.TOTAL_DAMAGE_DEALT_TO_CHAMPIONS),
    (NewFeature.DAMAGE_TAKEN_PER_MIN, StatsCols.TOTAL_DAMAGE_TAKEN),
]
for rate_column, total_column in per_minute_features:
    df_processed[rate_column] = df_processed[total_column] / game_minutes

# Keep the identifying/metadata columns followed by the model input features.
model_input_features = get_model_players_game_features()
identity_columns = [
    StatsCols.GAME_ID,
    StatsCols.PLAYER_ID,
    StatsCols.PLAYER_NAME,
    StatsCols.TEAM_ID,
    StatsCols.TEAM_NAME,
    StatsCols.ROLE,
    StatsCols.WIN,
    StatsCols.CHAMPION_NAME,
]
df_processed = df_processed[identity_columns + model_input_features]
print(f"Processed columns: {df_processed.columns.tolist()}")

Processed columns: ['game_id', 'player_id', 'player_name', 'team_id', 'team_name', 'role', 'win', 'champion_name', 'kla', 'objectives_control', 'tower_control', 'gold_per_min', 'level_per_min', 'team_kills_per_min', 'largest_killing_spree', 'largest_multi_kill', 'cs_per_min', 'total_damage_per_min', 'damage_to_champions_per_min', 'damage_taken_per_min']


## Numerical features standardization

In [59]:
# Z-score standardization of the model input features: (x - mean) / std.
# Only the feature columns are rescaled; identifier/metadata columns are copied as-is.
# NOTE(review): mean and std are computed over the whole dataset; if a train/test
# split happens later, confirm that fitting the statistics here is intended.
feature_values = df_processed[model_input_features]
feature_means = feature_values.mean()
feature_stds = feature_values.std()

df_standardized = df_processed.copy()
df_standardized[model_input_features] = (feature_values - feature_means) / feature_stds

df_standardized.describe()

Unnamed: 0,game_id,player_id,team_id,kla,objectives_control,tower_control,gold_per_min,level_per_min,team_kills_per_min,largest_killing_spree,largest_multi_kill,cs_per_min,total_damage_per_min,damage_to_champions_per_min,damage_taken_per_min
count,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0,374079.0
mean,18726.951107,1549.50619,283.498266,-1.155243e-16,-8.783037000000001e-17,-2.6440280000000003e-17,-1.394953e-16,1.445098e-15,1.115354e-16,1.4739700000000002e-17,-5.667946e-17,6.6860490000000004e-18,-2.103066e-16,3.2822420000000004e-17,4.3763230000000005e-17
std,10812.926044,1234.081213,222.854506,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,0.0,0.0,-0.9791201,-1.388977,-1.273804,-2.820607,-6.326694,-1.818931,-0.7835322,-1.272785,-2.07752,-1.934243,-1.762393,-2.339467
25%,9360.0,480.0,86.0,-0.7135088,-0.9696955,-0.8755363,-0.7208121,-0.7090833,-0.7755431,-0.7835322,-0.07243874,-0.4944906,-0.4364736,-0.7983969,-0.770615
50%,18728.0,1221.0,221.0,-0.3416529,0.07850847,0.1865119,0.02627638,0.009767343,-0.1119088,-0.7835322,-0.07243874,0.2910976,0.1507312,-0.1316099,-0.1821147
75%,28087.5,2407.0,449.0,0.3489365,0.7074309,0.717536,0.7054868,0.6919078,0.6337479,0.5494325,-0.07243874,0.75596,0.6497438,0.6112394,0.6787213
max,37458.0,4952.0,843.0,8.582887,5.529169,5.496753,6.805512,6.575711,9.493347,8.991542,4.728945,3.135712,5.69235,7.389783,7.940019


## Save preprocessed data

In [60]:
import os

# Output directory for the preprocessed datasets; created on first run.
preprocessed_dir = '../data/preprocessed'
os.makedirs(preprocessed_dir, exist_ok=True)


def _write_csv(frame, filename):
    """Write `frame` (without its index) to `filename` inside `preprocessed_dir` and return the path."""
    target_path = os.path.join(preprocessed_dir, filename)
    frame.to_csv(target_path, index=False)
    return target_path


# Processed (unscaled) features
processed_filename = 'processed_game_players_stats.csv'
processed_file_path = _write_csv(df_processed, processed_filename)
print(f"Processed dataset saved to {processed_file_path}")

# Standardized features
standardized_filename = 'standardized_game_players_stats.csv'
standardized_file_path = _write_csv(df_standardized, standardized_filename)
print(f"Standardized dataset saved to {standardized_file_path}")

Processed dataset saved to ../data/preprocessed\processed_game_players_stats.csv
Standardized dataset saved to ../data/preprocessed\standardized_game_players_stats.csv
