In [13]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from ydata_profiling import ProfileReport
import src.dataHandling.cleaningUtils as clean
import os
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
import re


In [2]:
# Load the raw static dataset. If the relative path cannot be resolved from the
# current working directory, move one directory up and retry.
# NOTE(review): pickle.load executes arbitrary code from the file - only load trusted data.
try:
    with open("data/static_05_12_23/raw/static_full.pkl", "rb") as f:
        df = pickle.load(f)
except FileNotFoundError:
    os.chdir('../')
    with open("data/static_05_12_23/raw/static_full.pkl", "rb") as f:
        df = pickle.load(f)

# Cleaning pipeline: every step receives the frame produced by the previous one.
df = (
    clean.drop_wrong_data(df)
    .reset_index(drop=True)
    .pipe(clean.fix_rank)
    .pipe(clean.calc_winrate)
    .pipe(clean.fix_teamId)
    .pipe(clean.convert_booleans)
    .pipe(clean.convert_lastPlayTime)
    .pipe(clean.convert_championTier)
    .pipe(clean.get_winning_team)  # this has to be the last step where a column is inserted
    .pipe(clean.drop_wrong_teamIds)
    .pipe(clean.drop_wrong_wins)
    .pipe(clean.drop_irrelevant)
    .pipe(clean.drop_missing)
)

# The positional split below relies on the target being the last column.
assert df.columns[-1] == 'label'

# 90/10 train/test split, followed by a 90/10 train/validation split of the training part.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1], df.iloc[:, -1], test_size=0.1, random_state=42, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, shuffle=True)

found 51360 rows
dropped wrong mapId
dropped wrong queueId
dropped wrong gameDuration
dropped wrong seasonId
dropped wrong gameVersion
dropped wrong patch
dropped 0 wrong rows
dropped 51 rows because of wrong teamIds
dropped 0 rows because of wrong wins
dropped 25835 rows


In [3]:
# Show the cleaned frame as the cell's output (bare last expression -> rich display).
df

Unnamed: 0,participant1_level,participant1_champion_championNumber,participant1_champion_infoAttack,participant1_champion_infoDefense,participant1_champion_infoMagic,participant1_champion_infoDifficulty,participant1_champion_tier,participant1_champion_win_rate,participant1_champion_pick_rate,participant1_champion_ban_rate,...,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate,label
3,841,54,5,9,7,2,5,51.50,15.2,15.2,...,0.541528,0.524000,0.653846,0.511971,0.506912,0.553476,0.525275,0.468750,0.508163,1
4,766,24,7,5,7,5,0,48.49,16.8,16.8,...,0.517751,0.536443,0.504360,0.562264,0.368421,0.666667,0.428571,0.583333,0.622449,0
7,188,54,5,9,7,2,5,51.50,15.2,15.2,...,0.513274,0.528000,0.515038,0.504274,0.505882,0.529586,1.000000,0.519846,0.539648,0
8,54,24,7,5,7,5,0,48.49,16.8,16.8,...,0.510870,0.562189,0.508333,0.555556,0.552239,0.541528,0.538462,0.457143,0.568182,0
10,54,24,7,5,7,5,0,48.49,16.8,16.8,...,0.538462,0.534591,0.505263,0.554545,0.489130,0.342857,0.529049,0.777778,0.524194,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51312,641,268,6,3,8,9,0,48.03,0.7,0.7,...,0.707317,0.530303,0.517312,0.481013,0.490909,0.512048,0.676923,0.535294,0.534562,1
51314,392,887,7,4,5,5,2,50.80,2.2,2.2,...,0.559809,0.517808,0.916667,0.700000,0.522727,0.538206,0.565657,0.515695,0.800000,0
51342,77,122,9,5,1,2,5,50.21,17.9,17.9,...,0.571429,0.491379,0.508321,0.502513,0.666667,0.440000,0.464481,0.533333,0.517766,0
51357,700,166,0,0,0,0,4,51.17,6.1,6.1,...,1.000000,0.546667,0.380952,0.575758,0.584795,0.546243,0.666667,0.523923,0.494350,0


In [4]:
# Small sample for experimenting: the first ten training rows, restricted to the
# champion win rate, team id and tier of participants 1 and 2, re-indexed from 0.
df_experiment = (
    X_train[[
        'participant1_champion_win_rate', 'participant1_teamId',
        'participant2_champion_win_rate', 'participant2_teamId',
        'participant1_tier', 'participant2_tier',
    ]]
    .head(10)
    .reset_index(drop=True)
)
df_experiment

Unnamed: 0,participant1_champion_win_rate,participant1_teamId,participant2_champion_win_rate,participant2_teamId,participant1_tier,participant2_tier
0,49.75,0,51.53,0,9.1,8.1
1,52.06,0,54.12,0,9.1,7.1
2,48.49,0,51.0,0,8.1,7.1
3,51.6,0,52.18,0,7.1,7.1
4,50.21,0,50.7,0,8.1,7.1
5,50.8,0,51.53,0,8.1,3.2
6,50.41,0,53.24,0,3.1,7.1
7,48.49,0,49.15,0,7.1,7.1
8,50.53,0,50.98,0,9.1,8.1
9,46.64,0,50.98,0,7.1,7.1


In [5]:
def merge_columns(df: pd.DataFrame) -> dict:
    """
    Merges columns of the form participant<x>_<col> into (up to) two Series per category, one for each team.
    Uses the participant<x>_teamId columns to determine the team of each participant (0 -> team1, 1 -> team2).

    The team is looked up row by row, so a participant slot that is on different teams in different
    rows contributes to the correct team average in every row. When a participant is on the same team
    in all rows the result is the plain row-wise mean of that team's columns.

    Columns whose category is 'teamId' or 'champion_championNumber' are skipped, as are participant
    columns without a matching participant<x>_teamId column and columns that do not follow the
    participant<x>_<col> naming scheme.

    :param df: DataFrame containing the columns to be merged.
    :return: Dictionary mapping "<col>_team1" / "<col>_team2" to a Series (indexed like df) holding the
             row-wise mean over the participants of that team. A key is only present if at least one
             participant belongs to that team in at least one row; an empty frame yields an empty dict.
    """
    # raw string: "\d" / "\w" are invalid escape sequences in a normal string literal
    participant_pattern = re.compile(r"participant(\d+)_(\w+)")
    # cols that are not to be merged as they are categorical, so averaging them does not make sense
    cols_left = ('teamId', 'champion_championNumber')
    # output suffix -> value of the teamId column that identifies the team
    team_ids = {'team1': 0, 'team2': 1}

    merged_columns = {}
    for col in df.columns:
        matches = participant_pattern.search(str(col))
        if not matches or matches.group(2) in cols_left:
            continue
        participant_number, col_type = matches.groups()
        per_team = merged_columns.setdefault(col_type, {team: [] for team in team_ids})

        # Determine the team of the participant
        team_col = f'participant{participant_number}_teamId'
        if team_col not in df.columns:
            continue  # team unknown -> the column cannot be assigned to either side

        for team, team_id in team_ids.items():
            on_team = df[team_col] == team_id
            if not on_team.any():
                continue
            # Rows where the participant plays for the other team become NaN and are
            # ignored by the (NaN-skipping) mean below. Fully matching columns are used as-is.
            per_team[team].append(df[col] if on_team.all() else df[col].where(on_team))

    merged_series = {}
    for col_type, teams_data in merged_columns.items():
        for team, data_list in teams_data.items():
            if data_list:  # Only proceed if there are columns to merge
                merged_series[f"{col_type}_{team}"] = pd.concat(data_list, axis=1).mean(axis=1)

    return merged_series

In [6]:
# Assemble the per-team Series returned by merge_columns into one frame (dict keys become column names).
df_merged = pd.DataFrame(merge_columns(df))

In [11]:
# Show the team-level feature frame as the cell's output.
df_merged

Unnamed: 0,level_team1,level_team2,champion_infoAttack_team1,champion_infoAttack_team2,champion_infoDefense_team1,champion_infoDefense_team2,champion_infoMagic_team1,champion_infoMagic_team2,champion_infoDifficulty_team1,champion_infoDifficulty_team2,...,maxKills_team1,maxKills_team2,cs_team1,cs_team2,damage_team1,damage_team2,gold_team1,gold_team2,winrate_team1,winrate_team2
3,275.4,307.6,5.2,5.4,5.2,3.6,5.4,5.2,5.0,5.8,...,11.4,12.2,138.32,158.58,16868.4,20872.6,10661.4,11567.8,0.543806,0.512515
4,699.4,538.2,6.0,6.0,3.8,3.8,5.4,4.2,4.8,6.6,...,11.0,13.4,134.66,145.44,19998.0,19525.8,11366.8,10758.6,0.533784,0.533888
7,544.8,560.2,7.0,8.4,5.0,3.6,4.8,3.4,4.6,7.6,...,12.6,13.4,142.20,177.44,19557.0,21106.8,11767.2,11630.0,0.522247,0.618992
8,364.2,405.2,5.4,5.8,4.8,4.6,6.2,4.8,5.8,5.2,...,12.0,10.8,158.64,149.40,18130.8,16636.2,10773.8,10792.4,0.555881,0.531511
10,524.6,502.6,5.4,7.4,3.6,4.2,5.6,4.4,5.6,5.2,...,9.8,10.4,163.16,147.80,18453.0,19359.0,11611.0,11132.8,0.555064,0.532602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51312,626.2,269.8,4.4,6.0,5.0,5.4,6.2,3.8,6.4,5.8,...,12.2,11.2,156.26,156.74,19061.6,17401.8,10653.4,10826.4,0.569294,0.549947
51314,307.8,513.6,6.4,5.8,5.0,4.4,4.2,5.0,5.2,5.6,...,11.4,12.8,156.60,169.72,18495.6,18986.6,11251.6,11443.4,0.649495,0.588457
51342,680.8,239.8,4.6,6.6,6.0,3.8,5.6,5.0,4.8,6.0,...,14.8,11.0,150.08,155.64,19370.8,21313.0,10960.6,11867.6,0.513814,0.524449
51357,723.2,722.4,5.4,4.6,3.6,3.2,4.4,6.0,5.2,6.8,...,10.4,12.0,152.92,156.36,17576.2,20622.2,10920.0,11127.4,0.607342,0.563196


In [7]:
# Champion id columns of all ten participants (categorical, hence excluded from the team averages).
categorial_columns = [f'participant{x}_champion_championNumber' for x in range(1, 11)]
df_categorical = df.loc[:, categorial_columns]
# One-hot encode the champion of participant 1.
# The former `columns=` argument was dropped: pd.get_dummies ignores it for Series input.
# NOTE(review): only participant1 is encoded although all ten champion columns are selected above -
# confirm whether the other nine participants should be encoded as well.
df_categorical_one_hot = pd.get_dummies(df_categorical.loc[:, 'participant1_champion_championNumber'],
                                        prefix='championNumber')
df_categorical_one_hot

Unnamed: 0,championNumber_1,championNumber_2,championNumber_3,championNumber_4,championNumber_5,championNumber_6,championNumber_7,championNumber_8,championNumber_9,championNumber_10,...,championNumber_555,championNumber_711,championNumber_777,championNumber_875,championNumber_876,championNumber_887,championNumber_895,championNumber_897,championNumber_902,championNumber_950
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51314,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
51342,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51357,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Put the averaged team features and the one-hot champion columns side by side,
# then attach the target (aligned on the index) as the last column.
df_merged_ohc = (
    pd.concat([df_merged, df_categorical_one_hot], axis="columns")
    .assign(label=df['label'])
)
df_merged_ohc

Unnamed: 0,level_team1,level_team2,champion_infoAttack_team1,champion_infoAttack_team2,champion_infoDefense_team1,champion_infoDefense_team2,champion_infoMagic_team1,champion_infoMagic_team2,champion_infoDifficulty_team1,champion_infoDifficulty_team2,...,championNumber_711,championNumber_777,championNumber_875,championNumber_876,championNumber_887,championNumber_895,championNumber_897,championNumber_902,championNumber_950,label
3,275.4,307.6,5.2,5.4,5.2,3.6,5.4,5.2,5.0,5.8,...,0,0,0,0,0,0,0,0,0,1
4,699.4,538.2,6.0,6.0,3.8,3.8,5.4,4.2,4.8,6.6,...,0,0,0,0,0,0,0,0,0,0
7,544.8,560.2,7.0,8.4,5.0,3.6,4.8,3.4,4.6,7.6,...,0,0,0,0,0,0,0,0,0,0
8,364.2,405.2,5.4,5.8,4.8,4.6,6.2,4.8,5.8,5.2,...,0,0,0,0,0,0,0,0,0,0
10,524.6,502.6,5.4,7.4,3.6,4.2,5.6,4.4,5.6,5.2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51312,626.2,269.8,4.4,6.0,5.0,5.4,6.2,3.8,6.4,5.8,...,0,0,0,0,0,0,0,0,0,1
51314,307.8,513.6,6.4,5.8,5.0,4.4,4.2,5.0,5.2,5.6,...,0,0,0,0,1,0,0,0,0,0
51342,680.8,239.8,4.6,6.6,6.0,3.8,5.6,5.0,4.8,6.0,...,0,0,0,0,0,0,0,0,0,0
51357,723.2,722.4,5.4,4.6,3.6,3.2,4.4,6.0,5.2,6.8,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# List every column name of the combined frame as a plain Python list.
df_merged_ohc.columns.tolist()

['level_team1',
 'level_team2',
 'champion_infoAttack_team1',
 'champion_infoAttack_team2',
 'champion_infoDefense_team1',
 'champion_infoDefense_team2',
 'champion_infoMagic_team1',
 'champion_infoMagic_team2',
 'champion_infoDifficulty_team1',
 'champion_infoDifficulty_team2',
 'champion_tier_team1',
 'champion_tier_team2',
 'champion_win_rate_team1',
 'champion_win_rate_team2',
 'champion_pick_rate_team1',
 'champion_pick_rate_team2',
 'champion_ban_rate_team1',
 'champion_ban_rate_team2',
 'champion_matches_team1',
 'champion_matches_team2',
 'tier_team1',
 'tier_team2',
 'leaguePoints_team1',
 'leaguePoints_team2',
 'veteran_team1',
 'veteran_team2',
 'inactive_team1',
 'inactive_team2',
 'freshBlood_team1',
 'freshBlood_team2',
 'hotStreak_team1',
 'hotStreak_team2',
 'lastPlayTime_team1',
 'lastPlayTime_team2',
 'championLevel_team1',
 'championLevel_team2',
 'championPoints_team1',
 'championPoints_team2',
 'championPointsSinceLastLevel_team1',
 'championPointsSinceLastLevel_te

In [21]:
# Standardise the averaged team features; all remaining columns (one-hot champion
# columns and the label) are passed through unchanged.
# NOTE(review): the scaler is fitted on the full frame before it is split into train and
# test, so test rows influence the scaling statistics - consider fitting on the training rows only.
categorial_columns = df_categorical_one_hot.columns.tolist()
numerical_columns = df_merged.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), numerical_columns)],
    remainder='passthrough')
# ColumnTransformer emits the transformer outputs first and appends the passthrough
# columns after them, so the labels are built in that order rather than by reusing the
# input column order (which only matches while the numerical columns come first).
scaled_column_set = set(numerical_columns)
passthrough_columns = [col for col in df_merged_ohc.columns if col not in scaled_column_set]
df_merged_ohc = pd.DataFrame(preprocessor.fit_transform(df_merged_ohc),
                             columns=numerical_columns + passthrough_columns)
df_merged_ohc

Unnamed: 0,level_team1,level_team2,champion_infoAttack_team1,champion_infoAttack_team2,champion_infoDefense_team1,champion_infoDefense_team2,champion_infoMagic_team1,champion_infoMagic_team2,champion_infoDifficulty_team1,champion_infoDifficulty_team2,...,championNumber_711,championNumber_777,championNumber_875,championNumber_876,championNumber_887,championNumber_895,championNumber_897,championNumber_902,championNumber_950,label
0,-0.789315,-0.583545,-0.582124,-0.349560,1.035229,-0.944089,0.462482,0.241753,-0.874332,-0.006871,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2.066708,0.970357,0.323217,0.319613,-0.683467,-0.697821,0.462482,-0.841484,-1.091962,0.865061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.025337,1.118604,1.454893,2.996305,0.789701,-0.944089,-0.190654,-1.708074,-1.309593,1.954975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.191167,0.074134,-0.355789,0.096556,0.544173,0.287247,1.333331,-0.191542,-0.003810,-0.660820,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.889272,0.730466,-0.355789,1.881017,-0.928995,-0.205287,0.680194,-0.624836,-0.221441,-0.660820,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25469,1.573640,-0.838261,-1.487465,0.319613,0.789701,1.272316,1.333331,-1.274779,0.649081,-0.006871,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25470,-0.571071,0.804589,0.775887,0.096556,0.789701,0.040980,-0.843790,0.025106,-0.656701,-0.224854,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25471,1.941420,-1.040416,-1.261130,0.988786,2.017341,-0.697821,0.680194,0.025106,-1.091962,0.211112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25472,2.227022,2.211592,-0.355789,-1.241790,-0.928995,-1.436623,-0.626078,1.108343,-0.656701,1.083043,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# 90/10 train/test split of the engineered frame: every column but the last is a
# feature, the last column is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_merged_ohc.iloc[:, :-1],
    df_merged_ohc.iloc[:, -1],
    shuffle=True,
    random_state=42,
    test_size=0.1,
)
# One shape per line: training features, test features, training target.
print(X_train.shape, X_test.shape, y_train.shape, sep="\n")

(22926, 222)
(2548, 222)
(22926,)


In [10]:
# Baseline: XGBoost classifier with default hyper-parameters, fitted on the training
# split and scored on the held-out test split.
bst = xgb.XGBClassifier().fit(X_train, y_train)
bst.score(X_test, y_test)

0.6911302982731554