In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ydata_profiling import ProfileReport

import src.dataHandling.cleaningUtils as clean

# Location of the pickled raw dataset, relative to the working directory.
DATASET_PATH = "data/raw/static_dataset.pkl"


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising,
    so only point this at files produced by this project.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


try:
    df = _load_pickle(DATASET_PATH)
except FileNotFoundError:
    # Not found relative to the current directory: move one directory up and
    # retry there. The working directory stays changed for the rest of the
    # session; a second miss propagates FileNotFoundError to the caller.
    os.chdir('../')
    df = _load_pickle(DATASET_PATH)



In [2]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant10_champion_infoDefense,participant10_champion_infoMagic,participant10_champion_infoDifficulty,participant10_champion_tier,participant10_champion_win_rate,participant10_champion_pick_rate,participant10_champion_ban_rate,participant10_champion_matches,participant10_teamId,participant10_win
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,MASTER,...,2,7,10,D,46.8,12.7,12.7,9632,200,False
0,1525,1697696037436,13.20.536.9576,11,420,20,13,KR,132,MASTER,...,6,6,7,S+,50.69,6.2,6.2,50599,200,True
0,1977,1697835934027,13.20.536.9576,11,420,20,13,KR,860,MASTER,...,4,8,5,S+,51.15,5.7,5.7,38009,200,False
0,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,MASTER,...,4,5,9,S,52.19,0.6,0.6,11217,200,False
0,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,DIAMOND,...,5,7,3,S+,52.19,2.8,2.8,20275,200,True


In [3]:
# Column names of the raw frame as a NumPy array.
df.columns.to_numpy()

array(['gameDuration', 'gameCreation', 'gameVersion', 'mapId', 'queueId',
       'patch', 'seasonId', 'platformId', 'participant1_level',
       'participant1_tier', 'participant1_rank',
       'participant1_leaguePoints', 'participant1_wins',
       'participant1_losses', 'participant1_veteran',
       'participant1_inactive', 'participant1_freshBlood',
       'participant1_hotStreak', 'participant1_champion_lastPlayTime',
       'participant1_champion_championLevel',
       'participant1_champion_championPoints',
       'participant1_champion_championPointsSinceLastLevel',
       'participant1_champion_tokensEarned', 'participant1_champion_kda',
       'participant1_champion_kills', 'participant1_champion_deaths',
       'participant1_champion_assists', 'participant1_champion_lp',
       'participant1_champion_maxKills', 'participant1_champion_cs',
       'participant1_champion_damage', 'participant1_champion_gold',
       'participant1_champion_championNumber',
       'participant1_

In [4]:
# Number of matches per platform (server region) in the raw data.
df.loc[:, 'platformId'].value_counts()

EUW1    16682
KR      15488
EUN1     3578
NA1      2413
Name: platformId, dtype: int64

In [5]:
# Row count before any cleaning step has run.
df.shape[0]

38161

In [6]:
# Apply the project's drop_missing helper (presumably removes rows with
# missing values -- confirm in cleaningUtils). The recorded run printed
# "dropped 17648 rows" for this step.
df = df.pipe(clean.drop_missing)
df.head()

dropped 17648 rows


Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant10_champion_infoDefense,participant10_champion_infoMagic,participant10_champion_infoDifficulty,participant10_champion_tier,participant10_champion_win_rate,participant10_champion_pick_rate,participant10_champion_ban_rate,participant10_champion_matches,participant10_teamId,participant10_win
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,MASTER,...,2,7,10,D,46.8,12.7,12.7,9632,200,False
0,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,MASTER,...,4,5,9,S,52.19,0.6,0.6,11217,200,False
0,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,DIAMOND,...,5,7,3,S+,52.19,2.8,2.8,20275,200,True
0,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,MASTER,...,5,3,6,D,48.23,16.1,16.1,92723,200,False
0,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,MASTER,...,0,0,0,S,50.18,5.7,5.7,14487,200,False


In [7]:
# Apply the project's drop_wrong_data helper. The recorded run logged checks
# for mapId, queueId, gameDuration, seasonId, gameVersion and patch, and
# reported "dropped 0 wrong rows".
df = df.pipe(clean.drop_wrong_data)
df.head()

found 20513 rows
dropped wrong mapId
dropped wrong queueId
dropped wrong gameDuration
dropped wrong seasonId
dropped wrong gameVersion
dropped wrong patch
dropped 0 wrong rows


Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant10_champion_infoDefense,participant10_champion_infoMagic,participant10_champion_infoDifficulty,participant10_champion_tier,participant10_champion_win_rate,participant10_champion_pick_rate,participant10_champion_ban_rate,participant10_champion_matches,participant10_teamId,participant10_win
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,MASTER,...,2,7,10,D,46.8,12.7,12.7,9632,200,False
0,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,MASTER,...,4,5,9,S,52.19,0.6,0.6,11217,200,False
0,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,DIAMOND,...,5,7,3,S+,52.19,2.8,2.8,20275,200,True
0,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,MASTER,...,5,3,6,D,48.23,16.1,16.1,92723,200,False
0,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,MASTER,...,0,0,0,S,50.18,5.7,5.7,14487,200,False


In [8]:
# Renumber the rows 0..n-1 after the row-dropping steps, discarding the old
# index. Rebinding (rather than inplace=True) avoids mutating the object that
# earlier cells produced and displayed.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant10_champion_infoDefense,participant10_champion_infoMagic,participant10_champion_infoDifficulty,participant10_champion_tier,participant10_champion_win_rate,participant10_champion_pick_rate,participant10_champion_ban_rate,participant10_champion_matches,participant10_teamId,participant10_win
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,MASTER,...,2,7,10,D,46.8,12.7,12.7,9632,200,False
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,MASTER,...,4,5,9,S,52.19,0.6,0.6,11217,200,False
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,DIAMOND,...,5,7,3,S+,52.19,2.8,2.8,20275,200,True
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,MASTER,...,5,3,6,D,48.23,16.1,16.1,92723,200,False
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,MASTER,...,0,0,0,S,50.18,5.7,5.7,14487,200,False


In [9]:
# Apply the project's fix_rank helper. In the recorded previews the
# participant1_tier values change from strings such as 'MASTER' / 'DIAMOND'
# to numbers such as 7.1 / 6.2 after this step.
df = df.pipe(clean.fix_rank)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant10_champion_infoDefense,participant10_champion_infoMagic,participant10_champion_infoDifficulty,participant10_champion_tier,participant10_champion_win_rate,participant10_champion_pick_rate,participant10_champion_ban_rate,participant10_champion_matches,participant10_teamId,participant10_win
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,7.1,...,2,7,10,D,46.8,12.7,12.7,9632,200,False
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,7.1,...,4,5,9,S,52.19,0.6,0.6,11217,200,False
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,6.2,...,5,7,3,S+,52.19,2.8,2.8,20275,200,True
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,7.1,...,5,3,6,D,48.23,16.1,16.1,92723,200,False
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,7.1,...,0,0,0,S,50.18,5.7,5.7,14487,200,False


In [10]:
# Apply the project's calc_winrate helper. Afterwards the preview ends with
# participant1_winrate ... participant10_winrate columns (presumably derived
# from each participant's wins and losses -- confirm in cleaningUtils).
df = df.pipe(clean.calc_winrate)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant1_winrate,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,7.1,...,0.520833,0.530488,0.532895,0.510417,0.517986,0.525597,0.491667,0.54902,0.497268,0.470588
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,7.1,...,0.513308,0.556911,0.567442,0.522831,0.56422,0.538217,0.536458,0.541063,0.542683,0.548736
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,6.2,...,0.488525,0.483871,0.496241,0.571429,0.5,0.496479,0.44,0.628571,0.493506,0.564935
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,7.1,...,0.520581,0.554545,0.564024,0.551515,0.583333,0.509004,0.522388,0.523967,0.598214,0.547368
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,7.1,...,0.518868,0.527304,0.833333,0.53681,0.523529,0.513869,0.512535,0.565891,0.544304,0.510453


In [11]:
# Apply the project's fix_teamId helper.
# NOTE(review): its effect is not visible in the truncated preview; a later
# cell compares participant1_teamId against 0, so this presumably re-encodes
# the team ids -- confirm in cleaningUtils.
df = df.pipe(clean.fix_teamId)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant1_winrate,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,7.1,...,0.520833,0.530488,0.532895,0.510417,0.517986,0.525597,0.491667,0.54902,0.497268,0.470588
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,7.1,...,0.513308,0.556911,0.567442,0.522831,0.56422,0.538217,0.536458,0.541063,0.542683,0.548736
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,6.2,...,0.488525,0.483871,0.496241,0.571429,0.5,0.496479,0.44,0.628571,0.493506,0.564935
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,7.1,...,0.520581,0.554545,0.564024,0.551515,0.583333,0.509004,0.522388,0.523967,0.598214,0.547368
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,7.1,...,0.518868,0.527304,0.833333,0.53681,0.523529,0.513869,0.512535,0.565891,0.544304,0.510453


In [12]:
# Apply the project's convert_booleans helper.
# NOTE(review): presumably casts True/False columns to numeric values; the
# affected columns are hidden in the truncated preview -- confirm.
df = df.pipe(clean.convert_booleans)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant1_winrate,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,7.1,...,0.520833,0.530488,0.532895,0.510417,0.517986,0.525597,0.491667,0.54902,0.497268,0.470588
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,7.1,...,0.513308,0.556911,0.567442,0.522831,0.56422,0.538217,0.536458,0.541063,0.542683,0.548736
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,6.2,...,0.488525,0.483871,0.496241,0.571429,0.5,0.496479,0.44,0.628571,0.493506,0.564935
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,7.1,...,0.520581,0.554545,0.564024,0.551515,0.583333,0.509004,0.522388,0.523967,0.598214,0.547368
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,7.1,...,0.518868,0.527304,0.833333,0.53681,0.523529,0.513869,0.512535,0.565891,0.544304,0.510453


In [13]:
# Apply the project's convert_lastPlayTime helper.
# NOTE(review): presumably rewrites the participantN_champion_lastPlayTime
# columns into a relative time value; units are not visible here -- confirm.
df = df.pipe(clean.convert_lastPlayTime)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant1_winrate,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,7.1,...,0.520833,0.530488,0.532895,0.510417,0.517986,0.525597,0.491667,0.54902,0.497268,0.470588
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,7.1,...,0.513308,0.556911,0.567442,0.522831,0.56422,0.538217,0.536458,0.541063,0.542683,0.548736
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,6.2,...,0.488525,0.483871,0.496241,0.571429,0.5,0.496479,0.44,0.628571,0.493506,0.564935
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,7.1,...,0.520581,0.554545,0.564024,0.551515,0.583333,0.509004,0.522388,0.523967,0.598214,0.547368
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,7.1,...,0.518868,0.527304,0.833333,0.53681,0.523529,0.513869,0.512535,0.565891,0.544304,0.510453


In [14]:
# Apply the project's convert_championTier helper. The raw preview shows
# participant10_champion_tier as letter grades such as 'D', 'S', 'S+';
# presumably these become numeric here -- confirm in cleaningUtils.
df = df.pipe(clean.convert_championTier)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant1_winrate,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,7.1,...,0.520833,0.530488,0.532895,0.510417,0.517986,0.525597,0.491667,0.54902,0.497268,0.470588
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,7.1,...,0.513308,0.556911,0.567442,0.522831,0.56422,0.538217,0.536458,0.541063,0.542683,0.548736
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,6.2,...,0.488525,0.483871,0.496241,0.571429,0.5,0.496479,0.44,0.628571,0.493506,0.564935
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,7.1,...,0.520581,0.554545,0.564024,0.551515,0.583333,0.509004,0.522388,0.523967,0.598214,0.547368
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,7.1,...,0.518868,0.527304,0.833333,0.53681,0.523529,0.513869,0.512535,0.565891,0.544304,0.510453


In [15]:
# Apply the project's get_winning_team helper. Afterwards the preview ends
# with a new 'label' column holding 0/1 values, which is used as the
# prediction target further down.
df = df.pipe(clean.get_winning_team)
df.head()

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate,label
0,1982,1696715184351,13.19.534.5972,11,420,19,13,EUW1,864,7.1,...,0.530488,0.532895,0.510417,0.517986,0.525597,0.491667,0.54902,0.497268,0.470588,0
1,1446,1696175211836,13.19.533.1909,11,420,19,13,EUW1,196,7.1,...,0.556911,0.567442,0.522831,0.56422,0.538217,0.536458,0.541063,0.542683,0.548736,0
2,1859,1697133998343,13.20.536.2230,11,420,20,13,EUW1,465,6.2,...,0.483871,0.496241,0.571429,0.5,0.496479,0.44,0.628571,0.493506,0.564935,1
3,1418,1697914083101,13.20.536.9576,11,420,20,13,EUW1,725,7.1,...,0.554545,0.564024,0.551515,0.583333,0.509004,0.522388,0.523967,0.598214,0.547368,0
4,1800,1697564797120,13.20.536.2230,11,420,20,13,KR,110,7.1,...,0.527304,0.833333,0.53681,0.523529,0.513869,0.512535,0.565891,0.544304,0.510453,0


In [16]:
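# NOTE(review): this clean-up step is disabled; the next cell only inspects
# rows whose participant1_teamId is non-zero and drops nothing. Re-enable or
# delete this line once it is decided how those rows should be handled.
#df = clean.drop_wrong_teamIds(df)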

In [17]:
# Show the matches in which participant 1 is not on team 0.
df.loc[df['participant1_teamId'].ne(0)]

Unnamed: 0,gameDuration,gameCreation,gameVersion,mapId,queueId,patch,seasonId,platformId,participant1_level,participant1_tier,...,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate,label
137,1625,1697745028414,13.20.536.9576,11,420,20,13,EUN1,60,7.1,...,0.584337,0.531250,0.571429,0.523438,0.800000,0.454545,0.577465,0.544355,0.660377,0
196,1697,1696424943547,13.19.534.5972,11,420,19,13,EUW1,632,8.1,...,0.587342,0.567335,0.519337,0.700000,0.536723,0.592697,0.537600,0.558036,0.537500,0
270,1816,1696883771013,13.19.534.5972,11,420,19,13,EUW1,454,6.1,...,0.539548,0.619048,0.573427,0.520161,0.560831,0.642857,0.529293,0.652695,0.614213,0
300,1172,1695764320830,13.18.530.4653,11,420,18,13,EUN1,664,9.1,...,0.666667,0.539130,0.515885,0.539773,0.557312,0.536993,0.571429,0.507062,0.576419,0
783,1571,1696234051098,13.19.533.1909,11,420,19,13,EUW1,148,8.1,...,0.542945,0.512630,0.539749,0.487685,0.518367,0.533898,0.559748,0.500000,0.660000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20218,1350,1697816581639,13.20.536.9576,11,420,20,13,EUN1,121,8.1,...,0.607143,0.567100,0.621849,0.504545,0.638298,0.511682,0.551148,0.527066,0.722892,0
20277,2429,1697150494924,13.20.536.2230,11,420,20,13,NA1,340,7.1,...,0.548077,0.594595,0.538462,0.690909,0.500000,0.629630,0.557692,0.629630,0.560000,0
20329,1478,1697749093856,13.20.536.9576,11,420,20,13,NA1,653,9.1,...,0.594595,0.666667,0.625000,0.557292,0.532374,0.532468,0.517621,0.601399,0.571429,0
20357,1657,1697828058179,13.20.536.9576,11,420,20,13,EUW1,307,8.1,...,0.506241,0.523227,0.687943,0.909091,0.589595,0.896552,0.538462,0.586345,0.542453,0


In [18]:
# Invariant under test: participants 1 and 2 always share the same outcome.
# NOTE(review): the recorded run of this cell raised AssertionError, so the
# invariant does not hold for the current data -- investigate before relying
# on any per-position team assumption downstream.
win_mismatch = df['participant1_win'] != df['participant2_win']
assert not win_mismatch.any(), (
    f"{int(win_mismatch.sum())} of {len(df)} rows have different "
    "participant1_win / participant2_win values"
)

AssertionError: 

In [19]:
# Apply the project's drop_irrelevant helper. The preview afterwards only
# contains per-participant level, tier, champion_* statistics and winrate
# columns, followed by 'label'.
df = df.pipe(clean.drop_irrelevant)
df.head()

Unnamed: 0,participant1_level,participant1_tier,participant1_champion_lastPlayTime,participant1_champion_championPoints,participant1_champion_kda,participant1_champion_lp,participant1_champion_win_rate,participant2_level,participant2_tier,participant2_champion_lastPlayTime,...,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate,label
0,864,7.1,2654830,9801,1.02,100,49.4,533,7.1,2700780,...,0.530488,0.532895,0.510417,0.517986,0.525597,0.491667,0.54902,0.497268,0.470588,0
1,196,7.1,2734057,1051316,1.29,1718,49.17,386,8.1,3282806,...,0.556911,0.567442,0.522831,0.56422,0.538217,0.536458,0.541063,0.542683,0.548736,0
2,465,6.2,2514884,57996,1.24,57,52.44,107,6.2,2404603,...,0.483871,0.496241,0.571429,0.5,0.496479,0.44,0.628571,0.493506,0.564935,1
3,725,7.1,2694772,2892093,2.39,1190,50.15,117,7.1,2576903,...,0.554545,0.564024,0.551515,0.583333,0.509004,0.522388,0.523967,0.598214,0.547368,0
4,110,7.1,2322436,61840,2.87,2,48.2,69,7.1,2580681,...,0.527304,0.833333,0.53681,0.523529,0.513869,0.512535,0.565891,0.544304,0.510453,0


In [20]:
# The train/test split below takes the last column as the target by position,
# so 'label' must be the final column.
assert df.columns[-1] == 'label', (
    f"expected 'label' as the last column, found {df.columns[-1]!r}"
)

In [21]:
# Build a ydata-profiling report object for the cleaned frame. Its only
# visible consumer (profile.to_widgets()) is currently commented out.
report_title = "Profiling Report"
profile = ProfileReport(df, title=report_title)

In [22]:
# Rendering of the profiling report is disabled; uncomment to display it.
#profile.to_widgets()

In [23]:
# Class balance of the target; the recorded run shows 10730 ones vs 9783 zeros.
df.loc[:, 'label'].value_counts()

1    10730
0     9783
Name: label, dtype: int64

In [24]:
# Column names remaining after cleaning, as a NumPy array.
df.columns.to_numpy()


array(['participant1_level', 'participant1_tier',
       'participant1_champion_lastPlayTime',
       'participant1_champion_championPoints',
       'participant1_champion_kda', 'participant1_champion_lp',
       'participant1_champion_win_rate', 'participant2_level',
       'participant2_tier', 'participant2_champion_lastPlayTime',
       'participant2_champion_championPoints',
       'participant2_champion_kda', 'participant2_champion_lp',
       'participant2_champion_win_rate', 'participant3_level',
       'participant3_tier', 'participant3_champion_lastPlayTime',
       'participant3_champion_championPoints',
       'participant3_champion_kda', 'participant3_champion_lp',
       'participant3_champion_win_rate', 'participant4_level',
       'participant4_tier', 'participant4_champion_lastPlayTime',
       'participant4_champion_championPoints',
       'participant4_champion_kda', 'participant4_champion_lp',
       'participant4_champion_win_rate', 'participant5_level',
       'par

In [None]:

# Every column except the last is a feature; the last column is the target.
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
y_train, y_test = y_train.astype(int), y_test.astype(int)
X_train

In [None]:
# Draw one density histogram with a KDE overlay per column of ``df``.
# Each figure is created explicitly, shown, and closed inside the loop so only
# one figure is open at a time; opening one pyplot figure per column and
# leaving them all open trips matplotlib's open-figure warning and keeps every
# figure in memory until the cell finishes.
for col in df.columns:
    fig, ax = plt.subplots()
    sns.histplot(df[col], stat='density', kde=True, ax=ax)
    ax.set_title(col)
    plt.show()
    plt.close(fig)

In [None]:
# Standardise the features using statistics learned from the training split
# only, then apply the same transformation to the test split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Append the target as the last column of each scaled matrix.
# NOTE: X_train / X_test are overwritten here, so re-running this cell on its
# own would scale the label-augmented arrays again; re-run from the split cell.
X_train = np.hstack((X_train, y_train.to_numpy().reshape(-1, 1)))
X_test = np.hstack((X_test, y_test.to_numpy().reshape(-1, 1)))

In [None]:
# Display the scaled training matrix; after the previous cell its last column
# is the label.
X_train