# Import data

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
file = '../raw_data/full_dump.json'
with open(file) as data_file:    
    data = json.load(data_file) 
len(data)

35320

In [3]:
data_keys = list(data.keys())
data_keys[:5]

['EU Challenger Series/2017 Season/Spring Qualifiers/Scoreboards_1_1',
 'EU Challenger Series/2017 Season/Spring Qualifiers/Scoreboards_2_1',
 'EU Challenger Series/2017 Season/Spring Qualifiers/Scoreboards_3_1',
 'EU Challenger Series/2017 Season/Spring Qualifiers/Scoreboards_4_1',
 'EU Challenger Series/2017 Season/Spring Qualifiers/Scoreboards_5_1']

In [4]:
import re

pattern = r'(\d{4})'
re.search(pattern, data_keys[2]).group(1)

year = []
no_year = []
for i in range(len(data_keys)):
    try:
        year.append(re.search(pattern, data_keys[i]).group(1))
    except:
        no_year.append(data_keys[i])

np.unique(year), len(no_year)

(array(['2016', '2017', '2018', '2019', '2020', '2021'], dtype='<U4'), 1703)

# Normalize the JSON file

Normalize means to expand the nested keys in the JSON file and the subsequent DataFrame.

In [5]:
#Normalize the complete dataframe and make a copy
df_normalized_teams = pd.json_normalize(data.values())
df_teams = df_normalized_teams.copy()
df_teams.sample(2)

Unnamed: 0,id,start,patch,winner,duration,picks_bans,teams.BLUE.name,teams.BLUE.total_turret_kills,teams.BLUE.total_inhibitor_kills,teams.BLUE.total_rift_herald_kills,...,teams.BLUE.side,teams.BLUE.players,teams.RED.name,teams.RED.total_turret_kills,teams.RED.total_inhibitor_kills,teams.RED.total_rift_herald_kills,teams.RED.total_dragon_kills,teams.RED.total_baron_kills,teams.RED.side,teams.RED.players
29324,Baltic Masters/2021 Season/Spring Playoffs/Sco...,,11.4,BLUE,1681,"[{'champion_name': 'Seraphine', 'is_ban': True...",Method2Madness,11,2,2,...,BLUE,"[{'name': 'arnax', 'id': 310279, 'role': 'TOP'...",PIGSPORTS,3,0,0,0,0,RED,"[{'name': 'Godux', 'id': 364114, 'role': 'TOP'..."
22066,Liga Tica de Leyendas/2020 Season/Scoreboards/...,,10.11,BLUE,1681,[],OG Esports,11,2,2,...,BLUE,"[{'name': 'Bazu', 'id': 219793, 'role': 'TOP',...",Zwan Gaming,0,0,0,1,0,RED,"[{'name': 'Zeth', 'id': 306046, 'role': 'TOP',..."


In [6]:
#Make a dataframe dedicated to the teams 
df_teams.drop(['teams.BLUE.players', 'teams.RED.players', 'picks_bans'], axis=1, inplace=True)
df_teams.head(2)

Unnamed: 0,id,start,patch,winner,duration,teams.BLUE.name,teams.BLUE.total_turret_kills,teams.BLUE.total_inhibitor_kills,teams.BLUE.total_rift_herald_kills,teams.BLUE.total_dragon_kills,teams.BLUE.total_baron_kills,teams.BLUE.side,teams.RED.name,teams.RED.total_turret_kills,teams.RED.total_inhibitor_kills,teams.RED.total_rift_herald_kills,teams.RED.total_dragon_kills,teams.RED.total_baron_kills,teams.RED.side
0,EU Challenger Series/2017 Season/Spring Qualif...,,6.24,BLUE,2023,Fnatic Academy,11,3,0,2,1,BLUE,EURONICS Gaming,4,0,0,1,0,RED
1,EU Challenger Series/2017 Season/Spring Qualif...,,6.24,RED,1782,AlienTech eSports,3,0,0,0,0,BLUE,Team Larssen,7,1,1,2,1,RED


In [7]:
#Make a dataframe dedicated to the team BLUE
df_normalized_teams['teams.BLUE.players'].explode()
df_normalized_BLUE = pd.json_normalize(df_normalized_teams['teams.BLUE.players'].explode())
df_BLUE = df_normalized_BLUE.copy()
df_BLUE.head(5)

Unnamed: 0,name,id,role,champion_name,champion_id,gold_15,kills_assists_15,deaths_15,total_gold,total_cs,total_kills,total_monster_kills,total_assists,total_deaths,total_damage_taken,total_damage_dealt,win,side
0,Kikis,172122.0,TOP,Nautilus,111,5326.0,8.0,1.0,12504,233,2,0,6,1,18220,156270,True,BLUE
1,Broxah,193072.0,JGL,Lee Sin,64,5261.0,11.0,1.0,12352,153,4,132,7,1,22212,152183,True,BLUE
2,Nisqy,185791.0,MID,Syndra,134,6009.0,13.0,1.0,13393,229,5,6,8,1,10647,177681,True,BLUE
3,MrRallez,183407.0,BOT,Jhin,202,5304.0,11.0,0.0,13969,315,2,5,9,0,9758,228328,True,BLUE
4,Klaj,171882.0,SUP,Karma,43,2767.0,12.0,0.0,9740,38,1,0,11,0,11917,34299,True,BLUE


In [8]:
#Check that it matches 5 players per game
len(df_BLUE)/5

35320.0

In [9]:
#Make a dataframe dedicated to the team RED
df_normalized_teams['teams.RED.players'].explode()
df_normalized_RED = pd.json_normalize(df_normalized_teams['teams.RED.players'].explode())
df_RED = df_normalized_RED.copy()
df_RED.head(5)

Unnamed: 0,name,id,role,champion_name,champion_id,gold_15,kills_assists_15,deaths_15,total_gold,total_cs,total_kills,total_monster_kills,total_assists,total_deaths,total_damage_taken,total_damage_dealt,win,side
0,Phones,193289.0,TOP,Maokai,57,4528.0,2.0,7.0,9611,190,1,13,1,7,41065,111536,False,RED
1,Obvious,187241.0,JGL,Rengar,107,4728.0,2.0,1.0,9640,174,0,126,2,1,27879,147035,False,RED
2,MagiFelix,181359.0,MID,Ryze,13,4893.0,2.0,3.0,11840,301,0,2,2,3,16013,205899,False,RED
3,Sedrion,197437.0,BOT,Varus,110,5133.0,2.0,1.0,12010,283,2,1,0,1,10370,184927,False,RED
4,Noxiak,185879.0,SUP,Nami,267,2521.0,2.0,2.0,7348,16,0,0,2,2,13815,15418,False,RED


In [10]:
#Make a dataframe dedicated to the Bans
df_normalized_teams['picks_bans'].explode()
df_normalized_BANS = pd.json_normalize(df_normalized_teams['picks_bans'].explode())
df_BANS = df_normalized_BANS.copy()
df_BANS.isnull().sum()/len(df_BANS)

champion_name    0.019285
is_ban           0.019285
dtype: float64

##  Include the game id to match later different DataFrames

We have two different dataframes for team Blue and team Red, later, we can merge them on the game id. This will also be useful when we want to come back for the y parameter! 

In [11]:
get_index = df_normalized_teams['id'].tolist()
index_preproc = np.asarray([[index] * 5 for index in get_index])
index_teams = index_preproc.reshape(len(df_normalized_teams) * 5).tolist()
df_RED['game_id'] = index_teams
df_BLUE['game_id'] = index_teams
len(index_teams)/5

35320.0

In [12]:
df_BLUE[df_BLUE['game_id'] == '2016 International Wildcard Invitational/Scoreboards/Bracket Stage_1_1']

Unnamed: 0,name,id,role,champion_name,champion_id,gold_15,kills_assists_15,deaths_15,total_gold,total_cs,total_kills,total_monster_kills,total_assists,total_deaths,total_damage_taken,total_damage_dealt,win,side,game_id
165535,Smurf,197966.0,TOP,Trundle,48,5175.0,8.0,3.0,15494,294,0,15,8,3,36337,241203,True,BLUE,2016 International Wildcard Invitational/Score...
165536,PvPStejos,194522.0,JGL,Graves,104,4865.0,11.0,2.0,16049,177,6,148,5,2,25664,231200,True,BLUE,2016 International Wildcard Invitational/Score...
165537,Kira,172113.0,MID,Lissandra,127,5533.0,14.0,2.0,16549,325,3,25,11,2,25513,252607,True,BLUE,2016 International Wildcard Invitational/Score...
165538,Onesh0tiq,188541.0,BOT,Lucian,236,5722.0,13.0,1.0,18339,356,6,15,7,1,15931,251707,True,BLUE,2016 International Wildcard Invitational/Score...
165539,Likkrit,179739.0,SUP,Tahm Kench,223,3386.0,10.0,2.0,12309,80,1,6,9,2,23811,56257,True,BLUE,2016 International Wildcard Invitational/Score...


In [13]:
df_RED[df_RED['game_id'] == '2016 International Wildcard Invitational/Scoreboards/Bracket Stage_1_1']

Unnamed: 0,name,id,role,champion_name,champion_id,gold_15,kills_assists_15,deaths_15,total_gold,total_cs,total_kills,total_monster_kills,total_assists,total_deaths,total_damage_taken,total_damage_dealt,win,side,game_id
165535,Yang,205635.0,TOP,Maokai,57,4891.0,9.0,2.0,12994,260,1,12,8,2,26978,182014,False,RED,2016 International Wildcard Invitational/Score...
165536,Revolta,195157.0,JGL,Kindred,203,4730.0,6.0,3.0,12638,178,1,122,5,3,27564,195085,False,RED,2016 International Wildcard Invitational/Score...
165537,tockers,201599.0,MID,Ekko,245,5159.0,9.0,2.0,13866,285,5,10,4,2,31698,215422,False,RED,2016 International Wildcard Invitational/Score...
165538,micaO,182405.0,BOT,Jinx,222,5994.0,9.0,5.0,15322,356,3,14,6,5,19697,249792,False,RED,2016 International Wildcard Invitational/Score...
165539,Jockster,169596.0,SUP,Thresh,412,3155.0,6.0,4.0,8906,36,0,0,6,4,18449,20745,False,RED,2016 International Wildcard Invitational/Score...


# [SKIP for now] Filter years that are going to be analyzed

Currently not working... there is nothing in variable = 'start'

In [14]:
# Get the years and the number of games played
df_normalized_teams['start'] = pd.to_datetime(df_normalized_teams['start'])
df_normalized_teams['year'] = pd.DatetimeIndex(df_normalized_teams['start']).year
games_year = df_normalized_teams.year.unique()

for year in games_year:
    N_games = len(df_normalized_teams[df_normalized_teams['year'] == year])
    print(f'Number of games in {year}: {N_games}')

Number of games in nan: 0


In [15]:
# Function that returns a Dataframe just with the years of interest
def choose_years_to_work(df, *args):
    years_dict = {}
    for year in args:
        years_dict[year] = df[df['year'] == year]
    return pd.concat(years_dict).droplevel(0)

df_data_by_year = choose_years_to_work(df_normalized_teams, 2021)
df_data_by_year.drop('start', axis=1, inplace=True)

# [SKIP] Retrieve the champions stats given a champion id and merge with the BLUE/RED team

From the page https://ddragon.leagueoflegends.com/cdn/12.3.1/data/en_US/champion.json, get the stats of a champion given a key or id value.

In [16]:
'''champion_description = '../raw_data/lol_12_3_1.json'

with open(champion_description) as data_file:    
    data_champions = json.load(data_file)'''

"champion_description = '../raw_data/lol_12_3_1.json'\n\nwith open(champion_description) as data_file:    \n    data_champions = json.load(data_file)"

In [17]:
# get the key and the stats for every champion in this version
# create a new dictionary with only the values of interest

'''champions_dict = {}
champions_names = list(data_champions['data'].keys())
for champion in champions_names:
    champions_dict[int(data_champions['data'][champion]['key'])] = data_champions['data'][champion]['stats']
champions_df = pd.DataFrame(champions_dict)
champions_df = champions_df.T
champions_df.reset_index(level=0, inplace=True)
champions_df['champion'] = champions_df['index']
champions_df.drop('index', axis=1, inplace=True)
champions_df'''

"champions_dict = {}\nchampions_names = list(data_champions['data'].keys())\nfor champion in champions_names:\n    champions_dict[int(data_champions['data'][champion]['key'])] = data_champions['data'][champion]['stats']\nchampions_df = pd.DataFrame(champions_dict)\nchampions_df = champions_df.T\nchampions_df.reset_index(level=0, inplace=True)\nchampions_df['champion'] = champions_df['index']\nchampions_df.drop('index', axis=1, inplace=True)\nchampions_df"

In [18]:
#Merge with BLUE team and drop the id of the champion, no needed anymore
'''df_BLUE = pd.merge(df_normalized_BLUE, champions_df, left_on='champion_id', right_on='champion', how='left')
df_BLUE.drop(['champion', 'champion_id'], axis=1, inplace=True)'''

"df_BLUE = pd.merge(df_normalized_BLUE, champions_df, left_on='champion_id', right_on='champion', how='left')\ndf_BLUE.drop(['champion', 'champion_id'], axis=1, inplace=True)"

In [19]:
#Merge with RED team and drop the id of the champion, no needed anymore
'''df_RED = pd.merge(df_normalized_RED, champions_df, left_on='champion_id', right_on='champion', how='left')
df_RED.drop(['champion', 'champion_id'], axis=1, inplace=True)'''

"df_RED = pd.merge(df_normalized_RED, champions_df, left_on='champion_id', right_on='champion', how='left')\ndf_RED.drop(['champion', 'champion_id'], axis=1, inplace=True)"

In [20]:
# do some feature engineering

# Merge and Flatten Blue and Red data

## Flatten the numerical data

Add all the numerical data per game id, i.e., how many is the added value of one game (how much gold, kills, damage, etc., for the 5 players)

In [21]:
df_BLUE_flatten = df_BLUE.drop(['id', 'champion_id'], axis=1).groupby('game_id').sum()
df_RED_flatten = df_RED.drop(['id', 'champion_id'], axis=1).groupby('game_id').sum()

## Get the role and the champion name into one variable and flatten it out

How to do it for multiple variables: 
```Python
df_BLUE['name_with_role'] = df_BLUE['role'] + df_BLUE['champion_name']
 
df_BLUE[['side', 'champion_name', 'role', 'game_id', 'name_with_role']].groupby('game_id').aggregate({
    'role': lambda x: ' '.join(x),
    'side': lambda y: ' '.join(y),
    'champion_name': lambda y: ' '.join(x or '' for x in y),
    'name_with_role': lambda y: ' '.join(str(x) or '' for x in y),
})
```

In [22]:
# We are just going to join the role plus the champion per game
## For the BLUE team
df_BLUE['name_with_role'] = df_BLUE['role'] + df_BLUE['champion_name']
df_BLUE_ROLE_CHAMPION = df_BLUE[['game_id', 'name_with_role']].groupby('game_id').aggregate({
    'name_with_role': lambda y: ' '.join(str(x) or '' for x in y),
})

df_BLUE_ROLE_CHAMPION.reset_index(level=0, inplace=True)
df_BLUE_WITH_ROLE_CHAMPION = pd.merge(df_BLUE_flatten, df_BLUE_ROLE_CHAMPION, on='game_id', how='inner')
df_BLUE_WITH_ROLE_CHAMPION.sample(2)

Unnamed: 0,game_id,gold_15,kills_assists_15,deaths_15,total_gold,total_cs,total_kills,total_monster_kills,total_assists,total_deaths,total_damage_taken,total_damage_dealt,win,name_with_role
20068,LMS/2018 Season/Summer Playoffs/Scoreboards_1_3,22595.0,18.0,17.0,43519,939,6,135,12,17,74880,503754,0,TOPRumble JGLJarvan IV MIDAatrox BOTMiss Fortu...
1711,Benelux Premier League/Season 3 Playoffs/Score...,24070.0,60.0,9.0,56059,964,18,183,42,9,70951,637820,5,TOPJarvan IV JGLSylas MIDCassiopeia BOTKai'Sa ...


In [23]:
## For the RED team
df_RED['name_with_role'] = df_RED['role'] + df_RED['champion_name']
df_RED_ROLE_CHAMPION = df_RED[['game_id', 'name_with_role']].groupby('game_id').aggregate({
    'name_with_role': lambda y: ' '.join(str(x) or '' for x in y),
})

df_RED_ROLE_CHAMPION.reset_index(level=0, inplace=True)
df_RED_WITH_ROLE_CHAMPION = pd.merge(df_RED_flatten, df_RED_ROLE_CHAMPION, on='game_id', how='inner')
df_RED_WITH_ROLE_CHAMPION.sample(2)

Unnamed: 0,game_id,gold_15,kills_assists_15,deaths_15,total_gold,total_cs,total_kills,total_monster_kills,total_assists,total_deaths,total_damage_taken,total_damage_dealt,win,name_with_role
31162,TCL/2020 Season/Summer Season/Scoreboards/Week...,25141.0,31.0,7.0,55654,1037,12,230,19,7,61284,729733,5,TOPCamille JGLSejuani MIDOrianna BOTEzreal SUP...
3404,CIS Challenger League/2018 Season/Summer Seaso...,23283.0,37.0,9.0,73805,1390,14,281,23,9,86476,1124337,5,TOPAatrox JGLTaliyah MIDAzir BOTTristana SUPMo...


## Merge BLUE and RED team data into one DataFrame

In [24]:
df_BLUE_RED = pd.merge(df_BLUE_WITH_ROLE_CHAMPION, df_RED_WITH_ROLE_CHAMPION, on='game_id', how='inner')
df_BLUE_RED[['name_with_role_x', 'name_with_role_y']].head(2)

Unnamed: 0,name_with_role_x,name_with_role_y
0,TOPTrundle JGLGraves MIDLissandra BOTLucian SU...,TOPMaokai JGLKindred MIDEkko BOTJinx SUPThresh
1,TOPGragas JGLKindred MIDLeBlanc BOTKalista SUP...,TOPEkko JGLElise MIDAnivia BOTLucian SUPTrundle


In [25]:
df_BLUE_RED.keys()

Index(['game_id', 'gold_15_x', 'kills_assists_15_x', 'deaths_15_x',
       'total_gold_x', 'total_cs_x', 'total_kills_x', 'total_monster_kills_x',
       'total_assists_x', 'total_deaths_x', 'total_damage_taken_x',
       'total_damage_dealt_x', 'win_x', 'name_with_role_x', 'gold_15_y',
       'kills_assists_15_y', 'deaths_15_y', 'total_gold_y', 'total_cs_y',
       'total_kills_y', 'total_monster_kills_y', 'total_assists_y',
       'total_deaths_y', 'total_damage_taken_y', 'total_damage_dealt_y',
       'win_y', 'name_with_role_y'],
      dtype='object')

In [26]:
df_BLUE_RED.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35320 entries, 0 to 35319
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   game_id                35320 non-null  object 
 1   gold_15_x              35320 non-null  float64
 2   kills_assists_15_x     35320 non-null  float64
 3   deaths_15_x            35320 non-null  float64
 4   total_gold_x           35320 non-null  int64  
 5   total_cs_x             35320 non-null  int64  
 6   total_kills_x          35320 non-null  int64  
 7   total_monster_kills_x  35320 non-null  int64  
 8   total_assists_x        35320 non-null  int64  
 9   total_deaths_x         35320 non-null  int64  
 10  total_damage_taken_x   35320 non-null  int64  
 11  total_damage_dealt_x   35320 non-null  int64  
 12  win_x                  35320 non-null  int64  
 13  name_with_role_x       35320 non-null  object 
 14  gold_15_y              35320 non-null  float64
 15  ki

In [27]:
df_BLUE_RED.drop(['game_id', 'win_y'], axis=1, inplace=True)

In [28]:
df_BLUE_RED.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35320 entries, 0 to 35319
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gold_15_x              35320 non-null  float64
 1   kills_assists_15_x     35320 non-null  float64
 2   deaths_15_x            35320 non-null  float64
 3   total_gold_x           35320 non-null  int64  
 4   total_cs_x             35320 non-null  int64  
 5   total_kills_x          35320 non-null  int64  
 6   total_monster_kills_x  35320 non-null  int64  
 7   total_assists_x        35320 non-null  int64  
 8   total_deaths_x         35320 non-null  int64  
 9   total_damage_taken_x   35320 non-null  int64  
 10  total_damage_dealt_x   35320 non-null  int64  
 11  win_x                  35320 non-null  int64  
 12  name_with_role_x       35320 non-null  object 
 13  gold_15_y              35320 non-null  float64
 14  kills_assists_15_y     35320 non-null  float64
 15  de

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse = False, handle_unknown='ignore')
ohe.fit(df_BLUE_RED[['role_x']])
blue_role_encoded = ohe.transform(df_BLUE_RED[['role_x']])
red_role_encoded = ohe.transform(df_BLUE_RED[['role_y']])
ohe.categories_

df_BLUE_RED['Blue_BOT'], df_BLUE_RED['Blue_JGL'], df_BLUE_RED['Blue_MID'], df_BLUE_RED['Blue_SUP'], df_BLUE_RED['Blue_TOP'] = blue_role_encoded.T
df_BLUE_RED.drop('role_x', axis=1, inplace=True)
df_BLUE_RED.sample(5)

df_BLUE_RED['Red_BOT'], df_BLUE_RED['Red_JGL'], df_BLUE_RED['Red_MID'], df_BLUE_RED['Red_SUP'], df_BLUE_RED['Red_TOP'] = red_role_encoded.T
df_BLUE_RED.drop('role_y', axis=1, inplace=True)
df_BLUE_RED.sample(5)

ohe_champ = OneHotEncoder(sparse = False, handle_unknown='ignore')
ohe_champ.fit(df_BLUE_RED[['champion_name_x']])
blue_champ_role_encoded = ohe_champ.transform(df_BLUE_RED[['champion_name_x']])
red_champ_role_encoded = ohe_champ.transform(df_BLUE_RED[['champion_name_y']])
ohe_champ.categories_

df_BLUE_RED['Blue_BOT'], df_BLUE_RED['Blue_JGL'], df_BLUE_RED['Blue_MID'], df_BLUE_RED['Blue_SUP'], df_BLUE_RED['Blue_TOP'] = blue_role_encoded.T
df_BLUE_RED.drop('role_x', axis=1, inplace=True)
df_BLUE_RED.sample(5)

# [SKIP] Make a word to vect for the champions and roles

## Create a dictionary with all the role_champions 'words'

In [29]:
from itertools import chain

words_BLUE = list(chain.from_iterable([wv.split(' ') for wv in df_BLUE_RED.name_with_role_x]))
words_RED = list(chain.from_iterable([wv.split(' ') for wv in df_BLUE_RED.name_with_role_x]))
words = words_BLUE + words_RED
np.shape(words)

(376536,)

In [30]:
def create_unique_word_dict(text):
    
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    #words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })

    return unique_word_dict 

In [31]:
champions_role_dict = create_unique_word_dict(words)
champions_role_dict

{'TOPMaster': 0,
 'SUPShaco': 1,
 'TOPYorick': 2,
 'SUPSivir': 3,
 'MIDRengar': 4,
 'BOTSejuani': 5,
 'JGLFiddlesticks': 6,
 'MIDKatarina': 7,
 'MIDLeBlanc': 8,
 'MIDMaster': 9,
 'TOPOrnn': 10,
 'BOTQuinn': 11,
 'MIDRenekton': 12,
 'BOTNidalee': 13,
 'BOTSion': 14,
 "JGLKai'Sa": 15,
 'BOTLux': 16,
 'TOPWarwick': 17,
 'SUPTrundle': 18,
 'SUPLulu': 19,
 "MIDKha'Zix": 20,
 'TOPYone': 21,
 'BOTAshe': 22,
 'MIDWukong': 23,
 'JGLNasus': 24,
 'Yi': 25,
 'JGLMorgana': 26,
 'MIDNocturne': 27,
 'BOTRakan': 28,
 'JGLDraven': 29,
 'MIDSwain': 30,
 'JGLTrundle': 31,
 'JGLPoppy': 32,
 "SUPCho'Gath": 33,
 'TOPCassiopeia': 34,
 'BOTAhri': 35,
 'MIDRyze': 36,
 'MIDGraves': 37,
 'BOTOrnn': 38,
 'BOTSeraphine': 39,
 'JGLKindred': 40,
 'MIDSett': 41,
 'SUPGragas': 42,
 'TOPAlistar': 43,
 'MIDQuinn': 44,
 'SUPZoe': 45,
 'JGLJanna': 46,
 'TOPRiven': 47,
 'JGLKayle': 48,
 'JGLNidalee': 49,
 'SUPXayah': 50,
 'MIDRiven': 51,
 'TOPMiss': 52,
 'JGLIvern': 53,
 "TOPKha'Zix": 54,
 'TOPHeimerdinger': 55,
 'BOTBrand

In [32]:
banana = df_BLUE_RED.name_with_role_x[0].split(' ')
banana[0]
#def replace_words_dict(df, dictionary):   

'TOPTrundle'

df_BLUE_RED.name_with_role_x.apply()

TOPTrundle JGLGraves MIDLissandra BOTLucian = ()

# ML model

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_BLUE_RED.name_with_role_x)
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
#vectorizer.get_feature_names_out()

Unnamed: 0,botaatrox,botahri,botakali,botalistar,botaphelios,botashe,botaurelion,botazir,botbard,botblitzcrank,...,topyasuo,topyone,topyorick,topzac,topzed,topzilean,willump,yi,zhao,zix
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35316,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35317,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


import sklearn
sklearn.__version__

!pip install scikit-learn==1.0.2

## Logit pipe w/MinMax Scaler, SimpleImputer, OneHot Encoder

In [66]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn import set_config; set_config(display='diagram')

# Impute then Scale for numerical variables: 
num_transformer = make_pipeline(
                    SimpleImputer(strategy = 'mean'),
                    MinMaxScaler())

# Encode categorical variables
cat_transformer = OneHotEncoder(sparse = False, handle_unknown='ignore')
champion_transformer = CountVectorizer()

# Paralellize "num_transformer" and "One hot encoder"
preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    #(cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    (champion_transformer, 'name_with_role_x'),
    (champion_transformer, 'name_with_role_y'),
    remainder='passthrough')

#add model
pipe = make_pipeline(preproc, LogisticRegression(solver='liblinear'))
pipe

pipe.get_feature_names_out()

In [73]:
df_BLUE_RED.win_x.unique()

array([5, 0])

In [74]:
from sklearn.preprocessing import LabelEncoder

y = LabelEncoder().fit(df_BLUE_RED.win_x).transform(df_BLUE_RED.win_x)
X_pre = df_BLUE_RED.drop('win_x', axis=1)#.groupby('game_id').sum()
X = X_pre

In [75]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35320 entries, 0 to 35319
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gold_15_x              35320 non-null  float64
 1   kills_assists_15_x     35320 non-null  float64
 2   deaths_15_x            35320 non-null  float64
 3   total_gold_x           35320 non-null  int64  
 4   total_cs_x             35320 non-null  int64  
 5   total_kills_x          35320 non-null  int64  
 6   total_monster_kills_x  35320 non-null  int64  
 7   total_assists_x        35320 non-null  int64  
 8   total_deaths_x         35320 non-null  int64  
 9   total_damage_taken_x   35320 non-null  int64  
 10  total_damage_dealt_x   35320 non-null  int64  
 11  name_with_role_x       35320 non-null  object 
 12  gold_15_y              35320 non-null  float64
 13  kills_assists_15_y     35320 non-null  float64
 14  deaths_15_y            35320 non-null  float64
 15  to

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [77]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23664 entries, 20970 to 15795
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gold_15_x              23664 non-null  float64
 1   kills_assists_15_x     23664 non-null  float64
 2   deaths_15_x            23664 non-null  float64
 3   total_gold_x           23664 non-null  int64  
 4   total_cs_x             23664 non-null  int64  
 5   total_kills_x          23664 non-null  int64  
 6   total_monster_kills_x  23664 non-null  int64  
 7   total_assists_x        23664 non-null  int64  
 8   total_deaths_x         23664 non-null  int64  
 9   total_damage_taken_x   23664 non-null  int64  
 10  total_damage_dealt_x   23664 non-null  int64  
 11  name_with_role_x       23664 non-null  object 
 12  gold_15_y              23664 non-null  float64
 13  kills_assists_15_y     23664 non-null  float64
 14  deaths_15_y            23664 non-null  float64
 15

In [78]:
np.shape(X_train), np.shape(y_train)

((23664, 24), (23664,))

In [80]:
pipe.steps[0]

('columntransformer',
 ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer()),
                                                  ('minmaxscaler',
                                                   MinMaxScaler())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f5e9fef11f0>),
                                 ('countvectorizer-1', CountVectorizer(),
                                  'name_with_role_x'),
                                 ('countvectorizer-2', CountVectorizer(),
                                  'name_with_role_y')]))

In [81]:
# Train pipeline
pipe.fit(X_train,y_train)

# Score model
pipe.score(X_test,y_test)

0.9835277968428278

In [82]:
from sklearn.model_selection import cross_val_score

# Cross validate pipeline
cross_val_score(pipe, X_train, y_train, cv=10, scoring='accuracy').mean()

0.9808571007167087

In [None]:
pipe.evaluate()

In [83]:
# Check predictions
pipe.predict(X_test.iloc[0:5]), y_test[0:5]

(array([0, 0, 1, 0, 1]), array([0, 0, 1, 0, 1]))

In [84]:
from sklearn import metrics

predicted = pipe.predict(X_test)
print(metrics.accuracy_score(y_test, predicted))
print(metrics.classification_report(y_test, predicted)) 

0.9835277968428278
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5407
           1       0.98      0.99      0.98      6249

    accuracy                           0.98     11656
   macro avg       0.98      0.98      0.98     11656
weighted avg       0.98      0.98      0.98     11656



In [85]:
np.unique(predicted)

array([0, 1])

In [86]:
model = pipe[-1]

In [87]:
model.coef_

array([[-5.00182428e-09,  2.67984178e-08, -1.37063086e-08, ...,
         9.18886419e-07,  3.25022064e-05,  2.97880836e-06]])