# Preprocessing match data
In this notebook we preprocess the ranked_matches.csv file to obtain a suitable input file for the models we want to run.

The input is the ranked_matches.csv file generated by the get_match_data.ipynb notebook.

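The output is a dataframe with one row per match. It has a BlueWin column (1 for a blue win, 0 for a red win), together with one column per champion that encodes the team compositions: +1 if the champion is on the blue team, -1 if it is on the red team, and 0 if it was not picked. The result is saved as featureslabels.csv.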

### TO DO
1. Add utilities file

### 1. Import packages and data

In [1]:
"""
@author: Mark Bugden
March 2023

Part of a ML project in predicting win rates for League of Legends games based on team composition.
Current update available on GitHub: https://github.com/Mark-Bugden
"""

# Import necessary packages
import requests
import pandas as pd
from ratelimit import limits, sleep_and_retry
import pickle
import math
import numpy as np
from matplotlib import pyplot as plt
import os
import glob
import h5py



from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# This gives us a progress bar for longer computations. 
from tqdm.notebook import tqdm
# To use it, just wrap any iterable with tqdm(iterable).
# Eg: 
# for i in tqdm(range(100)):
#     ....

In [2]:
# Put the location of the data folder on your computer.
# To avoid editing the notebook on every machine, the folder can also be supplied
# through the LOL_DATA_LOCATION environment variable; the literal below is only
# the fallback used when that variable is unset or empty.
data_location = os.environ.get('LOL_DATA_LOCATION') or 'C:\\Users\\Mark\\Code\\LoL Win Prediction\\Data Collection\\'
# File names are appended to this string directly, so it must end with a path separator.
if not data_location.endswith(('\\', '/')):
    data_location += os.sep
# Put the location of utilities file
# util_location = ''

In [3]:
# Here are the tiers and divisions
tier_list = ['DIAMOND', 'PLATINUM', 'GOLD', 'SILVER', 'BRONZE', 'IRON']
division_list = ['I', 'II', 'III', 'IV']

# Load the champion information (the data version, 12.14.1, is fixed in the URL)
champion_url = 'http://ddragon.leagueoflegends.com/cdn/12.14.1/data/en_US/champion.json'
# Without a timeout a stalled connection would block the notebook indefinitely
r = requests.get(champion_url, timeout=30)
# Stop here with a clear HTTP error rather than failing later while parsing the body
r.raise_for_status()
json_data = r.json()
champion_data = json_data['data']

champions = list(champion_data.keys())
num_champs = len(champions)

# For some reason Fiddlesticks is listed as FiddleSticks in some of the other data.
# To avoid problems like this, all champion names are converted to lowercase.
# Note that the keys of champion_data itself keep their original capitalisation.
champions = [champ.lower() for champ in champions]

# Lookup tables between a (lowercase) champion name and its position in `champions`
champ_to_num = {champ: idx for idx, champ in enumerate(champions)}
num_to_champ = dict(enumerate(champions))

# We can get champion information by accessing the champion_data dict
# Eg:
# champion_data['Zyra']

In [5]:
# Read the csv file written by the Get_Match_Data iPython notebook and, in the same
# step, lowercase the champion names so that they line up with our champions list.
ranked_matches_path = data_location + 'ranked_matches.csv'
rankeddf = pd.read_csv(ranked_matches_path).assign(
    championName=lambda frame: frame['championName'].str.lower()
)

### 2. Format the data

In [6]:
# Here are three matches
rankeddf = rankeddf.sort_values(by=['matchId', 'team'])
rankeddf.head(20)

Unnamed: 0,matchId,team,win,championName,summonerName,gameMode
27250,EUN1_2670740139,Blue,True,camille,kisielsniper,420
27251,EUN1_2670740139,Blue,True,drmundo,ZaCarryDeBoteau,420
27252,EUN1_2670740139,Blue,True,katarina,mD czarny,420
27253,EUN1_2670740139,Blue,True,lucian,Voooler,420
27254,EUN1_2670740139,Blue,True,maokai,TabzFN,420
27255,EUN1_2670740139,Red,False,volibear,TheOneBlesser,420
27256,EUN1_2670740139,Red,False,khazix,ZiobroTyKurwiu,420
27257,EUN1_2670740139,Red,False,fizz,Chrobinho,420
27258,EUN1_2670740139,Red,False,missfortune,Matyaskapln,420
27259,EUN1_2670740139,Red,False,rell,Damuselll,420


In [7]:
# Size of the raw table as (number of rows, number of columns)
rankeddf.shape

(213330, 6)

In [8]:
def process_dataframe(dataframe):
    '''
    Takes in the rankeddf dataframe consisting of all the ranked match data (one row per
    player per match) and collapses it into one row of features and label per match.

    The input must contain the columns 'matchId', 'team' ('Blue' or 'Red'), 'win'
    (assumed to be boolean), 'championName', 'summonerName' and 'gameMode'.
    The input dataframe itself is not modified.

    Steps:
    1. One-hot encode the championName column, storing +1 if they are on the Blue team and -1 if they are on the Red team
    2. Replace the 'team' and 'win' columns with a single column, 'BlueWin' (1 if blue team won, 0 if red team won).
    3. Drop any unnecessary columns (summonerName and gameMode)
    4. groupby matchId and sum, so that every match becomes a single row

    Returns a dataframe indexed by matchId, with the 'BlueWin' column followed by one
    'championName_<name>' column per champion present in the input.
    '''
    champion_prefix = 'championName_'

    # Step 1
    # One-hot encode the championName using pd.get_dummies.
    # dtype=int is requested explicitly: recent pandas versions return boolean dummies,
    # which cannot hold the -1 used to mark the Red team.
    df = pd.get_dummies(dataframe, columns=['championName'], dtype=int)
    # Store all the one-hot encoded champion columns in a list (match on the prefix
    # that get_dummies adds, not on any column that merely contains the word)
    championName_enc = [col for col in df.columns if str(col).startswith(champion_prefix)]
    # If team == Red, multiply the one-hot encoded championNames by -1
    df.loc[df['team'] == 'Red', championName_enc] *= -1

    # Step 2
    # Every row of a match carries the same information about the outcome:
    # Blue won if this row is a winning Blue player or a losing Red player.
    blue_team_won = (df['team'] == 'Blue') & (df['win'] == True)
    red_team_lost = (df['team'] == 'Red') & (df['win'] == False)
    # to_numpy() makes the insertion positional, so a non-unique index cannot break it
    df.insert(3, 'BlueWin', (blue_team_won | red_team_lost).astype(int).to_numpy())

    # Step 3
    # Drop the columns that are no longer needed
    df = df.drop(['team', 'win', 'summonerName', 'gameMode'], axis=1)

    # Step 4
    # groupby 'matchId' and add up the signed one-hot rows of each match.
    matches = df.groupby('matchId')
    result = matches.sum()
    # Take the per-match maximum of the 0/1 indicator instead of dividing the sum by 10,
    # so the label stays correct even for matches that do not have exactly 10 rows.
    result['BlueWin'] = matches['BlueWin'].max().astype(int)
    return result

In [9]:
%%time
# Collapse the per-player rows of rankeddf into one feature/label row per match
df = process_dataframe(rankeddf)
# Preview the first ten matches of the processed table
df.head(10)

  exec(code, glob, local_ns)


Wall time: 1.44 s


Unnamed: 0_level_0,BlueWin,championName_aatrox,championName_ahri,championName_akali,championName_akshan,championName_alistar,championName_amumu,championName_anivia,championName_annie,championName_aphelios,...,championName_yone,championName_yorick,championName_yuumi,championName_zac,championName_zed,championName_zeri,championName_ziggs,championName_zilean,championName_zoe,championName_zyra
matchId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EUN1_2670740139,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
EUN1_2671611908,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
EUN1_2671704791,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
EUN1_2672157655,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,-1,0,0,0
EUN1_2673422946,0,0,0,0,0,0,0,0,0,0,...,-1,0,1,0,0,0,0,0,0,0
EUN1_2675095228,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
EUN1_2676593461,1,0,0,0,0,0,0,0,0,0,...,-1,0,-1,0,0,0,0,0,0,0
EUN1_2676803554,1,0,0,-1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
EUN1_2677495185,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
EUN1_2678323705,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,-1,0,0,0,0,0


In [10]:
# Save the processed table; the relative path means the file is written to the
# current working directory.
df.to_csv('featureslabels.csv')