In [88]:
import pandas as pd
import numpy as np
import sklearn.linear_model as lm
from scipy.stats import ks_2samp

# Multitype Columns Issue
- When loading the CSV file, some columns turn out to hold values of more than one type (multi-type columns).
- Looking at the columns with multiple values, it looks like it's multi type because of the NaN values. Good opportunity to ask students to investigate some of this missingness such that we can have non-multitype columns.
    - From first viewing, I can assume that most of the missingness is missing by design. Some of the player names are missing because there are rows that represent the statistics of a whole team in a game (shown later).

In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids the mixed-type DtypeWarning this load otherwise emits.
league = pd.read_csv(
    "./2022_LoL_esports_match_data_from_OraclesElixir_20221023.csv",
    low_memory=False,
)

  league = pd.read_csv("./2022_LoL_esports_match_data_from_OraclesElixir_20221023.csv")


In [36]:
# Tabulate, for every column, how many cells hold each Python type
type_counts = league.applymap(type).apply(pd.value_counts).fillna(0)
# A column is "multi type" when more than one type has a non-zero count
n_types_per_column = (type_counts > 0).sum()
my_cols = league.columns[n_types_per_column > 1].values
my_cols

array(['url', 'split', 'playername', 'playerid', 'teamname', 'teamid',
       'champion', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5'], dtype=object)

In [18]:
# Per-type cell counts for the whole frame, then narrowed to the multi-type columns
type_counts_all = league.applymap(type).apply(pd.value_counts).fillna(0)
type_counts_all.loc[:, my_cols]

Unnamed: 0,url,split,playername,playerid,teamname,teamid,champion,ban1,ban2,ban3,ban4,ban5
<class 'str'>,21480.0,96924.0,118100.0,116495.0,141666.0,140064.0,118100.0,139944.0,140040.0,139710.0,139860.0,139554.0
<class 'float'>,120240.0,44796.0,23620.0,25225.0,54.0,1656.0,23620.0,1776.0,1680.0,2010.0,1860.0,2166.0
<class 'int'>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# The missing values in this column look like a good opportunity to investigate MCAR
champion_values = league["champion"]
champion_values.sample(n=100)

57964     Alistar
51962       Sylas
82585       Viego
26382     Hecarim
28948         Lux
           ...   
46890       Viego
37723     Orianna
127387      Sylas
107855        NaN
40457        Gwen
Name: champion, Length: 100, dtype: object

# Investigating Data by Game
- For each game there are 5 players on each team, thus we suspect that each unique gameid has 10 corresponding rows.
    - However, there are 12 rows per game: the two additional rows hold aggregate team results, which is great for students who want to analyze the data by team.

In [49]:
# count() skips NaN, so first find the columns that have no NaN at all
has_no_missing = league.notna().all()
league.columns[has_no_missing]

Index(['gameid', 'datacompleteness', 'league', 'year', 'playoffs', 'date',
       'game', 'participantid', 'side', 'position', 'gamelength', 'result',
       'kills', 'deaths', 'assists', 'teamkills', 'teamdeaths', 'team kpm',
       'ckpm', 'totalgold'],
      dtype='object')

In [51]:
# Every gameid turns out to have exactly 12 rows
rows_per_game = league.groupby("gameid")["participantid"].count()
rows_per_game.unique()

array([12], dtype=int64)

In [68]:
# Rows with a NaN playername seem to contain the aggregate info for a team; note the
# unique "team" position value, in contrast to the typical 5 player roles
# (top, jungle, mid, bottom, support).
selected_game = "ESPORTSTMNT01_2690210"
summary_columns = ["playername","teamname","kills","teamkills","position","result"]
in_selected_game = league["gameid"] == selected_game
game_info = league.loc[in_selected_game, summary_columns]
game_info

Unnamed: 0,playername,teamname,kills,teamkills,position,result
0,Soboro,Fredit BRION Challengers,2,9,top,0
1,Raptor,Fredit BRION Challengers,2,9,jng,0
2,Feisty,Fredit BRION Challengers,2,9,mid,0
3,Gamin,Fredit BRION Challengers,2,9,bot,0
4,Loopy,Fredit BRION Challengers,1,9,sup,0
5,DnDn,Nongshim RedForce Challengers,1,19,top,1
6,Sylvie,Nongshim RedForce Challengers,4,19,jng,1
7,FIESTA,Nongshim RedForce Challengers,6,19,mid,1
8,vital,Nongshim RedForce Challengers,8,19,bot,1
9,Blessing,Nongshim RedForce Challengers,0,19,sup,1


In [80]:
# Some columns are missing by design (MD), as the "position" value shows: team rows map
# directly onto missingness in the following columns, which makes sense because they are
# player-only statistics.
team_rows = league.loc[league["position"] == "team"]
league.columns[team_rows.isna().all()]

Index(['playername', 'playerid', 'champion', 'firstbloodkill',
       'firstbloodassist', 'firstbloodvictim', 'damageshare',
       'earnedgoldshare', 'total cs'],
      dtype='object')

## Trying to find MAR and NMAR data columns
- Note that this dataset has a `datacompleteness` column that flags rows with missing data. We may tell students to ignore this column when determining NMAR and MAR: all it records is that the data is missing, so concluding that a column is MAR with respect to `datacompleteness` is redundant.

In [182]:
def assess_missingness(data, focus_col, compare_col, stat="tvd", n_repetitions=1000, seed=None):
    """Return a p-value for whether the missingness of ``focus_col`` depends on ``compare_col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    focus_col : str
        Column whose missingness (``isna()``) is being studied.
    compare_col : str
        Column whose distribution is compared between the rows where
        ``focus_col`` is missing and the rows where it is present.
    stat : {"tvd", "ks"}
        ``"tvd"`` runs a permutation test on the total variation distance
        (for a categorical ``compare_col``); ``"ks"`` runs a two-sample
        Kolmogorov-Smirnov test (for a numeric ``compare_col``).
    n_repetitions : int
        Number of permutations for the ``"tvd"`` test.
    seed : int, optional
        Seed for the permutations. When omitted, numpy's global random state
        is used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    float
        The p-value of the chosen test.

    Raises
    ------
    ValueError
        If ``stat`` is not ``"tvd"`` or ``"ks"``, or if the KS test is requested
        and one of the two groups has no non-NaN ``compare_col`` values.
    """
    if stat not in ("tvd", "ks"):
        raise ValueError(f"stat must be 'tvd' or 'ks', got {stat!r}")

    missing_col = focus_col + '_missing'
    # Only these two columns are needed; avoids copying (and shuffling) the whole frame.
    frame = data[[compare_col]].copy()
    frame[missing_col] = data[focus_col].isna()

    def tvd(frame):
        """TVD between the compare_col distributions of the missing and present groups."""
        # fill_value=0: a category absent from one group must count as proportion 0,
        # not NaN, otherwise its contribution is silently dropped from the sum.
        counts = frame.groupby([missing_col, compare_col]).size().unstack(fill_value=0)
        if len(counts) < 2:
            # Only one group (all missing or none missing): nothing to compare.
            return 0.0
        proportions = counts.div(counts.sum(axis=1), axis=0)
        return proportions.diff().iloc[-1].abs().sum() / 2

    if stat == "ks":
        # NaN in the compared values would make the KS statistic meaningless.
        observed = frame.dropna(subset=[compare_col])
        is_missing = observed[missing_col]
        missing_values = observed.loc[is_missing, compare_col]
        present_values = observed.loc[~is_missing, compare_col]
        if missing_values.empty or present_values.empty:
            raise ValueError(
                "KS test needs non-NaN compare_col values in both the missing "
                "and the non-missing group"
            )
        return ks_2samp(missing_values, present_values).pvalue

    rng = np.random if seed is None else np.random.default_rng(seed)
    obs_tvd = tvd(frame)
    tvds = np.empty(n_repetitions)
    for i in range(n_repetitions):
        # Shuffle the compared column to break any link with the missingness indicator
        frame[compare_col] = rng.permutation(frame[compare_col].to_numpy())
        tvds[i] = tvd(frame)
    # Share of shuffled statistics at least as extreme as the observed one
    return np.mean(tvds >= obs_tvd)

In [183]:
# Missingness of the split depends on the league in which the game was held.
# This makes sense because some leagues may not define their tourneys by splits.
assess_missingness(league, focus_col="split", compare_col="league")

0.0

In [184]:
# Missingness of the split does not depend on the number of dragons a player or team killed
assess_missingness(league, focus_col="split", compare_col="dragons", stat="ks")

0.9999744015450113

## Potential Lines of inquiry for Project 3
- Looking at [tier one professional leagues](https://en.wikipedia.org/wiki/List_of_League_of_Legends_leagues_and_tournaments), which league has the most action-packed games, and is that league's level of action significantly different from the others?
    - We will look at average kills per minute (kpm) and use a ks stat
- Who "carries" their team more often, ADCs (Bot lanes) or Mid laners?
    - We will look at gold per minute (gpm)
- Is the average win rate of my favorite champion (Talon) different or similar to the average win rate of all other champions in the dataset?
     - We will obviously look at winrates by averaging the results column