In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# NOTE(review): `esports2023` is first assigned in the following cell, so on a
# fresh kernel (Restart & Run All) this line raises NameError. `team` is built
# again from the reloaded data in the visualization section; this cell looks
# like leftover kernel state — confirm and remove.
team = esports2023[esports2023["position"] == "team"].reset_index(drop = True).drop("position", axis = 1)

In [None]:
# Read the 2023 and 2024 match exports into one frame per season.
esports2023, esports2024 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "2023_LoL_esports_match_data_from_OraclesElixir.csv",
        "2024_LoL_esports_match_data_from_OraclesElixir.csv",
    )
)

### Preliminary EDA and Data Cleaning

Make sure we get data that is "complete" by OraclesElixir standards

In [None]:
# Keep only rows whose datacompleteness flag is exactly "complete";
# any other value (including a missing flag) is discarded.
esports2023 = esports2023[esports2023["datacompleteness"] == "complete"]
esports2024 = esports2024[esports2024["datacompleteness"] == "complete"]

Filter only by major regions. Since LPL is all incomplete, we will just not have LPL statistics

In [None]:
# Restrict both seasons to the LCS and LEC leagues.
major_leagues = ["LCS", "LEC"]
esports2023 = esports2023[esports2023["league"].isin(major_leagues)]
esports2024 = esports2024[esports2024["league"].isin(major_leagues)]

Since we are only interested in predicting matches given early game statistics, let's only look at the columns that are helpful for that. Furthermore, let's take away the opponent stats so that every game in this data set looks like its own separate game (which we know to be false).

In [None]:
# Show every column name of the 2023 frame as a single-row array.
np.asarray(esports2023.columns).reshape(1, -1)

#### 2024 data cleaning

### 2023

The most obvious explanation as to why the columns have NaNs is simply because the team did not get that particular objective. As such, we will replace those with 0s.

In [None]:
# Apply the same in-place cleaning to both seasons: missing values become 0,
# then the literal values "Blue" / "Red" become 0 / 1 wherever they occur.
for season_frame in (esports2023, esports2024):
    season_frame.fillna(0, inplace=True)
    season_frame.replace(["Blue", "Red"], [0, 1], inplace=True)

In [None]:
# Per game, gather the champion and team-name value of every row in file order.
# NOTE(review): the positions used below assume each game's rows come as five
# blue-side players, five red-side players, then the team rows — confirm
# against the Oracle's Elixir export.
champion_list_2023 = esports2023.groupby('gameid')['champion'].apply(list)
teamname_list_2023 = esports2023.groupby('gameid')['teamname'].apply(list)

# champ1..champ5 take list entries 0-4, oppchamp1..oppchamp5 take entries 5-9.
draft_columns_2023 = [f"champ{n}" for n in range(1, 6)] + [f"oppchamp{n}" for n in range(1, 6)]

# Only the blue-side (side == 0) team summary rows receive the draft columns.
blue_team_rows_2023 = (esports2023['side'] == 0) & (esports2023['position'] == 'team')

for gameid, champions in champion_list_2023.items():
    # Build the per-game row mask once instead of once per assigned column.
    target_rows = blue_team_rows_2023 & (esports2023['gameid'] == gameid)
    for offset, column in enumerate(draft_columns_2023):
        esports2023.loc[target_rows, column] = champions[offset]
    # Look the opponent up by game id instead of a running integer counter:
    # the Series is indexed by gameid, so an integer key only worked through
    # pandas' deprecated positional fallback.
    esports2023.loc[target_rows, 'oppteamname'] = teamname_list_2023[gameid][-1]

In [None]:
# Columns kept for the 2023 modeling frame.
# The draft columns written onto the blue-side team rows (champ1..5,
# oppchamp1..5, oppteamname) are listed explicitly: the label-encoding cells
# further down read them from team2023, so selecting only the raw columns
# would silently throw them away.
esports2023 = esports2023[["result",
                           "gameid",
                           "champion",
                           "teamname",
                           "side",
                           "position",
                           "playoffs",
                           "game",
                           "teamkills",
                           "teamdeaths",
                           "assists",
                           "firstblood",
                           "firstdragon",
                           "dragons",
                           "opp_dragons",
                           "heralds",
                           "opp_heralds",
                           "barons",
                           "opp_barons",
                           "firstherald",
                           "firsttower",
                           "towers",
                           "firstbloodvictim",
                           "champ1",
                           "champ2",
                           "champ3",
                           "champ4",
                           "champ5",
                           "oppchamp1",
                           "oppchamp2",
                           "oppchamp3",
                           "oppchamp4",
                           "oppchamp5",
                           "oppteamname"
                          ]]

In [None]:
from copy import deepcopy

# One row per team per game: keep the "team" summary rows and drop the
# position marker, which is constant after the filter.
team2023 = esports2023[esports2023["position"] == "team"].reset_index(drop = True).drop("position", axis = 1)
# Drop gameid from team2023 itself. The previous code dropped it from `team`,
# a different frame, which threw away the frame built on the line above.
team2023 = team2023.drop('gameid', axis=1)

In [None]:
# Remove rows that still contain missing values, then drop the side column.
# Both steps operate on team2023; the previous code reassigned team2023 from
# `team`, which discarded the dropna result.
# 'champion' and 'playoffs' stay in the frame (their drops were already disabled).
team2023 = team2023.dropna()
team2023 = team2023.drop('side', axis=1)

In [None]:
# List the columns of the 2023 modeling frame built in the cells above
# (previously this inspected `team`, which is not the frame being prepared here).
np.array(team2023.columns)

In [None]:
# Display the current 2023 frame for a visual check.
esports2023

In [None]:
# 2024 modeling frame: only the result, identifiers and draft information are
# selected; the in-game statistics (playoffs, game, kills, objectives, ...)
# are not selected for this season.
esports2024 = esports2024[["result",
                           "gameid",
                           "champion",
                           "teamname",
                           "side",
                           "position",
                           ]]

# Missing values become 0 and the literal side values "Blue" / "Red" become 0 / 1.
esports2024 = esports2024.fillna(0).replace(["Blue", "Red"], [0, 1])

# Per game, gather the champion and team-name value of every row in file order.
# NOTE(review): the positions used below assume each game's rows come as five
# blue-side players, five red-side players, then the team rows — confirm
# against the Oracle's Elixir export.
champion_list_2024 = esports2024.groupby('gameid')['champion'].apply(list)
teamname_list_2024 = esports2024.groupby('gameid')['teamname'].apply(list)

# champ1..champ5 take list entries 0-4, oppchamp1..oppchamp5 take entries 5-9.
draft_columns_2024 = [f"champ{n}" for n in range(1, 6)] + [f"oppchamp{n}" for n in range(1, 6)]

# Only the blue-side (side == 0) team summary rows receive the draft columns.
blue_team_rows_2024 = (esports2024['side'] == 0) & (esports2024['position'] == 'team')

for gameid, champions in champion_list_2024.items():
    # Build the per-game row mask once instead of once per assigned column.
    target_rows = blue_team_rows_2024 & (esports2024['gameid'] == gameid)
    for offset, column in enumerate(draft_columns_2024):
        esports2024.loc[target_rows, column] = champions[offset]
    # Look the opponent up by game id instead of a running integer counter:
    # the Series is indexed by gameid, so an integer key only worked through
    # pandas' deprecated positional fallback.
    esports2024.loc[target_rows, 'oppteamname'] = teamname_list_2024[gameid][-1]

# One row per team per game. Rows that never received draft columns still hold
# NaN there and are removed by dropna.
# 'playoffs' is not dropped any more: it is never selected above, so dropping
# it raised KeyError.
team2024 = (
    esports2024[esports2024["position"] == "team"]
    .reset_index(drop = True)
    .drop(columns = ["position", "gameid"])
    .dropna()
    .drop(columns = ["champion", "side"])
)

### Data Visualization

How many times does Blue side win vs Red Side?

In [None]:
# Reload the full 2023 export and repeat the completeness and league filters.
esports2023 = pd.read_csv("2023_LoL_esports_match_data_from_OraclesElixir.csv")
is_complete = esports2023["datacompleteness"] == "complete"
in_major_league = esports2023["league"].isin(["LCS", "LEC"])
esports2023 = esports2023[is_complete & in_major_league]

# Same in-place cleaning for both seasons: NaN -> 0, "Blue"/"Red" -> 0/1.
for season_frame in (esports2023, esports2024):
    season_frame.fillna(0, inplace=True)
    season_frame.replace(["Blue", "Red"], [0, 1], inplace=True)

In [None]:
# Keep the outcome, side, position and the early-game columns used by the plots.
esports2023 = esports2023.loc[:, [
    "result",
    "side",
    "position",
    "firstblood",
    "firstbloodvictim",
    "firstdragon",
    "firstherald",
    "firsttower",
    "golddiffat10",
    "xpdiffat10",
    "csdiffat10",
    "killsat10",
    "assistsat10",
    "deathsat10",
]]

In [None]:
# One frame per position value, re-indexed from 0 and without the position column.
top, jng, mid, bot, sup, team = (
    esports2023.loc[esports2023["position"] == role]
    .drop(columns="position")
    .reset_index(drop=True)
    for role in ("top", "jng", "mid", "bot", "sup", "team")
)

In [None]:
def copy_over():
    """Return the frames stored in the module-level ``lanes`` dict.

    Returns:
        A 6-tuple in the fixed order top, jng, mid, bot, sup, team.
    """
    role_order = ("top", "jng", "mid", "bot", "sup", "team")
    return tuple(lanes[role] for role in role_order)

Finally, notice how each game is not independent of each other, despite all the cleaning we have done. This is partly because we have information of each lane. The way I will fix this is by separating by lane.

In [None]:
# Look up each per-position frame by its position name.
lanes = dict(top=top, jng=jng, mid=mid, bot=bot, sup=sup, team=team)

In [None]:
# For every position frame, print the distinct values of the objective columns.
inspected_columns = {
    "First Dragon": "firstdragon",
    "First Herald": "firstherald",
    "First Tower": "firsttower",
    "First Blood Victim": "firstbloodvictim",
}
for role, lane_frame in lanes.items():
    for label, column in inspected_columns.items():
        print(f"{label} {role} unique values: {lane_frame[column].unique()}")
    print("\n")

In [None]:
# Remove the objective flags from every lane frame except "team".
# The drop is done in place, so it changes the frames held in `lanes` directly.
for role, lane_frame in lanes.items():
    if role == "team":
        continue
    lane_frame.drop(columns=["firstdragon", "firstherald", "firsttower"], inplace=True)

In [None]:
# Display the team-level frame before firstbloodvictim is removed from it.
lanes["team"]

In [None]:
# Remove firstbloodvictim from the team frame in place.
lanes["team"].drop(columns="firstbloodvictim", inplace=True)

In [None]:
# Rebind the per-position names to the frames currently stored in `lanes`.
top, jng, mid, bot, sup, team = copy_over()

Let's calculate blue side's win percentage

In [None]:
# Total wins per side; row 0 is side 0 (Blue) and row 1 is side 1 (Red).
winsbyside = team.groupby("side", as_index=False)["result"].sum()
blue_wins, red_wins = winsbyside["result"].iloc[0], winsbyside["result"].iloc[1]

In [None]:
# Blue side's share of all wins, as a percentage rounded to two decimals.
f"{round(blue_wins / (blue_wins + red_wins) * 100, 2)}%"

In [None]:
# Bar chart of the number of wins per side (0 = Blue, 1 = Red), saved to disk.
fig, ax = plt.subplots()
ax.bar(winsbyside["side"], winsbyside["result"])
ax.set_xticks(winsbyside["side"])
ax.set_xlabel("Side")
ax.set_ylabel("Number of Wins")
ax.set_title("Blue Side vs Red Side Wins")
fig.savefig("./side selection vs wins./side_vs_num_wins_graph.png", dpi = 150, bbox_inches="tight", facecolor='white', transparent=False)
plt.show()

What is the percentage of first blood success relative to winning or losing a match?

In [None]:
# Share of first-blood outcomes within each match result (each result row sums to 1).
first_blood_by_result = pd.crosstab(team["result"], team["firstblood"], normalize='index')
ax = first_blood_by_result.plot.bar()
ax.set_title("team first blood success")
ax.set_ylabel("percentage of games")
for bar_group in ax.containers:
    ax.bar_label(bar_group)
ax.figure.savefig("./firstbloodrate vs wins./team first blood success.png", dpi = 150, bbox_inches="tight", facecolor='white', transparent=False)

As expected, when a team gets first blood, they are more likely to win.

What about per role? What is the percentage of first blood success relative to winning or losing a match for each role?

In [None]:
def fbinfo(df, dfname):
    """Plot and save the first-blood share within each match result.

    Args:
        df: frame with ``result`` and ``firstblood`` columns.
        dfname: used as the plot title and as the PNG file name inside
            the "./firstbloodrate vs wins./" folder.
    """
    shares = pd.crosstab(df["result"], df["firstblood"], normalize='index')
    axes = shares.plot.bar()
    axes.set_title(dfname)
    axes.set_ylabel("Percentage of Games")
    for bar_group in axes.containers:
        axes.bar_label(bar_group)
    axes.figure.savefig("./firstbloodrate vs wins./" + dfname + ".png", dpi = 150, bbox_inches="tight", facecolor='white', transparent=False)

In [None]:
# One first-blood chart per individual role; the team frame is skipped.
for role, lane_frame in lanes.items():
    if role != "team":
        fbinfo(lane_frame, f"{role} first blood success")

Above are bar graphs that show the percentages of wins and losses relative to the first blood rate in each role. So for example, of all losses in the dataset (represented by the 0), around 86.7% did not have the top laner involved in the first blood while around 13.3% of lost games did have the top laner involved in the first bloods. Of all wins in the dataset (represented by the 1), around 80.8% did not have the top laner involved and around 19.2% did have the top laner involved.

Looking at the trends of each graph, it is clear that the more first bloods a team has, the more likely they are to win. This can be for a variety of reasons. Better teams may just be more likely to score first bloods since they are better. Or it could be the fact that first bloods do, indeed, give a big enough advantage that allows teams to score the win.

However, I am skeptical of the latter. While it is true that when a player scores first blood, they trend towards winning more than their counterparts, more often than not, a player does not score first blood and still wins the game (as evident by the fact that the blue bar in the wins for all roles is higher than the orange bar). This is because a League of Legends game is not decided by first blood alone. There are many factors, other than first blood, that determine a win. Thus I am inclined to believe that first blood, by itself, is not a good indicator of games that are won. The team category (which takes into account a whole team's first blood rate), however, is different. It is obvious that when a team scores first blood, they are more likely to win: 58.6%. If the team does not score first blood, their chances of winning drop to 40.8% (if we were to ignore all other variables). This is the only category where scoring a first blood leads to a higher likelihood of winning a game than not winning a game.

Perhaps a more interesting thing to do is to compare first blood rates between roles. Jungle is by far the role that is involved in the most first bloods, as evidenced by the taller orange bar in wins in comparison to all the other roles. This makes sense, as junglers are the focal point of early games for competitive teams. If we were to try to predict whether or not a game is won or lost using only first bloods and roles, we should look at a jungler's capability to score that first kill for their team.

What is the percentage of getting the first dragon, first herald, or first tower relative to winning or losing a match?

In [None]:
# Plot, for each first-objective flag, its share within each match result.
# Columns are addressed by name instead of by position (team.columns[3:6]),
# so the charts no longer depend on the frame's column order or on which
# columns were dropped in earlier cells.
objective_columns = {
    "First Dragon": "firstdragon",
    "First Herald": "firstherald",
    "First Tower": "firsttower",
}
titles = list(objective_columns)
for title, column in objective_columns.items():
    ax = pd.crosstab(team["result"], team[column], normalize='index').plot.bar()
    ax.set_ylabel("Percentage of Games")
    ax.set_title(title)
    for c in ax.containers:
        ax.bar_label(c)
    plt.savefig("./first objectives vs wins./" + title + ".png", dpi = 150, bbox_inches="tight", facecolor='white', transparent=False)

Of the games in the data set, 66.9% of the wins had the winning team score first dragon, 62.1% of the wins had the winning team score first herald, and 65.5% of wins had the winning team score first tower. This means that if we are to use these three categories to predict whether or not a team will win, we should first look at first tower, then first drake, and finally first herald. Furthermore, this shows us that first tower is one of the most important factors in winning games this season. When you score a first tower, you are more likely to win a game. This could be for a variety of reasons; it could be the fact that first towers do, indeed, lead teams to win more. However, what is more likely is that teams already have large leads (and thus were already projected to win) which lead them to getting the first tower. 

One surprising aspect of these graphs is how first herald and first tower do not seem to be that closely related. One would think that teams usually use first herald to take first tower. Thus, I would expect these two categories to be relatively close in percentages. However, getting first herald is the worst of the three objectives, with first dragon beating it by 5.3%.

Speaking of close relatedness, what is the correlation between each column in our data sets?

In [None]:
# List the columns that take part in the correlation heatmap below.
team.columns

In [None]:
# Annotated correlation heatmap of the team-level columns, saved to disk.
fig, ax = plt.subplots(figsize = (8, 8))
ax.set_title("Team")
sns.heatmap(team.corr(), annot = True, cmap = "rainbow", annot_kws = {"size":8}, ax = ax)
fig.savefig("./heatmaps./team heatmap.png", dpi = 150, bbox_inches="tight", facecolor='white', transparent=False)

Some interesting things to point out in this correlation plot: golddiffat10 is the column most strongly correlated with wins, at 0.42.

In [None]:
def createheatmaps(df, dfname):
    """Draw and save an annotated correlation heatmap of ``df``.

    Args:
        df: frame whose column-to-column correlations are plotted.
        dfname: used as the plot title and as the PNG file name inside
            the "./heatmaps./" folder.
    """
    figure, axes = plt.subplots(figsize = (8, 8))
    axes.set_title(dfname)
    sns.heatmap(df.corr(), annot = True, cmap = "rainbow", annot_kws = {"size":8}, ax = axes)
    figure.savefig("./heatmaps./" + dfname + ".png", dpi = 150, bbox_inches="tight", facecolor='white', transparent=False)

In [None]:
# One correlation heatmap per individual role; the team frame is skipped.
for role, lane_frame in lanes.items():
    if role != "team":
        createheatmaps(lane_frame, f"{role} heatmap")

### Modeling

In [None]:
from matplotlib.gridspec import GridSpec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn import metrics
from sklearn.inspection import permutation_importance

#### Standardize Data

In [None]:
def standardize_df(df, n_label_cols = 1):
    """Return ``df`` with its feature columns standardized.

    The first ``n_label_cols`` columns are passed through unchanged and every
    remaining column is scaled with a freshly fitted ``StandardScaler``.

    The label slice used to be bounded by a module-level ``index`` variable
    that an unrelated plotting loop happened to leave behind, so the number of
    pass-through columns depended on which cells had run. The scaled part is
    also rebuilt on ``df.index`` so the column-wise concat lines up row by row
    even when ``df`` does not have a default 0..n-1 index.

    Args:
        df: frame whose leading column(s) are labels and whose remaining
            columns are numeric features.
        n_label_cols: number of leading columns to leave unscaled (default 1).

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    labels = df.iloc[:, :n_label_cols]
    features = df.iloc[:, n_label_cols:]
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(features),
        columns = features.columns,
        index = df.index,
    )
    return pd.concat([labels, scaled], axis = 1)

In [None]:
# standardize numeric feature columns for team
# Only numeric columns are scaled, and the `result` label is excluded so it
# remains a 0/1 class label. The scaled values are assigned back by column:
# writing through `team2023.values[:]` is not guaranteed to reach the frame,
# because `.values` can be a copy.
# NOTE(review): the scaler is fit on the whole 2023 frame before the train/test
# split and is not applied to team2024 — confirm that is intended.
feature_columns_2023 = team2023.select_dtypes(include = "number").columns.drop("result", errors = "ignore")
if len(feature_columns_2023) > 0:
    team2023[feature_columns_2023] = StandardScaler().fit_transform(team2023[feature_columns_2023])

#### Champion IDs

In [None]:
# Champion names keyed by an integer id.
# Only the names are used below: `champion_list` is built from the values and
# is what the champion LabelEncoder is fitted on.
# NOTE(review): the ids of the last entries (877-886) look like sequential
# placeholders rather than official ids — confirm before relying on the keys.
# NOTE(review): a champion name missing from this table would be left as a
# string by the later `.replace` calls — verify the table covers both seasons.
all_champion_id = {
        1: 'Annie',
        2: 'Olaf',
        3: 'Galio',
        4: 'Twisted Fate',
        5: 'Xin Zhao',
        6: 'Urgot',
        7: 'LeBlanc',
        8: 'Vladimir',
        9: 'Fiddlesticks',
        10: 'Kayle',
        11: 'Master Yi',
        12: 'Alistar',
        13: 'Ryze',
        14: 'Sion',
        15: 'Sivir',
        16: 'Soraka',
        17: 'Teemo',
        18: 'Tristana',
        19: 'Warwick',
        20: 'Nunu & Willump',
        21: 'Miss Fortune',
        22: 'Ashe',
        23: 'Tryndamere',
        24: 'Jax',
        25: 'Morgana',
        26: 'Zilean',
        27: 'Singed',
        28: 'Evelynn',
        29: 'Twitch',
        30: 'Karthus',
        31: "Cho'Gath",
        32: 'Amumu',
        33: 'Rammus',
        34: 'Anivia',
        35: 'Shaco',
        36: 'Dr. Mundo',
        37: 'Sona',
        38: 'Kassadin',
        39: 'Irelia',
        40: 'Janna',
        41: 'Gangplank',
        42: 'Corki',
        43: 'Karma',
        44: 'Taric',
        45: 'Veigar',
        48: 'Trundle',
        50: 'Swain',
        51: 'Caitlyn',
        53: 'Blitzcrank',
        54: 'Malphite',
        55: 'Katarina',
        56: 'Nocturne',
        57: 'Maokai',
        58: 'Renekton',
        59: 'Jarvan IV',
        60: 'Elise',
        61: 'Orianna',
        62: 'Wukong',
        63: 'Brand',
        64: 'Lee Sin',
        67: 'Vayne',
        68: 'Rumble',
        69: 'Cassiopeia',
        72: 'Skarner',
        74: 'Heimerdinger',
        75: 'Nasus',
        76: 'Nidalee',
        77: 'Udyr',
        78: 'Poppy',
        79: 'Gragas',
        80: 'Pantheon',
        81: 'Ezreal',
        82: 'Mordekaiser',
        83: 'Yorick',
        84: 'Akali',
        85: 'Kennen',
        86: 'Garen',
        89: 'Leona',
        90: 'Malzahar',
        91: 'Talon',
        92: 'Riven',
        96: "Kog'Maw",
        98: 'Shen',
        99: 'Lux',
        101: 'Xerath',
        102: 'Shyvana',
        103: 'Ahri',
        104: 'Graves',
        105: 'Fizz',
        106: 'Volibear',
        107: 'Rengar',
        110: 'Varus',
        111: 'Nautilus',
        112: 'Viktor',
        113: 'Sejuani',
        114: 'Fiora',
        115: 'Ziggs',
        117: 'Lulu',
        119: 'Draven',
        120: 'Hecarim',
        121: "Kha'Zix",
        122: 'Darius',
        126: 'Jayce',
        127: 'Lissandra',
        131: 'Diana',
        133: 'Quinn',
        134: 'Syndra',
        136: 'Aurelion Sol',
        141: 'Kayn',
        142: 'Zoe',
        143: 'Zyra',
        145: "Kai'Sa",
        147: "Seraphine",
        150: 'Gnar',
        154: 'Zac',
        157: 'Yasuo',
        161: "Vel'Koz",
        163: 'Taliyah',
        166: "Akshan",
        164: 'Camille',
        201: 'Braum',
        202: 'Jhin',
        203: 'Kindred',
        222: 'Jinx',
        223: 'Tahm Kench',
        234: 'Viego',
        235: 'Senna',
        236: 'Lucian',
        238: 'Zed',
        240: 'Kled',
        245: 'Ekko',
        246: 'Qiyana',
        254: 'Vi',
        266: 'Aatrox',
        267: 'Nami',
        268: 'Azir',
        350: 'Yuumi',
        360: 'Samira',
        412: 'Thresh',
        420: 'Illaoi',
        421: "Rek'Sai",
        427: 'Ivern',
        429: 'Kalista',
        432: 'Bard',
        497: 'Rakan',
        498: 'Xayah',
        516: 'Ornn',
        517: 'Sylas',
        526: 'Rell',
        518: 'Neeko',
        523: 'Aphelios',
        555: 'Pyke',
        875: "Sett",
        711: "Vex",
        777: "Yone",
        887: "Gwen",
        876: "Lillia",
        877: "Zeri",
        878: "Renata Glasc",
        879: "Bel'Veth",
        880: "Nilah",
        881: "K'Sante",
        882: "Milio",
        883: "Naafiri",
        884: "Briar",
        885: "Hwei",
        886: "Smolder"
    }

# Champion names in table order; used to fit the champion encoder.
champion_list = list(all_champion_id.values())

#### Separating the predictors and outcome variables

In [None]:
le1 = LabelEncoder()
le2 = LabelEncoder()
# Fit the team encoder on every name that appears as a team or as an opponent,
# so both columns can be encoded with the same lookup.
# NOTE(review): the 2024 section fits its own team encoder, so the same team
# can receive different codes in the two seasons — confirm that is acceptable
# when a model trained on 2023 is validated on 2024.
unique_teamnames = pd.unique(pd.concat([team2023['teamname'], team2023['oppteamname']]))

le1.fit(unique_teamnames)
le2.fit(champion_list)

# Codes are produced in `classes_` order, so zip(le.classes_, codes) pairs
# every label with its own code. Transforming the labels in their original
# (unsorted) order paired names with other names' codes.
vals = le1.transform(le1.classes_)
champion_vals = le2.transform(le2.classes_)

In [None]:
# label -> integer code lookups taken straight from the fitted encoders, so
# each label is guaranteed to be paired with its own code.
team_dict = dict(zip(le1.classes_, le1.transform(le1.classes_)))
champ_dict = dict(zip(le2.classes_, le2.transform(le2.classes_)))

team_columns_2023 = ['teamname', 'oppteamname']
champ_columns_2023 = [f'champ{n}' for n in range(1, 6)] + [f'oppchamp{n}' for n in range(1, 6)]

# Assign the replaced column back to the frame. Calling replace(..., inplace=True)
# on a column selection is chained assignment, which pandas does not guarantee
# will update the parent frame.
for column in team_columns_2023:
    team2023[column] = team2023[column].replace(team_dict)
for column in champ_columns_2023:
    team2023[column] = team2023[column].replace(champ_dict)


### 2024 modeling

In [None]:
le1 = LabelEncoder()
le2 = LabelEncoder()
# Fit the team encoder on every name that appears as a team or as an opponent,
# so both columns can be encoded with the same lookup.
# NOTE(review): the 2023 section fits its own team encoder, so the same team
# can receive different codes in the two seasons — confirm that is acceptable
# when a model trained on 2023 is validated on 2024.
unique_teamnames = pd.unique(pd.concat([team2024['teamname'], team2024['oppteamname']]))

le1.fit(unique_teamnames)
le2.fit(champion_list)

# Codes are produced in `classes_` order, so zip(le.classes_, codes) pairs
# every label with its own code. Transforming the labels in their original
# (unsorted) order paired names with other names' codes.
vals = le1.transform(le1.classes_)
champion_vals = le2.transform(le2.classes_)

In [None]:
# label -> integer code lookups taken straight from the fitted encoders, so
# each label is guaranteed to be paired with its own code.
team_dict = dict(zip(le1.classes_, le1.transform(le1.classes_)))
champ_dict = dict(zip(le2.classes_, le2.transform(le2.classes_)))

team_columns_2024 = ['teamname', 'oppteamname']
champ_columns_2024 = [f'champ{n}' for n in range(1, 6)] + [f'oppchamp{n}' for n in range(1, 6)]

# Assign the replaced column back to the frame. Calling replace(..., inplace=True)
# on a column selection is chained assignment, which pandas does not guarantee
# will update the parent frame.
for column in team_columns_2024:
    team2024[column] = team2024[column].replace(team_dict)
for column in champ_columns_2024:
    team2024[column] = team2024[column].replace(champ_dict)

In [None]:
# Display the encoded 2024 frame without a side column.
# 'side' is already removed when team2024is built, so a plain drop raised
# KeyError; errors="ignore" keeps this inspection cell runnable either way.
team2024.drop(columns = 'side', errors = 'ignore')

### Splitting data

In [None]:
# Features are every column except the label; the label is `result`.
# Selecting by name (rather than "everything after the first column") keeps
# the label out of the features even if the column order changes.
xteam, yteam = team2023.drop(columns = "result"), team2023["result"]

In [None]:
# Display the 2023 modeling frame that the split above was taken from.
team2023

#### Splitting Data 80/20

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
team_split = train_test_split(xteam, yteam, test_size = 0.2, random_state = 0)
xteam_train, xteam_test, yteam_train, yteam_test = team_split

#### Create Function to help with Modeling

In [None]:
# prints out a model's classification report, confusion matrix, and AUCROC curve
def evaluate_model(model, xtrain, ytrain, xtest, ytest, digits = 4, title = "Data", folderpath = "./", random_state = 0):
    """Report and plot how a fitted classifier performs on train and test data.

    Prints a classification report for the training and the test data, then
    draws one figure with the test confusion matrix, the test ROC curve and
    the permutation importances computed on the training data. The figure is
    saved as ``folderpath + title + ".png"`` and shown.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        xtrain, ytrain: training features (a DataFrame; its column names label
            the importance bars) and labels.
        xtest, ytest: test features and labels.
        digits: number of decimals in the classification reports.
        title: suffix of every plot title and the saved file's base name.
        folderpath: prefix prepended to the file name; it is concatenated
            as-is, so it should end with a path separator.
        random_state: seed for the permutation-importance shuffles, so the
            importance plot is reproducible between runs.
    """
    # get predictions
    yhattrain = model.predict(xtrain)
    yhattest = model.predict(xtest)

    # Classification Reports
    print("CLASSIFICATION REPORT FOR TRAINING DATA")
    print(metrics.classification_report(ytrain, yhattrain, digits = digits))

    print("CLASSIFICATION REPORT FOR TEST DATA")
    print(metrics.classification_report(ytest, yhattest, digits = digits))

    # make confusion matrix
    cmat = metrics.confusion_matrix(ytest, yhattest)

    # Layout: confusion matrix (top left), importances (bottom left),
    # ROC curve (full right column).
    fig = plt.figure(figsize=(13, 13))
    gs = GridSpec(nrows=2, ncols=2)

    ax0 = fig.add_subplot(gs[0, 0])
    ax1 = fig.add_subplot(gs[1, 0])
    ax2 = fig.add_subplot(gs[:, 1])

    # show confusion matrix
    sns.heatmap(cmat, ax = ax0, annot = True, cmap = "Blues")
    ax0.set_xlabel("Predicted Wins")
    ax0.set_ylabel("Actual Wins")
    ax0.set_title("Confusion Matrix For " + title)

    # show ROC curve, using the predicted probability of the positive class
    yhat_prob = model.predict_proba(xtest)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(ytest, yhat_prob)
    auc = metrics.roc_auc_score(ytest, yhat_prob)
    ax2.grid()
    ax2.plot(fpr, tpr, label = "Model Auc = " + str(auc))
    ax2.set_ylabel("True Positive Rate")
    ax2.set_xlabel("False Positive Rate")
    ax2.set_title("ROC AUC Curve For " + title)
    ax2.legend(loc = 4)

    # show importance plot, least important feature first
    permutation_results = permutation_importance(model, xtrain, ytrain, random_state = random_state)
    sorted_indices = permutation_results.importances_mean.argsort()
    ax1.barh(xtrain.columns[sorted_indices], permutation_results.importances_mean[sorted_indices])
    ax1.set_title("Permutation Importance For " + title)
    fig.savefig(folderpath + title + ".png")
    plt.show()

In [None]:
# fits model on training data and displays everything from evaluate_model function
def fit_eval(model, xtrain, ytrain, xtest, ytest, digits = 4, title = "Data", folderpath = "./"):
    """Fit ``model`` on the training data, evaluate it, and return it.

    Args:
        model: estimator exposing ``fit``; it is fitted in place.
        xtrain, ytrain: training features and labels.
        xtest, ytest: test features and labels.
        digits: number of decimals in the printed classification reports.
        title: passed to ``evaluate_model`` for plot titles and the file name.
        folderpath: passed to ``evaluate_model`` as the output path prefix.

    Returns:
        The same ``model`` object, now fitted.
    """
    model.fit(xtrain, ytrain)
    # Forward the caller's `digits`; it used to be hard-coded to 4 here, which
    # silently ignored the argument.
    evaluate_model(model, xtrain, ytrain, xtest, ytest, digits = digits, title = title, folderpath = folderpath)
    return model

In [None]:
# Empty results table; one row per (model, lane) score is expected to be added later.
score_columns = ["Model", "Lane", "Accuracy", "AUCROC"]
df_scores = pd.DataFrame(columns = score_columns)

#### Logistic Regression

In [None]:
from sklearn.model_selection import RandomizedSearchCV

clf = LogisticRegression(random_state = 213)

# Candidate inverse-regularisation strengths, one per decade from 1e-7 to 1e-3.
# (1E-6 used to be listed twice, which left 1E-5 out of the search.)
log_params = {'penalty': ['l2'], 'C': [1E-7, 1E-6, 1E-5, 1E-4, 1E-3]}
# The grid is finite, so sample exactly as many candidates as it contains;
# asking for more iterations than combinations only triggers a warning.
n_candidates = len(log_params['penalty']) * len(log_params['C'])
search = RandomizedSearchCV(clf, scoring='average_precision', cv=10,
                            n_iter=n_candidates, param_distributions=log_params,
                            refit=True, n_jobs=-1, random_state=213)
search.fit(xteam_train, yteam_train)
clf = search.best_estimator_

fit_eval(clf, xteam_train, yteam_train, xteam_test, yteam_test, title = "Team Data", folderpath = "./LR./")


In [None]:
from sklearn.model_selection import cross_val_score, KFold
# Shuffled 5-fold cross-validation of the selected model on the training data.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=213)

# Pass the KFold object itself: `cv=5` ignored `kf` and its shuffling.
scores = cross_val_score(clf, xteam_train, yteam_train, cv=kf)
scores

In [None]:
# Display the selected estimator and its parameters.
clf

Now that we have put everything into one data frame, let's look at different aspects.

In [None]:
# Features are every column except the label; the label is `result`.
# Selecting by name keeps the label out of the features even if the column
# order changes.
# NOTE(review): `clf` was fitted on the 2023 feature columns — confirm that
# team2024 exposes the same columns in the same order before validating.
xteam2024, yteam2024 = team2024.drop(columns = "result"), team2024["result"]

In [None]:
# prints out a model's classification report, confusion matrix, and AUCROC curve
def validate_model(model, xval, yval, digits = 4, title = "Data", folderpath = "./", random_state = 0):
    """Report and plot how a fitted classifier performs on validation data.

    Prints a classification report, then draws one figure with the confusion
    matrix, the ROC curve and the permutation importances, all computed on the
    validation data. The figure is saved as ``folderpath + title + ".png"``
    and shown.

    The ROC panel used to be created but left empty; it is now filled the same
    way ``evaluate_model`` fills it.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        xval: validation features (a DataFrame; its column names label the
            importance bars).
        yval: validation labels.
        digits: number of decimals in the classification report.
        title: suffix of every plot title and the saved file's base name.
        folderpath: prefix prepended to the file name; it is concatenated
            as-is, so it should end with a path separator.
        random_state: seed for the permutation-importance shuffles, so the
            importance plot is reproducible between runs.
    """
    # get predictions
    yhatval = model.predict(xval)

    # Classification Reports
    print("CLASSIFICATION REPORT FOR VALIDATION DATA")
    print(metrics.classification_report(yval, yhatval, digits = digits))

    # make confusion matrix
    cmat = metrics.confusion_matrix(yval, yhatval)

    # Layout: confusion matrix (top left), importances (bottom left),
    # ROC curve (full right column).
    fig = plt.figure(figsize=(13, 13))
    gs = GridSpec(nrows=2, ncols=2)

    ax0 = fig.add_subplot(gs[0, 0])
    ax1 = fig.add_subplot(gs[1, 0])
    ax2 = fig.add_subplot(gs[:, 1])

    # show confusion matrix
    sns.heatmap(cmat, ax = ax0, annot = True, cmap = "Blues")
    ax0.set_xlabel("Predicted Wins")
    ax0.set_ylabel("Actual Wins")
    ax0.set_title("Confusion Matrix For " + title)

    # show ROC curve, using the predicted probability of the positive class
    yhat_prob = model.predict_proba(xval)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(yval, yhat_prob)
    auc = metrics.roc_auc_score(yval, yhat_prob)
    ax2.grid()
    ax2.plot(fpr, tpr, label = "Model Auc = " + str(auc))
    ax2.set_ylabel("True Positive Rate")
    ax2.set_xlabel("False Positive Rate")
    ax2.set_title("ROC AUC Curve For " + title)
    ax2.legend(loc = 4)

    # show importance plot, least important feature first
    permutation_results = permutation_importance(model, xval, yval, random_state = random_state)
    sorted_indices = permutation_results.importances_mean.argsort()
    ax1.barh(xval.columns[sorted_indices], permutation_results.importances_mean[sorted_indices])
    ax1.set_title("Permutation Importance For " + title)
    fig.savefig(folderpath + title + ".png")
    plt.show()

In [None]:
# Score the 2023-trained model on the 2024 frame (default title and output folder).
validate_model(model = clf, xval = xteam2024, yval = yteam2024)

Can we sort by accuracy?

In [None]:
# Score table ordered from most to least accurate.
df_scores.sort_values(by = "Accuracy", ascending = False)

What are the most accurate models by lane?

In [None]:
# For each lane, show the full score row with the highest accuracy.
# idxmax is taken on the Accuracy column only (cast to float, since the table
# is created empty and its columns start out as object dtype); running it over
# every column would also try to rank the model names and would only return
# row labels, not the models.
best_row_per_lane = df_scores["Accuracy"].astype(float).groupby(df_scores["Lane"]).idxmax()
df_scores.loc[best_row_per_lane]

In [None]:
# Display every row of the score table.
df_scores.loc[:]

What are the most accurate models?

In [None]:
# Mean accuracy and AUCROC per model, best first.
# Only the two score columns are averaged (cast to float, since the table is
# created empty and its columns start out as object dtype); averaging every
# column would also try to average the lane names.
(
    df_scores.astype({"Accuracy": float, "AUCROC": float})
    .groupby("Model")[["Accuracy", "AUCROC"]]
    .mean()
    .sort_values("Accuracy", ascending = False)
)

Which lane was the most accurate?

In [None]:
# Mean accuracy and AUCROC per lane, best first.
# Only the two score columns are averaged (cast to float, since the table is
# created empty and its columns start out as object dtype); averaging every
# column would also try to average the model names.
(
    df_scores.astype({"Accuracy": float, "AUCROC": float})
    .groupby("Lane")[["Accuracy", "AUCROC"]]
    .mean()
    .sort_values("Accuracy", ascending = False)
)