# LEAGUE (title tba)

**Name(s)**: Palina Volskaya & Hieu Ngyuen

**Website Link**: (your website link)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

In [None]:
# Read the CSV at data/LoL2022data.csv into `league` and preview the first rows.
data_path = 'data/LoL2022data.csv'
league = pd.read_csv(data_path)
league.head()

In [None]:
# Split the raw table on the `position` column: rows tagged 'team' go to
# `teams`, every other row goes to `players`.
is_team_row = league['position'] == 'team'

# Take an explicit copy so the column assignment below writes to an
# independent DataFrame instead of a slice of `league` (assigning into a
# slice triggers pandas' SettingWithCopyWarning).
teams = league[is_team_row].copy()

# NOTE(review): astype(bool) turns NaN into True -- confirm `firstdragon` has
# no missing values among the team rows.
teams['firstdragon'] = teams['firstdragon'].astype(bool)
players = league[~is_team_row]
teams.head()

In [None]:
# Row counts per league, halved (presumably because every game contributes
# two team rows -- TODO confirm), reshaped into a two-column table.
games_per_league = teams['league'].value_counts().div(2)
league_counts = games_per_league.reset_index()
league_counts.columns = ['league', 'games_played']

axis_labels = {'league': 'League', 'games_played': 'Number of games'}
fig = px.bar(
    league_counts,
    x='league',
    y='games_played',
    title="Number of games played by league",
    labels=axis_labels,
)

fig.show()

In [None]:
# Per-team totals of gold spent and of the `result` column (summing `result`
# gives the win count when it is coded 0/1).
group_teams = teams.groupby('teamid')[['goldspent', 'result']].agg('sum')

scatter_labels = {'goldspent': 'Gold spent', 'result': 'Wins'}
fig = px.scatter(
    group_teams,
    x='goldspent',
    y='result',
    title="Plot of wins by a team against their total gold spent",
    labels=scatter_labels,
)

fig.update_traces(marker={'size': 4, 'color': 'green'})

fig.show()

In [None]:
# `go` is used below but the notebook's import cell never imports it
# explicitly; it is only defined if the `dsc80_utils` star import happens to
# provide it. Import it here so this cell does not depend on that.
import plotly.graph_objects as go

# Split the team rows by the boolean `firstdragon` flag and total the
# `result` column per team within each half.
got_first_dragon = teams['firstdragon']
yes_first_dragon = teams[got_first_dragon]
no_first_dragon = teams[~got_first_dragon]
wins_yes = yes_first_dragon.groupby('teamid')['result'].sum()
wins_no = no_first_dragon.groupby('teamid')['result'].sum()

# Two overlaid, semi-transparent histograms of the per-team totals.
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=wins_yes,
    name="With first dragon",
    marker=dict(color="blue"),
    opacity=0.75
))

fig.add_trace(go.Histogram(
    x=wins_no,
    name="Without first dragon",
    marker=dict(color="red"),
    opacity=0.7
))

fig.update_layout(
    title="Conditional distribution plot of team wins with or without first dragon",
    xaxis_title="Wins",
    yaxis_title="Frequency",
    barmode="overlay"
)

fig.show()


In [None]:
# Work on a copy so the extra ratio column does not end up in `teams`.
copy = teams.copy()

# Kills-per-death ratio. nan_to_num maps NaN and +/-inf (which arise when
# deaths is 0) to 0 in a single pass.
result_arr = copy['kills'] / copy['deaths']
final_arr = np.nan_to_num(result_arr.to_numpy(), nan=0, posinf=0, neginf=0)
copy['kills per death'] = final_arr

# Mean of each metric, grouped by whether the team took the first dragon.
pivot_overall = copy.pivot_table(
    values=['result', 'kills per death', 'goldspent'],
    index='firstdragon',
    aggfunc='mean',
)
pivot_overall = (
    pivot_overall
    .rename_axis('firstdragon')
    .rename(columns={"result": "win percentage"})
)

# The mean of `result` is scaled by 100 to read as a percentage; all three
# columns are then rounded to two decimals.
pivot_overall['win percentage'] = pivot_overall['win percentage'] * 100
pivot_overall = pivot_overall.round(2)

row_names = {True: 'Got first dragon', False: 'Did NOT get first dragon'}
pivot_overall.index = pivot_overall.index.map(row_names)
pivot_overall

In [None]:
# Mean `result` per team (rows), split by `firstdragon` (columns), expressed
# as a percentage rounded to two decimals.
# NOTE(review): `firstdragon` is cast to bool earlier in the notebook, so the
# column labels are likely False/True; the 0/1 keys below rely on
# False == 0 and True == 1 -- confirm the rename takes effect.
win_pct_columns = {
    0: 'no_first_dragon_win_pct',
    1: 'first_dragon_win_pct',
}
pivot_team = (
    copy.pivot_table(
        values='result',
        index='teamname',
        columns=['firstdragon'],
        aggfunc='mean',
    )
    .mul(100)
    .round(2)
    .rename(columns=win_pct_columns)
)

## Step 3: Assessment of Missingness

After analyzing the dataset aggregated by teams, we believe the `pick1` column is NMAR: its missingness cannot be predicted from the other columns and may be caused by a variable not included in the dataset. For many of the other columns we analyzed, missingness depends on league, because some leagues did not report certain variables at all. The missingness of `pick1`, in contrast, is not explained by league: 31 out of the 55 leagues had missing values for `pick1` specifically. It is also not explained by `teamid`, as 381 out of 593 teams had a missing value in `pick1`. Instead, the column's missingness appears to depend on the value itself, i.e., the champion chosen. Missingness in this column can occur when the champion is a character newly added to the game, resulting in a null value being recorded. Additional data that would make this column MAR would be an indicator of whether the champion was newly added at the time of the game.

In [None]:
# Permutation test: is the missingness of `goldat25` related to `gamelength`?
# Test statistic: mean `gamelength` over the rows where `goldat25` is missing.
#
# The previous version looped over range(1, 500) (499 repetitions) but divided
# the count by 1000, and discarded the resulting proportion. The repetition
# count is now a single name used for both the loop and the denominator.
n_repetitions = 500

# The missingness mask never changes, so compute it once instead of copying
# the whole DataFrame on every repetition.
goldat25_missing = teams['goldat25'].isna().to_numpy()
gamelengths = teams['gamelength'].to_numpy()

observed_statistic = teams.loc[goldat25_missing, 'gamelength'].mean()

# Shuffle the game lengths against the fixed missingness mask; nanmean skips
# NaN the same way Series.mean does.
observed_means = [
    np.nanmean(np.random.permutation(gamelengths)[goldat25_missing])
    for _ in range(n_repetitions)
]

# Proportion of simulated means that are <= the observed mean.
# NOTE(review): this is one-sided; confirm the direction matches the
# alternative hypothesis being tested.
p_value_gamelength = np.mean(observed_statistic >= np.array(observed_means))
print(f'p_value: {p_value_gamelength:.3g}')

gamelength_plot = px.histogram(observed_means)
gamelength_plot.add_vline(x=observed_statistic, line_dash="dash", line_color="red", line_width=2, opacity=1)
gamelength_plot.update_layout(showlegend=False)

In [None]:
# Permutation test: is the missingness of `goldat25` related to `firstdragon`?
# Test statistic: mean of `firstdragon` over the rows where `goldat25` is
# missing.
#
# The previous version looped over range(1, 500) (499 repetitions) but divided
# the count by 1000, and discarded the resulting proportion. The repetition
# count is now a single name used for both the loop and the denominator.
n_repetitions = 500

# The missingness mask never changes, so compute it once instead of copying
# the whole DataFrame on every repetition.
goldat25_missing = teams['goldat25'].isna().to_numpy()
firstdragons = teams['firstdragon'].to_numpy()

observed_statistic = teams.loc[goldat25_missing, 'firstdragon'].mean()

# Shuffle the first-dragon flags against the fixed missingness mask; nanmean
# skips NaN the same way Series.mean does.
observed_means = [
    np.nanmean(np.random.permutation(firstdragons)[goldat25_missing])
    for _ in range(n_repetitions)
]

# Proportion of simulated means that are <= the observed mean.
# NOTE(review): this is one-sided; confirm the direction matches the
# alternative hypothesis being tested.
p_value_firstdragon = np.mean(observed_statistic >= np.array(observed_means))
print(f'p_value: {p_value_firstdragon:.3g}')

fig2 = px.histogram(observed_means)
fig2.add_vline(x=observed_statistic, line_dash="dash", line_color="red", line_width=2, opacity=1)
fig2.update_layout(showlegend=False)

## Step 4: Hypothesis Testing

In [None]:
# Permutation test on the difference in mean `result` between rows with and
# without the first dragon.
dropped_columns = teams[['gameid', 'teamid', 'firstdragon', 'result']]
n_permutations = 10000

# The group masks do not change between permutations, so build them once.
got_first_dragon = (dropped_columns['firstdragon'] == 1).to_numpy()
no_first_dragon = (dropped_columns['firstdragon'] == 0).to_numpy()

# Observed difference in proportions
obs_diff = dropped_columns.loc[got_first_dragon, 'result'].mean() - \
           dropped_columns.loc[no_first_dragon, 'result'].mean()

# Permutation test: shuffle the outcomes against the fixed group masks.
results = dropped_columns['result'].to_numpy()
diffs = np.zeros(n_permutations)

for i in range(n_permutations):
    shuffled = np.random.permutation(results)
    diffs[i] = shuffled[got_first_dragon].mean() - \
               shuffled[no_first_dragon].mean()

# One-sided p-value: share of permuted differences at least as large as the
# observed one. The denominator must be the number of permutations actually
# run; dividing by a separate constant (previously 1000 against 10000
# permutations) inflated the value tenfold.
p_value = np.sum(diffs >= obs_diff) / n_permutations
fig = px.histogram(diffs,nbins=30)
fig.add_vline(x=obs_diff, line_color='red', line_dash='dash', line_width=2)
fig.add_annotation(
    x=obs_diff,  
    y=1,  
    text=f'Observed diff = {obs_diff:.3f}',
    yshift=100, 
    showarrow=False,
    xshift=10,
    textangle=-90, 
)
fig.update_layout(showlegend=False, title="Permutation test for firstdragon")
fig.show()
print("Observed difference:", obs_diff)
print(f"Permutation p-value: {p_value:.3g}")

## Step 5: Framing a Prediction Problem

## Step 6: Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score

# Every column whose name contains '25', plus the label column. Not used by
# the baseline itself; the final-model cells read `cols`.
cols = [name for name in teams.columns if '25' in name] + ['result']

# Baseline features: gold and XP at 10 minutes; 20% of rows held out.
baseline_features = teams[['goldat10', 'xpat10']]
X_train, X_test, y_train, y_test = train_test_split(
    baseline_features, teams['result'], test_size=0.2, random_state=42
)

# Single-step pipeline: an unpruned decision tree with a fixed seed.
baseline_tree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=None,
    random_state=42,
)
pipe = Pipeline([("clf", baseline_tree)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

acc = accuracy_score(y_test, y_pred)
# Dot product of predictions and labels: counts rows where both are 1
# (assuming 0/1 labels).
true_positives = y_pred.astype(int) @ y_test.astype(int)
precision = true_positives / y_pred.sum()
recall = true_positives / y_test.sum()
print(f"Test accuracy: {acc:.3f}")
print(f"Test precision: {precision:.3f}")
print(f'Test recall: {recall:.3f}')


## Step 7: Final Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold 

# Keep only the rows where every column listed in `cols` is present.
final = teams[cols].dropna(axis=0, how="any")

# Single-step pipeline wrapping a seeded random forest.
forest = RandomForestClassifier(random_state=42, n_jobs=-1)
pl = Pipeline([('clf', forest)])

n_features = X_train.shape[1]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The grid currently holds a single candidate per parameter. A wider grid
# that was previously kept here as commented-out code:
#   clf__max_depth:    [None, 5, 10, 20]
#   clf__n_estimators: [100, 200, 300, 500]
#   clf__max_features: ['sqrt', 'log2']
#   clf__bootstrap:    [True, False]
hyperparameters = {
    'clf__max_depth': [5],
    'clf__n_estimators': [100],
    'clf__max_features': ['log2'],
    'clf__bootstrap': [True],
}

# 5-fold grid search scored by F1.
grids = GridSearchCV(
    estimator=pl,
    param_grid=hyperparameters,
    scoring="f1",
    cv=kf,
    n_jobs=-1,
    return_train_score=True,
    verbose=2,
)


In [None]:
import itertools
# Separate the feature columns (everything in `cols` except the label) from
# the `result` label.
feature_cols = [name for name in cols if name != "result"]
final_no_result = final[feature_cols]
final_result = final['result']

def make_quadratic(df: pd.DataFrame) -> pd.DataFrame:
    """Build pairwise interaction features from the columns of ``df``.

    For every unordered pair of distinct columns ``(a, b)``, taken in the
    order produced by ``itertools.combinations(df.columns, 2)``, the result
    has a column named ``f'{a}_{b}'`` holding the element-wise product
    ``df[a] * df[b]``. The original columns and squared terms are not part
    of the output.

    Parameters
    ----------
    df : pd.DataFrame
        Input features; each pair of columns must support ``*``.

    Returns
    -------
    pd.DataFrame
        One column per column pair, indexed like ``df``. When ``df`` has
        fewer than two columns the result has no columns but still carries
        the rows (index) of ``df``.
    """
    interactions = {
        f'{left}_{right}': df[left] * df[right]
        for left, right in itertools.combinations(df.columns, r=2)
    }
    # Pass the index explicitly: with an empty dict pandas would otherwise
    # return a frame with zero rows, which no longer lines up with the labels.
    return pd.DataFrame(interactions, index=df.index)

# Expand the features into pairwise products, hold out 20% of the rows, and
# run the grid search on the remaining 80%.
fe_final = make_quadratic(final_no_result)
fe_split = train_test_split(
    fe_final,
    final_result,
    test_size=0.2,
    random_state=42,
)
X_train_fe, X_test_fe, y_train_fe, y_test_fe = fe_split

grids.fit(X_train_fe, y_train_fe)


In [None]:
# Evaluate the fitted grid search on the held-out rows.
predictions = grids.predict(X_test_fe)

# Dot product of predictions and labels: counts rows where both are 1
# (assuming 0/1 labels).
true_positives = predictions.astype(int) @ y_test_fe.astype(int)
precision = true_positives / predictions.sum()
recall = true_positives / y_test_fe.sum()

# best_score_ and score() use the scorer the grid search was configured with.
print("Best params:", grids.best_params_)
print("Best CV score:", grids.best_score_)
print("Test score:", grids.score(X_test_fe, y_test_fe))
# Precision and recall were computed but never shown; report them in the same
# format as the baseline model so the two can be compared.
print(f"Test precision: {precision:.3f}")
print(f"Test recall: {recall:.3f}")

## Step 8: Fairness Analysis

In [None]:
# Score the fitted model separately on the rows of each league.
# NOTE(review): the rows come from `teams`, not from the NaN-dropped `final`
# frame the model was trained on -- confirm that is intended.
feature_cols = [name for name in cols if name != 'result']
leagues = teams['league'].unique()

f1_by_league = {}
# The loop variable must not be called `league`: that name holds the raw
# DataFrame loaded at the top of the notebook and would be overwritten.
for league_name in leagues:
    in_league = teams['league'] == league_name
    league_features = make_quadratic(teams.loc[in_league, feature_cols])
    league_results = teams.loc[in_league, 'result']
    # grids.score applies the scorer the grid search was configured with.
    f1_by_league[league_name] = {
        "F_1": grids.score(league_features, league_results)
    }

split_dataframe = pd.DataFrame(f1_by_league).T
fig = px.bar(split_dataframe, barmode='group')
fig.update_layout(title="F-1 scores for each league", yaxis_title="rate", xaxis_title="league")
fig.show()
# NOTE(review): the output path has no .html extension and mentions precision
# although F-1 is plotted; left unchanged in case it is referenced elsewhere.
fig.write_html("Side-by-side_precision_for_each_league")

In [None]:
# Permutation test: is the model's score on the 'LCL' rows unusual compared
# with its score on random groups of the same size?
# NOTE(review): the rows come from `teams`, not from the NaN-dropped `final`
# frame the model was trained on -- confirm that is intended.
feature_cols = [name for name in cols if name != 'result']

# Shuffling league labels never changes a row's features or outcome, so build
# them once rather than copying `teams` and rebuilding on every repetition.
all_features = make_quadratic(teams[feature_cols])
all_results = teams['result']
is_lcl = (teams['league'] == 'LCL').to_numpy()

observed_df = all_features[is_lcl]
LCL_results = all_results[is_lcl]
# grids.score applies the scorer the grid search was configured with.
observed_f1 = grids.score(observed_df, LCL_results)

N = 1000
f1s = np.empty(N)
for rep in range(N):
    # Permuting the membership mask is equivalent to permuting the league
    # column and then selecting the rows labelled 'LCL'.
    shuffled_is_lcl = np.random.permutation(is_lcl)
    f1s[rep] = grids.score(
        all_features[shuffled_is_lcl], all_results[shuffled_is_lcl]
    )

fig = px.histogram(f1s, nbins=12)
fig.add_vline(x=observed_f1,line_width=2,line_dash="dash",line_color="red", opacity=1)
# The plotted statistic is the F-1 score, so the title says so (it previously
# said "precision").
fig.update_layout(showlegend=False, 
                  title_text="Distribution of model F-1 score",
                  title_x=0.5,  # Center the title
                  title_xanchor='center')
fig.show()
# NOTE(review): the output path has no .html extension and mentions precision;
# left unchanged in case it is referenced elsewhere.
fig.write_html('Distribution of Model Precision')

In [None]:
# Share of permuted scores at least as large as the observed one. Convert to
# an array first: comparing a plain list with a scalar only works when the
# scalar happens to be a NumPy type, and raises TypeError for a Python float.
p_value = np.mean(np.asarray(f1s) >= observed_f1)
print(f'p_value: {p_value:.3g}')