### Load libraires

In [1]:
# read the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from causalml.inference.meta import BaseSRegressor, BaseTRegressor, BaseXRegressor, BaseRRegressor
from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.metrics import get_cumgain
from causalml.propensity import ElasticNetPropensityModel

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from xgboost import XGBRegressor

import shap
import time
import os
import warnings
warnings.filterwarnings('ignore')

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
Failed to import duecredit due to No module named 'duecredit'


### Load dataset (feature selection and engineering)

In [2]:
df = pd.read_csv("/Users/zy/Documents/GitHub/League-of-Legends/high_diamond_ranked_10min.csv")

df.head()

Unnamed: 0,gameId,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,4519157822,0,28,2,1,9,6,11,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,4523371949,0,12,1,0,5,5,5,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,4521474530,0,15,0,0,7,11,4,1,1,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,4524384067,0,43,1,0,4,5,5,1,0,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,4436033771,0,75,4,0,6,6,6,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4


In [3]:
# drop unnecessary columns
col_to_drop = ['gameId', 'redFirstBlood', 'redKills','blueCSPerMin','blueGoldPerMin','redDeaths','redGoldDiff','redExperienceDiff','redCSPerMin','redGoldPerMin','redTotalGold','redTotalExperience']
df = df.drop(col_to_drop, axis=1)
#'redEliteMonsters', 'redDragons', 'redCSPerMin', is killing minions matter? speed? 'blueGoldDiff',
# cs per min same as total minions killed
#'redTotalJungleMinionsKilled', may be showing how they are moving?
#Red team gold difference compared to the enemy
#feature enginner: jungle minion/total minion; wards remaining
duplicated_rows = df.duplicated().sum()

if duplicated_rows == 0:
    print('There are 0 rows that are duplicated, which means each row in the DataFrame is unique.')
    print('So that we do not need to continue processing duplicate lines')
else:
    print(f'There are {duplicated_rows} rows that are duplicated so we need to drop those {duplicated_rows} rows')
    df = df.drop_duplicates()
    print(f'After drop duplicated rows, there are {df.shape[0]} rows left')

There are 0 rows that are duplicated, which means each row in the DataFrame is unique.
So that we do not need to continue processing duplicate lines


In [4]:
df['blueHelpful'] = np.where(df['blueKills'] != 0, df['blueAssists'] / df['blueKills'], 0)
df['redHelpful'] = np.where(df['blueDeaths'] != 0, df['redAssists'] / df['blueDeaths'], 0)
#maybe hinting direction of attacking
df['blueJunglePercentage'] = df['blueTotalJungleMinionsKilled'] / df['blueTotalMinionsKilled']
df['redJunglePercentage'] = df['redTotalJungleMinionsKilled'] / df['redTotalMinionsKilled']

df['redWardsRemaining'] = df['redWardsPlaced'] - df['blueWardsDestroyed']
df['blueWardsRemaining'] = df['blueWardsPlaced'] - df['redWardsDestroyed']

In [5]:
df.drop(['blueJunglePercentage','redWardsPlaced','blueWardsPlaced'],axis=1,inplace=True)# its not that the number of wards placed matters, but the number of wards remaining

## Casual inference model

In [6]:
df['treatment'] = (df['blueGoldDiff'] > 14).astype(int) # this can be identify as adopting the strategy or not(try to get more gold in the first 10 mins than the enemy)
y = df['blueWins'].values
w = df['treatment'].values
w_multi = np.array(['treatment_A' if x==1 else 'control' for x in w])

X = df.drop(columns=['blueWins', 'treatment', 'blueGoldDiff']).values

In [7]:
# Define tau based on the difference in outcomes between treated and control groups
tau = df.groupby('treatment')['blueWins'].mean()[1] - df.groupby('treatment')['blueWins'].mean()[0]
tau

0.44690703890353317

In [8]:
# Assuming X, w_multi, and y are your features, treatment, and outcome variables, respectively
X_train, X_test, w_train, w_test, _, _, y_train, y_test = train_test_split(X, w_multi, w, y, test_size=0.2, random_state=42)

In [9]:
# Define the base algorithm
base_algo = LogisticRegression()

#### S learner

In [10]:
# Initialize the S-Learner with a base regressor
slearner = BaseSRegressor(base_algo, control_name='control')

# Fit the learner
slearner.fit(X_train, w_train, y_train)

# Estimate ATE
ate = slearner.estimate_ate(X_test, w_test, y_test)

# Print the ATE estimate
print(f"Estimated ATE with Logistic Regression: {ate}")

Estimated ATE with Logistic Regression: [0.00303644]


In [11]:
# Check the accuracy of the CATE estimation with the bootstrap CI
cate_p, _, _ = slearner.fit_predict(X_test, w_test, y_test, return_ci=True, n_bootstraps=10)

# Create DataFrame for metrics
auuc_metrics = pd.DataFrame({
    "cate_p": cate_p.flatten(),
    "W": w_test,
    "y": y_test,
    "treatment_effect_col": tau,
})

# Convert 'treatment_effect_col' to numeric type if needed
auuc_metrics['treatment_effect_col'] = pd.to_numeric(auuc_metrics['treatment_effect_col'])

# Calculate Cumulative Gain
cumgain = get_cumgain(
    auuc_metrics, outcome_col="y", treatment_col="W", treatment_effect_col="treatment_effect_col"
)

# Check if the cumulative gain when using the model's prediction is higher than it would be under random targeting
assert cumgain["cate_p"].sum() > cumgain["Random"].sum()

100%|██████████| 10/10 [00:02<00:00,  3.66it/s]


AssertionError: 

In [12]:
print("Sum of 'cate_p' column:", cumgain["cate_p"].sum())
print("Sum of 'Random' column:", cumgain["Random"].sum())

Sum of 'cate_p' column: 872932.7933213362
Sum of 'Random' column: 872932.7933213362


This may suggests that the model's prediction is not performing better than random targeting according to the current evaluation metric.

#### X learner

In [13]:
# Initialize BaseXRegressor with GradientBoostingClassifier
xlearner_gb = BaseXRegressor(base_algo, control_name='control')

# Fit the learner
xlearner_gb.fit(X_train, w_train, y_train)

# Estimate ATE
ate_x = xlearner_gb.estimate_ate(X_test, w_test, y_test)

# Print the ATE estimate
print(f"Estimated ATE with GradientBoostingClassifier: {ate_x}")

Estimated ATE with GradientBoostingClassifier: (array([0.79050799]), array([0.74540451]), array([0.83561147]))


In [14]:
# Check the accuracy of the CATE estimation with the bootstrap CI
cate_p, _, _ = xlearner_gb.fit_predict(X_test, w_test, y_test, return_ci=True, n_bootstraps=10)

# Create DataFrame for metrics
auuc_metrics = pd.DataFrame({
    "cate_p": cate_p.flatten(),
    "W": w_test,
    "y": y_test,
    "treatment_effect_col": tau,
})

# Convert 'treatment_effect_col' to numeric type if needed
auuc_metrics['treatment_effect_col'] = pd.to_numeric(auuc_metrics['treatment_effect_col'])

# Calculate Cumulative Gain
cumgain = get_cumgain(
    auuc_metrics, outcome_col="y", treatment_col="W", treatment_effect_col="treatment_effect_col"
)

# Check if the cumulative gain when using the model's prediction is
# higher than it would be under random targeting
assert cumgain["cate_p"].sum() > cumgain["Random"].sum()

100%|██████████| 10/10 [00:13<00:00,  1.30s/it]


AssertionError: 