In [2]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from ydata_profiling import ProfileReport
import src.dataHandling.cleaningUtils as clean
import os
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
import seaborn as sns
import matplotlib

# Render through the pgf backend (compiled with pdflatex) so that figures saved
# as .pgf can be included in a LaTeX document with matching serif fonts.
_PGF_RC_SETTINGS = {
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
}
matplotlib.use("pgf")
matplotlib.rcParams.update(_PGF_RC_SETTINGS)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [3]:
def drop_irrelevant(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` without the columns that are not used as model features.

    Removed are the match-level metadata columns and, for each of the ten
    participants, the ``win``, ``lp``, ``inactive`` and ``teamId`` columns.

    :param df: frame that contains every column listed below
    :return: a new DataFrame with those columns removed; ``df`` itself is not modified
    :raises KeyError: if any of the expected columns is missing from ``df``
    """
    general_irrelevant = ['gameDuration', 'gameCreation', 'gameVersion', 'mapId', 'queueId', 'patch', 'seasonId',
                          'platformId']
    participant_irrelevant = ['win', 'lp', 'inactive', 'teamId']
    # Participant columns are named participant<N>_<field> with N running from 1 to 10.
    irrelevant_cols = general_irrelevant + [
        f'participant{i}_{col}'
        for i in range(1, 11)
        for col in participant_irrelevant
    ]
    return df.drop(columns=irrelevant_cols)

In [4]:
# Load the pickled raw match table. The path is relative, so if the first attempt
# fails the notebook retries from the parent directory.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
try:
    with open("data/static_16_12_23/raw/static_full.pkl", "rb") as f:
        df = pickle.load(f)
except FileNotFoundError:
    # Side effect: the working directory changes for the rest of the session, so
    # every later relative path is resolved from the parent directory as well.
    os.chdir('../')
    with open("data/static_16_12_23/raw/static_full.pkl", "rb") as f:
        df = pickle.load(f)
# Cleaning steps from the project's cleaningUtils module, applied in this fixed order.
df = clean.drop_wrong_data(df)
df.reset_index(drop=True, inplace=True)
df = clean.fix_rank(df)
df = clean.calc_winrate(df)
df = clean.fix_teamId(df)
df = clean.convert_booleans(df)
df = clean.convert_lastPlayTime(df)
df = clean.convert_championTier(df)
df = clean.get_winning_team(df)  # this has to be the last step where a column is inserted
df = clean.drop_wrong_teamIds(df)
df = drop_irrelevant(df)
df = clean.drop_missing(df)
# The splits below take the last column as the target, so it must be 'label'.
assert df.columns[-1] == 'label'
# Hold out 10% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1],
                                                    df.iloc[:, -1],
                                                    test_size=0.1,
                                                    random_state=42,
                                                    shuffle=True)
# Split a further 10% off the remaining training rows as a validation set.
# NOTE(review): X_val / y_val are not used by any of the cells visible in this notebook.
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.1,
                                                  random_state=42,
                                                  shuffle=True)
# NOTE(review): purpose of this copy is not evident here - possibly to defragment
# the frame after the column insertions above; confirm.
df = df.copy()

found 51361 rows
dropped wrong mapId
dropped wrong queueId
dropped wrong gameDuration
dropped wrong seasonId
dropped wrong gameVersion
dropped wrong patch
dropped 0 wrong rows
dropped 78 rows because of wrong teamIds
dropped 12710 rows


In [5]:
# Number of rows left in the cleaned frame.
len(df)

38573

In [6]:
# Preview the first rows of the cleaned frame (features plus the trailing 'label' column).
df.head()

Unnamed: 0,participant1_level,participant1_champion_championNumber,participant1_champion_infoAttack,participant1_champion_infoDefense,participant1_champion_infoMagic,participant1_champion_infoDifficulty,participant1_champion_tier,participant1_champion_win_rate,participant1_champion_pick_rate,participant1_champion_ban_rate,...,participant2_winrate,participant3_winrate,participant4_winrate,participant5_winrate,participant6_winrate,participant7_winrate,participant8_winrate,participant9_winrate,participant10_winrate,label
0,988,83,6,6,4,6,2,49.86,2.2,2.2,...,0.8,0.544262,0.666667,0.512315,0.630631,0.524548,0.511364,0.522696,0.550562,1
1,121,150,6,5,5,8,2,49.75,0.4,0.4,...,0.833333,0.539648,0.530172,0.576923,0.53406,0.537559,0.527559,0.588235,0.511971,1
2,739,164,8,6,3,4,5,53.02,3.0,3.0,...,0.0,0.513369,0.588235,0.510145,0.6,0.537559,0.539648,0.52795,0.5,1
3,841,54,5,9,7,2,5,51.5,15.2,15.2,...,0.541528,0.524,0.653846,0.511971,0.506912,0.553476,0.525275,0.46875,0.508163,1
4,766,24,7,5,7,5,0,48.49,16.8,16.8,...,0.517751,0.536443,0.50436,0.562264,0.368421,0.666667,0.428571,0.583333,0.622449,0


In [7]:
# Champion KDA column name for each of the ten participants (participant1 .. participant10).
kda_columns = [f'participant{slot}_champion_kda' for slot in range(1, 11)]


In [8]:
# Pairwise correlation matrix of all columns (DataFrame.corr with its default method).
corr_pearson = df.corr()
# Magnitude of each column's correlation with the target.
corr_pearson_target = corr_pearson["label"].abs()
# Columns whose absolute correlation with the label exceeds 0.05.
relevant_features = corr_pearson_target.loc[corr_pearson_target.gt(0.05)]
corr_pearson_target.sort_values(ascending=False)


label                                1.000000
participant4_kda                     0.150714
participant7_kda                     0.144979
participant9_kda                     0.144269
participant5_kda                     0.142817
                                       ...   
participant5_champion_infoAttack     0.000370
participant7_level                   0.000297
participant3_champion_infoDefense    0.000260
participant9_level                   0.000204
participant5_level                   0.000113
Name: label, Length: 301, dtype: float64

In [9]:
# Columns with exactly one distinct value are constant and would be candidates for removal.
nunique = df.nunique()
cols_to_drop = nunique.loc[nunique.eq(1)].index
cols_to_drop

Index([], dtype='object')

In [10]:
from scipy.stats import pearsonr

# Correlate every feature column with the target, keeping r together with its p-value.
target = df['label']
results = []
for feature in [name for name in df.columns if name != 'label']:
    r_value, p_value = pearsonr(df[feature], target)
    results.append({'Feature': feature, 'Correlation': r_value, 'P-Value': p_value})

# Strongest correlations first, ranked by magnitude regardless of sign.
results_df = pd.DataFrame(results).sort_values(by='Correlation', ascending=False, key=abs)

In [11]:
# Per-feature correlation with the label, ordered by absolute correlation (descending).
results_df

Unnamed: 0,Feature,Correlation,P-Value
108,participant4_kda,-0.150714,9.719834e-195
195,participant7_kda,0.144979,3.349315e-180
253,participant9_kda,0.144269,1.918576e-178
137,participant5_kda,-0.142817,7.112282e-175
50,participant2_kda,-0.141945,9.490762e-173
...,...,...,...
118,participant5_champion_infoAttack,-0.000370,9.421306e-01
174,participant7_level,-0.000297,9.534331e-01
61,participant3_champion_infoDefense,0.000260,9.592288e-01
232,participant9_level,0.000204,9.680916e-01


In [12]:
# Keep only the magnitude of each correlation. Column assignment aligns on the
# index, so sorting the right-hand side would not reorder any rows; the frame
# stays in the |r|-descending order it was given when results_df was built.
results_df['Correlation'] = results_df['Correlation'].abs()

In [13]:
# Split "participantN_<rest>" into its two parts; <rest> is the category shared
# by the corresponding column of every participant.
feature_parts = results_df['Feature'].str.extract(r'(participant\d+)_(\w+)', expand=False)
results_df['category'] = feature_parts[1]
# Limit notebook tables to 10 rows (use results_df.shape[0] + 1 to show every row).
pd.set_option('display.max_rows', 10)
results_df.head()

Unnamed: 0,Feature,Correlation,P-Value,category
108,participant4_kda,0.150714,9.719833999999999e-195,kda
195,participant7_kda,0.144979,3.349315e-180,kda
253,participant9_kda,0.144269,1.918576e-178,kda
137,participant5_kda,0.142817,7.112282000000001e-175,kda
50,participant2_kda,0.141945,9.490762e-173,kda


In [46]:
# Mean absolute correlation per feature category, largest first.
average_per_category = (
    results_df
    .groupby('category')['Correlation']
    .mean()
    .sort_values(ascending=False)
)
# Two-column frame for plotting.
df_avg = average_per_category.reset_index()
df_avg.columns = ['Category', 'Correlation']
# Strip underscores from the labels - presumably because '_' is special when
# text is rendered through LaTeX (text.usetex is enabled); confirm.
df_avg['Category'] = df_avg['Category'].str.replace('_', '')

In [49]:
# NOTE(review): blue_palette and red_palette are built but not used by the plot below.
blue_palette = sns.color_palette("blend:cornflowerblue,navy", 5)  # Adjust the number if needed
red_palette = sns.color_palette("blend:lightcoral,firebrick", 5)  # Adjust the number if needed

# One colour per category, taken from the reversed "coolwarm" palette.
combined_palette = sns.color_palette("coolwarm_r", len(df_avg))

# Horizontal bar chart: one bar per feature category, bar length = mean correlation.
plt.figure(figsize=(12, 8))
ax = sns.barplot(y='Category', x='Correlation', data=df_avg, palette=combined_palette)
ax.set_ylabel('')
plt.xticks(rotation=45)
plt.title("Pearson's Correlation")
plt.tight_layout()
# Written as .pgf; the relative path depends on the current working directory.
plt.savefig('thesis/images/pearson_correlation.pgf', bbox_inches='tight')

In [41]:
# Gradient-boosted tree classifier for the binary label.
# NOTE(review): max_depth=30 is very deep; the saved output of the fit cell shows
# a KeyboardInterrupt, so this configuration may not have finished training.
bst = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=30,
    learning_rate=0.1,
)

In [42]:
# Train the classifier on the training split only (no validation set is passed).
bst.fit(X_train, y_train)

Parameters: { "device" } are not used.


KeyboardInterrupt: 

In [None]:
# Score the fitted classifier on the held-out test split.
bst.score(X_test, y_test)

In [None]:
# Per-feature scores reported by the underlying booster for importance_type='weight'.
feature_important = bst.get_booster().get_score(importance_type='weight')
keys = list(feature_important)
values = [feature_important[name] for name in keys]

# One row per feature, highest score first.
data = pd.DataFrame({"score": values}, index=keys)
data = data.sort_values(by="score", ascending=False)
data

In [None]:
# Move the feature names out of the index into a regular 'index' column.
data_reset = data.reset_index()
# Everything after the "participantN_" prefix is the feature category.
data_reset['category'] = data_reset['index'].str.extract(r'(participant\d+)_(\w+)', expand=False)[1]
# numeric_only=True: data_reset still holds the string column 'index', and
# pandas >= 2.0 raises a TypeError when asked for the mean of a non-numeric column.
average_per_category = data_reset.groupby('category').mean(numeric_only=True)
average_per_category.sort_values(by='score', ascending=False)

In [None]:
# Feature names of the training frame, in column order.
X_train.columns.values

In [None]:
# Column positions ordered from the smallest to the largest model importance.
sortedidx = np.argsort(bst.feature_importances_)
X_train.columns.values[sortedidx]

In [None]:
# Importance values in the same ascending order as the feature names shown above.
bst.feature_importances_[sortedidx]

In [None]:
# Column positions from most to least important according to the fitted model.
sorted_idx = bst.feature_importances_.argsort()[::-1]
fig, ax = plt.subplots(figsize=(25, 120))
# Draw the bars first: the categorical y-axis creates one tick per feature name.
ax.barh(X_train.columns.values[sorted_idx], bst.feature_importances_[sorted_idx])
# Size the labels afterwards. Calling set_yticklabels on the still-empty axes
# attaches the labels to ticks that the bar plot later replaces.
ax.tick_params(axis='y', labelsize=15)
ax.margins(y=0)
ax.set_xlabel("Xgboost Feature Importance")
plt.show()

In [None]:
# First 20 feature names in the order given by the current sorted_idx.
X_train.columns.values[sorted_idx].tolist()[0:20]

In [None]:
# All feature columns, reordered by sorted_idx.
# NOTE(review): this is a reordering, not a subset - if the intent was to train on
# only the top-ranked features, a slice such as [0:20] is missing; confirm.
ordered_features = X_train.columns.values[sorted_idx].tolist()
bst2 = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=80,
    learning_rate=0.1,
    device='cpu',
)
bst2.fit(X_train[ordered_features], y_train)
bst2.score(X_test[ordered_features], y_test)

In [None]:
# Settings for the native xgboost training API used in the following cells.
num_round = 100  # number of boosting rounds
param = dict(
    eta=0.05,
    max_depth=10,
    tree_method="hist",
    device="cuda",
)

In [None]:
# Wrap the training split in xgboost's native matrix type, keeping the column names.
dtrain = xgb.DMatrix(
    X_train,
    label=y_train,
    feature_names=X_train.columns.tolist(),
)
# Train a booster with the parameters and number of rounds defined above.
model = xgb.train(param, dtrain, num_boost_round=num_round)

In [None]:
# Ask the booster for per-feature prediction contributions on the training
# matrix (pred_contribs=True) with the device set to CUDA.
# NOTE(review): shap_values is reassigned by the TreeExplainer cell further down -
# confirm which of the two results the plots are meant to use.
model.set_param({"device": "cuda"})
shap_values = model.predict(dtrain, pred_contribs=True)

In [None]:
# Pairwise feature interaction contributions on the training matrix (pred_interactions=True).
# NOTE(review): the result is not used by any of the cells visible in this notebook.
shap_interaction_values = model.predict(dtrain, pred_interactions=True)

In [None]:
# Compute SHAP values for the training split with shap's tree explainer.
# This overwrites the shap_values produced by model.predict(..., pred_contribs=True).
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

In [None]:
# Bar-type SHAP summary plot over the training split.
shap.summary_plot(shap_values, X_train, plot_type="bar")

In [None]:
# Force plot for a single prediction: the first row of the training split,
# rendered with matplotlib instead of the default JavaScript widget.
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],  # SHAP values of row 0
    X_train.to_numpy()[0, :],  # feature values of row 0
    feature_names=X_train.columns.values.tolist(),
    matplotlib=True,
)

In [None]:
# Permutation importance of every feature, measured on the held-out test split.
perm_importance = permutation_importance(bst, X_test, y_test, n_repeats=10, random_state=42)
# Ascending order, so the most important feature ends up last (top of the chart).
sorted_idx = perm_importance.importances_mean.argsort()
fig, ax = plt.subplots(figsize=(15, 105))
# Plot the permutation importances themselves; the cell previously plotted
# bst.feature_importances_ in this order, which charted the wrong quantity.
ax.barh(X_test.columns.values[sorted_idx], perm_importance.importances_mean[sorted_idx])
# Size the labels after plotting so the setting applies to the categorical ticks.
ax.tick_params(axis='y', labelsize=10)
ax.margins(y=0)
ax.set_xlabel("Permutation Importance")
plt.show()

In [None]:
# Training values of the feature at the last position of sorted_idx.
# NOTE: sorted_idx is assigned in more than one cell, so which feature this shows
# depends on which of those cells ran most recently.
X_train[X_train.columns.values[sorted_idx][-1]]