# Gradient Boosting 모델

In [1]:
import warnings
# Ignore every warning for the remainder of the session. Note that this also
# hides deprecation warnings emitted by pandas / scikit-learn in later cells.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Work around matplotlib's Korean (Hangul) glyph rendering problem by
# switching the default font to Malgun Gothic.
from matplotlib import font_manager, rc
# NOTE(review): the font path is a hardcoded Windows location; this line will
# fail on machines where that file does not exist.
font_name = font_manager.FontProperties(fname = "c:/Windows/Fonts/malgun.ttf").get_name()
# Apply the resolved family name as the global matplotlib font.
rc('font', family = font_name)

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm 

In [4]:
# Load the prepared match table and discard the 'Unnamed: 0' column
# (presumably the row index written out when the CSV was saved -- confirm).
final_df = (
    pd.read_csv('C:/Users/User/ML/final_df.csv', header=0, engine="python")
    .drop(columns='Unnamed: 0')
)
# Preview the first rows.
final_df.head()

Unnamed: 0,gameDuration,blueWins,blueFirstBlood,blueFirstTower,blueFirstBaron,blueFirstDragon,blueFirstInhibitor,dragonKillsDiff,baronKillsDiff,inhibitorKillsDiff,...,killsDiff,assistDiff,championDamageDealtDiff,totalGoldDiff,totalMinionKillsDiff,avgLevelDiff,killingSpreeDiff,objectDamageDealtDiff,avgKDADiff,sustainedWardsDiff
0,20분이상 30분미만,0,1,1,0,0,0,-3,0,0,...,-0.461538,-0.551724,-0.112096,-0.000965,0.029482,-0.032258,-0.333333,0.145882,-0.79351,-0.191489
1,20분이상 30분미만,0,0,1,0,0,0,-2,0,0,...,-0.571429,-0.653846,-0.189156,-0.050191,0.049394,-0.058824,-0.6,-0.061948,-0.881579,-0.342105
2,20분이상 30분미만,1,0,0,1,1,1,-1,1,2,...,0.395349,0.46988,0.023432,0.111697,0.016393,0.064748,0.555556,0.270663,0.714286,0.030928
3,20분이상 30분미만,0,1,1,0,1,0,1,0,0,...,-0.095238,-0.278689,0.05094,-0.038586,-0.033852,-0.0625,-0.166667,-0.212191,-0.293424,-0.044444
4,30분이상 40분미만,1,1,1,0,1,1,3,0,2,...,0.166667,0.195876,0.164727,0.042377,-0.042332,0.025974,0.2,0.374054,0.340886,-0.045455


In [5]:
# Lower edges, in minutes, of the 10-minute game-duration buckets: 20, 30, 40.
game_length_bins = [*range(20, 50, 10)]
# One label per bucket, written the same way as the values compared against
# the `gameDuration` column later in the notebook.
game_length_bins_label = [
    f"{lower}분이상 {lower + 10}분미만"
    for lower in game_length_bins
]

In [6]:
from sklearn.metrics import accuracy_score
# Empty results table: one row per game-duration label, with a single
# 'accuracy' column that the training loop fills in.
result_mat = pd.DataFrame(
    index=game_length_bins_label,
    columns=['accuracy'],
)

In [7]:
import eli5
from eli5.sklearn import PermutationImportance

In [8]:
# Fit and score one gradient-boosting classifier per game-duration bucket,
# storing each bucket's test accuracy in `result_mat`.
for x in tqdm(game_length_bins_label):
    # Boolean mask selecting the matches that belong to the current bucket.
    temp = final_df['gameDuration'] == x
    X = final_df[temp].drop(columns=['gameDuration','blueWins'])
    Y = final_df[temp]['blueWins']

    # Drop rows that contain NaN or +/-inf; this prevents scikit-learn's
    # "Input contains NaN, infinity or a value too large for dtype('float64')"
    # error. `axis` is passed by keyword because pandas 2.0 made the arguments
    # of DataFrame.any keyword-only (the positional form `.any(1)` raises
    # TypeError there and was already deprecated in pandas 1.x).
    indices_to_keep = ~X.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    X = X[indices_to_keep].astype(np.float64)
    Y = Y[indices_to_keep]

    # Stratified 70/30 split with a fixed seed so results are reproducible.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, stratify=Y, train_size=0.7, test_size=0.3, random_state=1234
    )

    gb = GradientBoostingClassifier(random_state=0)
    gb.fit(X_train, Y_train)

    # Score on the held-out 30 %.
    Y_predict_gb = gb.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_predict_gb)

    result_mat.loc[x] = [accuracy]

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:40<00:00, 33.54s/it]


In [9]:
result_mat
# The longer the game lasts, the lower the prediction accuracy.

Unnamed: 0,accuracy
20분이상 30분미만,0.994854
30분이상 40분미만,0.983706
40분이상 50분미만,0.919118


In [10]:
# Permutation feature importance for the 20-30 minute bucket.
temp = final_df['gameDuration'] == '20분이상 30분미만'
X = final_df[temp].drop(columns=['gameDuration','blueWins'])
Y = final_df[temp]['blueWins']

# Drop rows that contain NaN or +/-inf before casting to float64.
# `axis` is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
# keyword-only, so the positional form `.any(1)` raises TypeError there.
indices_to_keep = ~X.isin([np.nan, np.inf, -np.inf]).any(axis=1)
X = X[indices_to_keep].astype(np.float64)
Y = Y[indices_to_keep]

# Stratified 70/30 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, stratify=Y, train_size=0.7, test_size=0.3, random_state=1234
)

gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, Y_train)

# Importance is measured on the held-out split by shuffling each feature.
perm = PermutationImportance(gb, random_state = 42).fit(X_test, Y_test)

# Feature importance for the 20-30 minute bucket. train_test_split keeps
# X_test as a DataFrame, so its columns can label the features directly.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

Weight,Feature
0.0593  ± 0.0021,avgLevelDiff
0.0244  ± 0.0009,towerKillsDiff
0.0061  ± 0.0005,totalGoldDiff
0.0027  ± 0.0010,inhibitorKillsDiff
0.0015  ± 0.0003,assistDiff
0.0006  ± 0.0004,totalMinionKillsDiff
0.0003  ± 0.0002,blueFirstTower
0.0002  ± 0.0001,blueFirstBlood
0.0001  ± 0.0002,objectDamageDealtDiff
0.0001  ± 0.0002,killsDiff


In [11]:
# Permutation feature importance for the 30-40 minute bucket.
temp = final_df['gameDuration'] == '30분이상 40분미만'
X = final_df[temp].drop(columns=['gameDuration','blueWins'])
Y = final_df[temp]['blueWins']

# Drop rows that contain NaN or +/-inf before casting to float64.
# `axis` is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
# keyword-only, so the positional form `.any(1)` raises TypeError there.
indices_to_keep = ~X.isin([np.nan, np.inf, -np.inf]).any(axis=1)
X = X[indices_to_keep].astype(np.float64)
Y = Y[indices_to_keep]

# Stratified 70/30 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, stratify=Y, train_size=0.7, test_size=0.3, random_state=1234
)

gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, Y_train)

# Importance is measured on the held-out split by shuffling each feature.
perm = PermutationImportance(gb, random_state = 42).fit(X_test, Y_test)

# Feature importance for the 30-40 minute bucket. train_test_split keeps
# X_test as a DataFrame, so its columns can label the features directly.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

Weight,Feature
0.0999  ± 0.0033,avgLevelDiff
0.0814  ± 0.0047,towerKillsDiff
0.0096  ± 0.0024,inhibitorKillsDiff
0.0036  ± 0.0004,assistDiff
0.0025  ± 0.0008,totalMinionKillsDiff
0.0022  ± 0.0002,baronKillsDiff
0.0013  ± 0.0013,totalGoldDiff
0.0006  ± 0.0005,blueFirstInhibitor
0.0005  ± 0.0006,objectDamageDealtDiff
0.0002  ± 0.0003,wardkillsDiff


In [12]:
# Permutation feature importance for the 40-50 minute bucket.
temp = final_df['gameDuration'] == '40분이상 50분미만'
X = final_df[temp].drop(columns=['gameDuration','blueWins'])
Y = final_df[temp]['blueWins']

# Drop rows that contain NaN or +/-inf before casting to float64.
# `axis` is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
# keyword-only, so the positional form `.any(1)` raises TypeError there.
indices_to_keep = ~X.isin([np.nan, np.inf, -np.inf]).any(axis=1)
X = X[indices_to_keep].astype(np.float64)
Y = Y[indices_to_keep]

# Stratified 70/30 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, stratify=Y, train_size=0.7, test_size=0.3, random_state=1234
)

gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, Y_train)

# Importance is measured on the held-out split by shuffling each feature.
perm = PermutationImportance(gb, random_state = 42).fit(X_test, Y_test)

# Feature importance for the 40-50 minute bucket. train_test_split keeps
# X_test as a DataFrame, so its columns can label the features directly.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

Weight,Feature
0.1387  ± 0.0221,towerKillsDiff
0.0635  ± 0.0141,avgLevelDiff
0.0301  ± 0.0144,totalGoldDiff
0.0103  ± 0.0037,totalMinionKillsDiff
0.0049  ± 0.0064,assistDiff
0.0020  ± 0.0020,blueFirstInhibitor
0.0012  ± 0.0022,killsDiff
0.0007  ± 0.0025,sustainedWardsDiff
0.0007  ± 0.0040,objectDamageDealtDiff
0  ± 0.0000,killingSpreeDiff


In [14]:
# GBM feature importance, top 5: for each feature, how many of the three
# duration buckets placed it 1st, 2nd, 3rd, 4th and 5th (counts entered by
# hand from the permutation-importance tables).
rank_columns = ['features','1위','2위','3위','4위','5위']
rank_counts = [
    ('avgLevelDiff',       [2, 1, 0, 0, 0]),
    ('towerKillsDiff',     [1, 2, 0, 0, 0]),
    ('totalGoldDiff',      [0, 0, 2, 0, 0]),
    ('inhibitorKillsDiff', [0, 0, 1, 1, 0]),
    ('assistDiff',         [0, 0, 0, 1, 2]),
]

feature_importance_rank = pd.DataFrame(columns=rank_columns, index=[1,2,3,4,5])
# Fill the pre-allocated rows 1..5 in order.
for position, (feature, counts) in enumerate(rank_counts, start=1):
    feature_importance_rank.loc[position] = [feature, *counts]

feature_importance_rank.head()

Unnamed: 0,features,1위,2위,3위,4위,5위
1,avgLevelDiff,2,1,0,0,0
2,towerKillsDiff,1,2,0,0,0
3,totalGoldDiff,0,0,2,0,0
4,inhibitorKillsDiff,0,0,1,1,0
5,assistDiff,0,0,0,1,2
