# Data Preprocessing & Modeling

In [None]:
!pip install sklearn
!pip install xgboost

In [None]:
from ast import literal_eval

import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from xgboost import plot_importance

In [None]:
# Columns that are parsed with ast.literal_eval while reading the CSV
# (the *_diff columns are later indexed per minute, so they must become Python lists).
literal_columns = [
    "gold_diff",
    "kill_diff",
    "dragon_diff",
    "baron_diff",
    "herald_diff",
    "tower_diff",
    "inhib_diff",
    "gameId",
]
csv_converters = {column: literal_eval for column in literal_columns}
df = pd.read_csv('basis_dataset.csv', converters=csv_converters)

In [None]:
# Display the dataset as loaded from the CSV
df

## Preprocessing

In [None]:
 #Delete early surrenders (< 15 Min GameDuration)
# The length of gold_diff is used as the game duration (presumably one entry per
# minute -- the per-minute split further down indexes it that way).
# Select the short games with a mask and drop them by index label in one call.
# The previous loop dropped df.index[<number of rows dropped so far>], i.e. it
# removed rows from the top of the frame rather than the short games themselves,
# and it mutated the frame while iterating over it.
MIN_GAME_MINUTES = 15
short_games = df.index[df["gold_diff"].map(len) < MIN_GAME_MINUTES]
df.drop(index=short_games, inplace=True)

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index written out when the
# CSV was saved -- it carries no game information) and show the result.
df.drop(columns=["Unnamed: 0"], inplace=True)
df

In [None]:
#Split basis_dataset into samples per min
# For every minute 0..MAX_MINUTE one frame is built that holds, for each game, the
# value of every column at that minute. Cells that cannot be indexed at that minute
# get the "N/A" placeholder, which a later cell filters out. All per-minute frames
# are stacked into bigdf.
MAX_MINUTE = 60
final_df = pd.DataFrame()

dfcontainer = []
for i in range(MAX_MINUTE + 1):
    # Start from a dtype-converted copy of df so the frame keeps df's index and columns.
    transformed_df = df.convert_dtypes()
    for cols in df.columns:
        valuelist = []
        # Iterate over the column itself; calling df.iterrows() for every column
        # and every minute rebuilt each row as a Series just to read one cell.
        for cell in df[cols]:
            try:
                valuelist.append(cell[i])
            except (IndexError, TypeError):
                # IndexError: no entry at minute i (game ended earlier, or the cell
                # is a NumPy scalar). TypeError: the cell is a plain Python scalar.
                # Narrowed from a bare `except:` so that KeyboardInterrupt and
                # genuine programming errors are no longer swallowed.
                valuelist.append("N/A")
        transformed_df[cols] = valuelist
    transformed_df["timestamp"] = i
    # Restore the per-game columns that the loop above overwrote.
    transformed_df["Team100win"] = df.Team100win
    transformed_df["gameId"] = df.gameId
    dfcontainer.append(transformed_df)

bigdf = pd.concat(dfcontainer, ignore_index=True)

In [None]:
# Display the stacked per-minute frame (still contains the "N/A" placeholder rows)
bigdf

In [None]:
#Clear all rows with N/A
# Rows whose gold_diff is the "N/A" placeholder are removed in place, then the
# index is renumbered from zero.
na_rows = bigdf.index[bigdf["gold_diff"] == "N/A"]
bigdf.drop(index=na_rows, inplace=True)
bigdf.reset_index(inplace=True, drop=True)

In [None]:
#Check cleaned dataset
# Display bigdf after the "N/A" rows were removed and the index was reset
bigdf

## Preliminary Modeling

In [None]:
# Build the model inputs from bigdf

# Feature matrix: every column except the label (Team100win) and the game
# identifier (gameId), with all remaining columns cast to int.
X = bigdf.drop(columns=["Team100win", "gameId"]).astype(int)
X

In [None]:
#Target: the Team100win label, kept as a one-column DataFrame
y = bigdf["Team100win"].to_frame()
y

In [None]:
# Split by game, not by row. bigdf holds one row per (game, minute), so a plain
# row-level random split puts minutes of the same match into both the training and
# the test set and the test score then measures memorised games (data leakage).
# GroupShuffleSplit keeps all rows of one gameId on the same side; the 80/20
# proportion now refers to games rather than rows.
game_splitter = GroupShuffleSplit(n_splits=1, train_size=0.80, test_size=0.20, random_state=42)
train_rows, test_rows = next(game_splitter.split(X, y, groups=bigdf["gameId"]))
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]


In [None]:
# Baseline classifier with default hyperparameters.
# `missing` is deliberately left at its default (NaN): the former `missing=1` told
# XGBoost to treat every feature value equal to 1 (e.g. a kill/tower difference of 1,
# or minute 1) as a missing value. The feature matrix is cast to int beforehand, so
# it contains no real missing values.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# Training stops once "aucpr" on the eval_set has not improved for 10 rounds.
# NOTE(review): the eval_set is the test split, so the accuracy reported afterwards
# is optimistic; a separate validation split would be cleaner.
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            early_stopping_rounds=10,
            eval_metric="aucpr",
            eval_set=[(X_test, y_test)])

In [None]:
# Predict the test split; each prediction is passed through round() before scoring
y_pred = clf_xgb.predict(X_test)
predictions = list(map(round, y_pred))
# Share of correctly predicted test rows, printed as a percentage
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## Model tuning

In [None]:
#eval on test&train set for learning curve
# Order matters: evals_result() reports the first entry as 'validation_0' (train)
# and the second as 'validation_1' (test).
evalset = [(X_train, y_train), (X_test,y_test)]
# `missing` is left at its default (NaN); the former `missing=1` made XGBoost treat
# every feature value equal to 1 as missing.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', seed=42)
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            early_stopping_rounds=10,
            eval_metric="logloss",
            eval_set=evalset)

In [None]:
# Metric history recorded during fit; read below as results['validation_N'][metric]
results = clf_xgb.evals_result()

In [None]:
# Learning curve: log loss per boosting round on the training split
# ('validation_0', first eval_set entry) and on the test split ('validation_1').
fig, ax = pyplot.subplots()
ax.plot(results['validation_0']['logloss'], label='train')
ax.plot(results['validation_1']['logloss'], label='test')
# Label the figure so it can be read on its own
ax.set_xlabel('Boosting round')
ax.set_ylabel('Log loss')
ax.set_title('Learning curve')
# show the legend
ax.legend()
# show the plot
pyplot.show()


In [None]:
#Find optimal hyperparameter with GridSearch
# Candidate values per hyperparameter; the grid search tries every combination.
param_grid = dict(
    max_depth=[10, 11, 12],
    learning_rate=[0.2],
    gamma=[0, 0.5, 0.25],
    reg_lambda=[0, 0.5, 0.25, 0.75],
)

# 5-fold cross-validated search scored by ROC AUC, run on 10 parallel jobs.
grid_estimator = xgb.XGBClassifier(objective='binary:logistic', seed=42)
optimal_params = GridSearchCV(
    estimator=grid_estimator,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=10,
    verbose=2,
)


In [None]:
# Extra keyword arguments are forwarded by GridSearchCV to the estimator's fit().
# NOTE(review): early stopping is evaluated on the test split here, so the search
# is not fully independent of the data used for the final evaluation.
fit_kwargs = {
    'early_stopping_rounds': 10,
    'eval_metric': 'auc',
    'eval_set': [(X_test, y_test)],
    'verbose': False,
}
optimal_params.fit(X_train, y_train, **fit_kwargs)

# Best hyperparameter combination found by the search
print(optimal_params.best_params_)


## Final Model

In [None]:
# Final classifier with fixed hyperparameters (values taken from the grid above).
# - `learning_rate` replaces the misspelled `learn_rate`, which XGBoost does not
#   recognise, so the model silently trained with the default learning rate.
# - `missing` is left at its default (NaN); the former `missing=1` made XGBoost treat
#   every feature value equal to 1 as missing.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            seed=42,
                            gamma=0,
                            learning_rate=0.2,
                            max_depth=12,
                            reg_lambda=0.5,
                            subsample=1,
                            colsample_bytree=1)

# Training stops once "aucpr" on the eval_set has not improved for 10 rounds.
# NOTE(review): the eval_set is the test split, so metrics computed on it afterwards
# are optimistic.
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            early_stopping_rounds=10,
            eval_metric="aucpr",
            eval_set=[(X_test, y_test)])

In [None]:
# Predict the test split with the final model; each prediction goes through round()
y_pred = clf_xgb.predict(X_test)
predictions = list(map(round, y_pred))
# Share of correctly predicted test rows, printed as a percentage
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Confusion matrix of the final model on the test split.
# display_labels are applied in the sorted order of the class labels, so the
# negative class (Team100win == 0 / False) comes first: "lose" must precede "win".
# The former ["win", "lose"] labelled the axes the wrong way round.
# (Assumes Team100win is encoded as 0/1 or False/True -- confirm against the data.)
plot_confusion_matrix(clf_xgb,
                      X_test,
                      y_test,
                      values_format="d",
                      display_labels=["lose", "win"])

In [None]:
#Feature importance
# Bar chart of the importance score xgboost assigns to each feature of the final model
importance_ax = plot_importance(clf_xgb)
pyplot.show()

In [None]:
#Single Game Prediction
# Build the feature rows of one game exactly like the training matrix X: drop the
# label and the identifier, then cast to int. Chaining the calls yields a new frame,
# which avoids the in-place drop on a slice of bigdf (SettingWithCopyWarning) and
# the object-dtype columns that bigdf carries from the "N/A" placeholder stage.
SINGLE_GAME_ID = 1490579
singlematch = (
    bigdf.loc[bigdf.gameId == SINGLE_GAME_ID]
    .drop(columns=["Team100win", "gameId"])
    .astype(int)
)
singlematch.head()

In [None]:
# predict_proba returns one column per class in sorted class order:
# column 0 = P(Team100win == 0), column 1 = P(Team100win == 1).
# Team100's win probability is therefore column 1 and Team200's is column 0;
# the two were previously swapped.
# (Assumes Team100win is encoded as 0/1 or False/True -- confirm against the data.)
results_single = clf_xgb.predict_proba(singlematch)
Team100 = results_single[:, 1]
Team200 = results_single[:, 0]

In [None]:
# Win probability of both teams across the rows of the selected game
# (one row per minute; the x position is the row's position within the game).
fig, ax = pyplot.subplots()
ax.plot(Team100, label="Team100")
ax.plot(Team200, label="Team200")
ax.set_xlabel('Minute', fontsize='x-large')
# Axis label spelling corrected ("probabilty" -> "probability")
ax.set_ylabel('Win probability', fontsize='x-large')
ax.legend()
pyplot.show()