In [1]:
%pip install flatten_json pymongo pandas sklearn matplotlib np xgboost seaborn
import pandas as pd
import pymongo
import sklearn
from flatten_json import flatten
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


Note: you may need to restart the kernel to use updated packages.


In [2]:
# version we want: Version 12.18.468.3251 (Sep 16 2022/13:11:56) [PUBLIC] <Releases/12.18>

mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
mongo_database = mongo_client["tft_data"]
mongo_container = mongo_database["challenger"]
version = 'Version 12.18.468.3251 (Sep 16 2022/13:11:56) [PUBLIC] <Releases/12.18>'

all_data = list(mongo_container.find({"info.game_version": version}))

In [3]:
df = pd.json_normalize(all_data, ["info", "participants"], ["metadata"])

df['played_with_partner'] = np.where(np.isnan(df['partner_group_id']), False, True)

# drop columns gold_left, last_round, level, players_eliminated, time_eliminated, total_damage_to_players, traits, units  
df = df.drop(columns=['gold_left', 'last_round', 'level', 'players_eliminated', 'time_eliminated', 'total_damage_to_players'])

# drop columns starting with companion
df = df.loc[:, ~df.columns.str.startswith('companion')]

df['match_id'] = df['metadata'].apply(lambda x: x['match_id'])
df = df.drop(columns=['metadata', 'partner_group_id'])

def get_rid_of_strings_in_string(diry, dirt):
    for d in dirt:
        diry = diry.replace(d, '')
    return diry

def get_augment_at_position(row, position):
    return get_rid_of_strings_in_string(row[position], ['TFT6_Augment_', 'TFT7_Augment_']) if len(row) > position else None

# create new columns with items of augments column
df['augment_1'] = df['augments'].apply(get_augment_at_position, position=0)
df['augment_2'] = df['augments'].apply(get_augment_at_position, position=1)
df['augment_3'] = df['augments'].apply(get_augment_at_position, position=2)
df = df.drop(columns=['augments'])

# drop colum traits
df = df.drop(columns=['traits'])

# add a column "top4" that is true if placement is 1,2,3 or 4 
df['top4'] = np.where(df['placement'] <= 4, True, False)

def units_object_array_to_unit_array_without_items(unit_object_array):
    return [f"{get_rid_of_strings_in_string(unit['character_id'], ['TFT7_', 'TFT6_'])}_{unit['tier']}" for unit in unit_object_array]


df['units'] = df['units'].apply(units_object_array_to_unit_array_without_items)

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('units')),  # type: ignore
                          columns=mlb.classes_,
                          index=df.index))

df = pd.get_dummies(df, columns=['augment_1', 'augment_2', 'augment_3'])
df = df.drop(columns=['placement', 'puuid', 'match_id'])

df_start = df
df

Unnamed: 0,played_with_partner,top4,AoShin_1,AoShin_2,AoShin_3,Aphelios_1,Aphelios_2,Aphelios_3,AquaticDragon_1,AquaticDragon_2,...,augment_3_VerdantVeil,augment_3_WarriorEmblem,augment_3_WarriorEmblem2,augment_3_WarriorTiamat,augment_3_WarriorTrait,augment_3_Weakspot,augment_3_WhispersTerrify,augment_3_WhispersTrait,augment_3_WindfallPlusPlus,augment_3_WoodlandCharm
0,True,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,True,False,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,True,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,True,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,True,False,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42851,False,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42852,False,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42853,False,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42854,False,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# compare logistic regression, random forest, xgboost, decision tree, svm, knn, naive bayes, perceptron, linear svc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_precision_recall_curve

X = df.drop(columns=['top4'])
y = df['top4']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
print(f"Logistic Regression: {acc_log}")

# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print(f"Random Forest: {acc_random_forest}")

# XGBoost

import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
acc_xgb = round(xgb_model.score(X_train, y_train) * 100, 2)
print(f"XGBoost: {acc_xgb}")

# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
print(f"Decision Tree: {acc_decision_tree}")

# KNN

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
print(f"KNN: {acc_knn}")

# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
print(f"Gaussian Naive Bayes: {acc_gaussian}")

# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, y_train)
y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, y_train) * 100, 2)
print(f"Perceptron: {acc_perceptron}")

# Stochastic Gradient Descent

from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, y_train) * 100, 2)
print(f"Stochastic Gradient Descent: {acc_sgd}")

# compare models

results = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression',
                'Random Forest', 'Naive Bayes', 'Perceptron',
                'Stochastic Gradient Decent',
                'Decision Tree', 'XGBoost'],
    'Score': [acc_knn, acc_log,
                acc_random_forest, acc_gaussian, acc_perceptron,
                acc_sgd, acc_decision_tree, acc_xgb]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
print(result_df.head(9))

# cross validation

kfold = KFold(n_splits=10, random_state=22)
xyz = []
accuracy = []
std = []
classifiers=['KNN', 'Logistic Regression',
                'Random Forest', 'Naive Bayes', 'Perceptron',
                'Stochastic Gradient Decent',
                'Decision Tree', 'XGBoost']

models=[KNeighborsClassifier(n_neighbors=3), LogisticRegression(), RandomForestClassifier(n_estimators=100), GaussianNB(), Perceptron(), SGDClassifier(), DecisionTreeClassifier(), xgb.XGBClassifier(objective="binary:logistic", random_state=42)]
for i in models:
    model = i
    cv_result = cross_val_score(model,X,y, cv = kfold, scoring = "accuracy")
    cv_result = cv_result
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe2 = pd.DataFrame({'CV Mean':xyz,'Std':std}, index=classifiers)

# visualize cross validation

plt.subplots(figsize=(12,6))
box = pd.DataFrame(accuracy, index=[classifiers])
box.T.boxplot()

new_models_dataframe2



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: 82.31
Random Forest: 100.0
XGBoost: 82.48
