In [None]:
import pandas as pd
import numpy as np
from dataset_functions import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,StratifiedKFold

# Random Forest

In [None]:
params = {
    'n_estimators': [50, 100, 150], #[100,200,250,300]
    'class_weight': ['balanced',None],
    'criterion': ['entropy','gini'],
    'max_depth': [5,10], #[None,10,20,30]
    'min_samples_leaf': [1,2,5],
}

model = RandomForestClassifier(random_state=42)
scorings = ["roc_auc","accuracy","recall","precision","f1"]
best_model = GridSearchCV(estimator=model,param_grid=params,scoring=scorings,refit="roc_auc",cv=StratifiedKFold(n_splits=5,shuffle=True), return_train_score=True,verbose=1,n_jobs=-1)

In [None]:
df, target = get_dataset()
win = target['radiant_win']

Filtering Df:  (game_mode == 2 or game_mode == 22) and game_time > 0 

Dropped:  ['lobby_type', 'chat_len', 'game_mode', 'match_id_hash'] 

Dataframe Shape:  (32153, 242) 

Target shape: (32153, 6)


In [None]:
df_tt = teamstats_teamheros_transform(df.copy())
df_tt_average = team_mean_position_transform(df_tt.copy())
df_tt_weighted = team_weighted_mean_position_transform(df_tt.copy())

df_tt = feature_selection_transform(df_tt,win,0.01)
df_tt_average = feature_selection_transform(df_tt_average,win,0.01)
df_tt_weighted = feature_selection_transform(df_tt_weighted,win,0.01)

Hero Id Labels: ['r1_hero_id', 'r2_hero_id', 'r3_hero_id', 'r4_hero_id', 'r5_hero_id', 'd1_hero_id', 'd2_hero_id', 'd3_hero_id', 'd4_hero_id', 'd5_hero_id'] 

Numbers of Heros:  115 

NaN Count:  0 

Single Player Labels: ['r1_kills', 'r1_deaths', 'r1_assists', 'r1_denies', 'r1_gold', 'r1_lh', 'r1_xp', 'r1_health', 'r1_max_health', 'r1_max_mana', 'r1_level', 'r1_x', 'r1_y', 'r1_stuns', 'r1_creeps_stacked', 'r1_camps_stacked', 'r1_rune_pickups', 'r1_firstblood_claimed', 'r1_teamfight_participation', 'r1_towers_killed', 'r1_roshans_killed', 'r1_obs_placed', 'r1_sen_placed', 'r2_kills', 'r2_deaths', 'r2_assists', 'r2_denies', 'r2_gold', 'r2_lh', 'r2_xp', 'r2_health', 'r2_max_health', 'r2_max_mana', 'r2_level', 'r2_x', 'r2_y', 'r2_stuns', 'r2_creeps_stacked', 'r2_camps_stacked', 'r2_rune_pickups', 'r2_firstblood_claimed', 'r2_teamfight_participation', 'r2_towers_killed', 'r2_roshans_killed', 'r2_obs_placed', 'r2_sen_placed', 'r3_kills', 'r3_deaths', 'r3_assists', 'r3_denies', 'r3_gold', 'r

## Random forest teamstats teamheroes

In [None]:
best_model.fit(df_tt,win)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:
best_model.cv_results_

{'mean_fit_time': array([ 3.36850796,  7.41986814, 10.5593997 ,  3.60090456,  6.97528963,
        10.93707552,  3.49894648,  7.15659556, 10.64179354,  7.36600137,
        15.44495211, 23.43167834,  7.61266418, 15.06625805, 22.33352966,
         7.09669299, 14.5078692 , 22.35001612,  3.38354635,  6.42164311,
         9.3259028 ,  2.9624156 ,  6.29170923, 10.00910788,  3.22926235,
         6.28859844,  9.74377975,  6.11203446, 12.93962841, 18.68276076,
         6.49404554, 12.73720131, 19.03813734,  6.54321995, 12.82660532,
        19.41329708,  4.09982204,  7.36987548, 11.23931389,  3.51350641,
         7.22242036, 10.9500155 ,  3.82850008,  7.23623638, 10.52326384,
         7.40612726, 14.75679049, 21.47770557,  7.27199645, 14.66911273,
        21.36584172,  7.13787622, 14.46390982, 21.02793679,  3.17327833,
         6.74563785,  9.50270643,  3.24211092,  6.41782932,  9.19921317,
         3.13408799,  6.37278337,  9.33604226,  6.0551785 , 12.4656002 ,
        18.81334534,  6.30912824, 

In [None]:
cv_results:dict = best_model.cv_results_

print("Best Parameters:\n",best_model.best_params_,'\n')

results = []

for scoring in scorings:

    mean_test_roc_label = f"mean_test_{scoring}"

    mean_test_score_list = cv_results[f"mean_test_{scoring}"]

    best_test_position = np.argmin(best_model.cv_results_["rank_test_roc_auc"])

    mean_train_roc_label = f"mean_train_{scoring}"

    mean_train_score_list = cv_results[f"mean_train_{scoring}"]

    results.append({"Scoring": scoring, "Mean Test Score": mean_test_score_list[best_test_position], "Mean Train Score": mean_train_score_list[best_test_position]})

df_results = pd.DataFrame(results)

df_results

Best Parameters:
 {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 150} 



Unnamed: 0,Scoring,Mean Test Score,Mean Train Score
0,roc_auc,0.799801,0.911544
1,accuracy,0.71561,0.827092
2,recall,0.762583,0.861599
3,precision,0.716364,0.81964
4,f1,0.738697,0.840087


In [None]:
depths = [tree.get_depth() for tree in best_model.best_estimator_.estimators_]
print(np.max(depths))
print(np.min(depths))
print(np.mean(depths))

10
10
10.0


In [None]:
feature_importance = {
    name: value
    for name,value in zip(best_model.best_estimator_.feature_names_in_,best_model.best_estimator_.feature_importances_)
}

feature_importance = dict(reversed(sorted(feature_importance.items(), key=lambda item: item[1])))
feature_names = list(feature_importance.keys())
print("Most Important:\n", feature_names[:10])
print("Least Important:\n", feature_names[-10:])


Most Important:
 ['d_towers_killed', 'd_kills', 'r_towers_killed', 'r_kills', 'r_deaths', 'd_deaths', 'd_gold', 'r_rune_pickups', 'r_gold', 'r5_x']
Least Important:
 ['d3_x', 'd_lh', 'r_health', 'd5_y', 'd_max_mana', 'd4_y', 'r_level', 'd_level', 'r_max_mana', 'game_time']


# Random forest teamstats teamheroes + average position

In [23]:
best_model.fit(df_tt_average,win)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [24]:
best_model.cv_results_

{'mean_fit_time': array([ 3.59956179,  6.93062286, 10.70651908,  3.41351204,  7.00031748,
        10.41279964,  3.61599464,  7.28941994, 11.01637359,  7.28742752,
        14.51440692, 22.0852788 ,  7.26860704, 14.34817495, 21.66682129,
         7.16909556, 13.93132339, 21.1345027 ,  3.04804673,  6.15665307,
         9.47744632,  3.34782863,  6.15256767,  9.42713046,  3.33765392,
         6.18975511,  9.0545733 ,  5.74173684, 11.4200295 , 17.52550097,
         5.88171844, 11.71105299, 17.33438578,  5.54805799, 11.47151189,
        16.71041989,  3.64951253,  6.89569883, 10.6514863 ,  3.43401217,
         7.32548656, 11.03914342,  3.74024687,  7.15720716, 10.93245473,
         7.21100359, 14.20457187, 21.35917134,  6.95595961, 13.90723619,
        21.45435314,  6.7023324 , 13.60996141, 20.16195407,  2.94668636,
         6.24455609,  9.00310717,  2.97403259,  6.27048197,  9.01494546,
         3.11588697,  5.89976959,  8.95720787,  5.76156735, 11.24408731,
        17.49874887,  5.971877  , 

In [25]:
cv_results:dict = best_model.cv_results_

print("Best Parameters:\n",best_model.best_params_,'\n')

results = []

for scoring in scorings:

    mean_test_roc_label = f"mean_test_{scoring}"

    mean_test_score_list = cv_results[f"mean_test_{scoring}"]

    best_test_position = np.argmin(best_model.cv_results_["rank_test_roc_auc"])

    mean_train_roc_label = f"mean_train_{scoring}"

    mean_train_score_list = cv_results[f"mean_train_{scoring}"]

    results.append({"Scoring": scoring, "Mean Test Score": mean_test_score_list[best_test_position], "Mean Train Score": mean_train_score_list[best_test_position]})

df_results = pd.DataFrame(results)

df_results

Best Parameters:
 {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 150} 



Unnamed: 0,Scoring,Mean Test Score,Mean Train Score
0,roc_auc,0.803675,0.888233
1,accuracy,0.718471,0.79327
2,recall,0.759101,0.83117
3,precision,0.721444,0.78821
4,f1,0.739749,0.809115


In [26]:
depths = [tree.get_depth() for tree in best_model.best_estimator_.estimators_]
print(np.max(depths))
print(np.min(depths))
print(np.mean(depths))

10
10
10.0


In [27]:
feature_importance = {
    name: value
    for name,value in zip(best_model.best_estimator_.feature_names_in_,best_model.best_estimator_.feature_importances_)
}

feature_importance = dict(reversed(sorted(feature_importance.items(), key=lambda item: item[1])))
feature_names = list(feature_importance.keys())
print("Most Important:\n", feature_names[:10])
print("Least Important:\n", feature_names[-10:])


Most Important:
 ['radiant_avg_y', 'radiant_avg_x', 'dire_avg_x', 'dire_avg_y', 'r_towers_killed', 'd_kills', 'r_kills', 'r_deaths', 'd_towers_killed', 'd_deaths']
Least Important:
 ['d_health', 'd_max_health', 'r_health', 'd_lh', 'r_lh', 'r_level', 'd_level', 'd_max_mana', 'r_max_mana', 'game_time']


# Random forest teamstats teamheroes + weighted average position

In [28]:
best_model.fit(df_tt_weighted,win)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [35]:
best_model.cv_results_

{'mean_fit_time': array([ 4.20684996,  8.65327568, 12.56515598,  3.8908711 ,  8.26610498,
        12.25947714,  4.02231054,  8.15330091, 12.1625463 ,  7.54305081,
        15.78928695, 22.97164359,  7.80396714, 15.41209712, 23.2611752 ,
         8.5514174 , 15.42764935, 24.01304874,  3.44734998,  7.52265368,
        10.74369378,  3.77008448,  7.18679905, 10.79596725,  3.34240355,
         7.53226094, 10.46471677,  6.61116304, 13.50836353, 19.49708114,
         6.536131  , 12.88287354, 19.52677093,  6.47058921, 12.85908608,
        19.64561148,  4.48682208,  8.67233849, 12.99454699,  4.19748583,
         8.90110168, 12.94639955,  4.28721256,  8.59261236, 12.47834086,
         7.89250112, 16.0640099 , 24.01301107,  7.8990047 , 15.91529312,
        24.53720727,  7.95627751, 16.12513561, 24.12650371,  3.39749618,
         7.41727633, 10.38695049,  3.55424843,  6.78571358, 10.54871097,
         3.41567917,  7.09029446, 10.52642856,  6.3513279 , 13.16377459,
        19.79367638,  6.32615695, 

In [36]:
cv_results:dict = best_model.cv_results_

print("Best Parameters:\n",best_model.best_params_,'\n')

results = []

for scoring in scorings:

    mean_test_roc_label = f"mean_test_{scoring}"

    mean_test_score_list = cv_results[f"mean_test_{scoring}"]

    best_test_position = np.argmin(best_model.cv_results_["rank_test_roc_auc"])

    mean_train_roc_label = f"mean_train_{scoring}"

    mean_train_score_list = cv_results[f"mean_train_{scoring}"]

    results.append({"Scoring": scoring, "Mean Test Score": mean_test_score_list[best_test_position], "Mean Train Score": mean_train_score_list[best_test_position]})

df_results = pd.DataFrame(results)

df_results

Best Parameters:
 {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 150} 



Unnamed: 0,Scoring,Mean Test Score,Mean Train Score
0,roc_auc,0.803117,0.89065
1,accuracy,0.71676,0.78911
2,recall,0.812024,0.878356
3,precision,0.699187,0.759316
4,f1,0.751374,0.814507


In [37]:
depths = [tree.get_depth() for tree in best_model.best_estimator_.estimators_]
print(np.max(depths))
print(np.min(depths))
print(np.mean(depths))

10
10
10.0


In [38]:
feature_importance = {
    name: value
    for name,value in zip(best_model.best_estimator_.feature_names_in_,best_model.best_estimator_.feature_importances_)
}

feature_importance = dict(reversed(sorted(feature_importance.items(), key=lambda item: item[1])))
feature_names = list(feature_importance.keys())
print("Most Important:\n", feature_names[:10])
print("Least Important:\n", feature_names[-10:])


Most Important:
 ['radiant_Weighted_avg_y', 'radiant_Weighted_avg_x', 'dire_Weighted_avg_y', 'dire_Weighted_avg_x', 'r_deaths', 'r_kills', 'd_kills', 'd_towers_killed', 'r_towers_killed', 'd_deaths']
Least Important:
 ['d_health', 'r_lh', 'd_lh', 'r_health', 'd_max_health', 'r_level', 'd_level', 'r_max_mana', 'd_max_mana', 'game_time']
