In [1]:
import pandas as pd
import numpy as np
from dataset_functions import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,StratifiedKFold

# Random Forest

In [None]:
params = {
    'n_estimators': [100,200,250],
    'class_weight': ['balanced'],
    'criterion': ['entropy','gini'],
}

model = RandomForestClassifier()

In [3]:
df, target = get_dataset()
win = target['radiant_win']

Filtering Df:  (game_mode == 2 or game_mode == 22) and game_time > 0 

Dropped:  ['lobby_type', 'chat_len', 'game_mode', 'match_id_hash'] 

Dataframe Shape:  (32153, 242) 

Target shape: (32153, 6)


In [4]:
df_tt = teamstats_teamheros_transform(df.copy())
df_tt = feature_selection_transform(df_tt,win,0.01)

Hero Id Labels: ['r1_hero_id', 'r2_hero_id', 'r3_hero_id', 'r4_hero_id', 'r5_hero_id', 'd1_hero_id', 'd2_hero_id', 'd3_hero_id', 'd4_hero_id', 'd5_hero_id'] 

Numbers of Heros:  115 

NaN Count:  0 

Single Player Labels: ['r1_kills', 'r1_deaths', 'r1_assists', 'r1_denies', 'r1_gold', 'r1_lh', 'r1_xp', 'r1_health', 'r1_max_health', 'r1_max_mana', 'r1_level', 'r1_x', 'r1_y', 'r1_stuns', 'r1_creeps_stacked', 'r1_camps_stacked', 'r1_rune_pickups', 'r1_firstblood_claimed', 'r1_teamfight_participation', 'r1_towers_killed', 'r1_roshans_killed', 'r1_obs_placed', 'r1_sen_placed', 'r2_kills', 'r2_deaths', 'r2_assists', 'r2_denies', 'r2_gold', 'r2_lh', 'r2_xp', 'r2_health', 'r2_max_health', 'r2_max_mana', 'r2_level', 'r2_x', 'r2_y', 'r2_stuns', 'r2_creeps_stacked', 'r2_camps_stacked', 'r2_rune_pickups', 'r2_firstblood_claimed', 'r2_teamfight_participation', 'r2_towers_killed', 'r2_roshans_killed', 'r2_obs_placed', 'r2_sen_placed', 'r3_kills', 'r3_deaths', 'r3_assists', 'r3_denies', 'r3_gold', 'r

In [7]:
scorings = ["roc_auc","accuracy","recall","precision","f1"]
best_model = GridSearchCV(estimator=model,param_grid=params,scoring=scorings,refit="roc_auc",cv=StratifiedKFold(n_splits=5,shuffle=True),return_train_score=True)

best_model.fit(df_tt,win)

In [8]:
best_model.cv_results_

{'mean_fit_time': array([21.09880719]),
 'std_fit_time': array([7.28749591]),
 'mean_score_time': array([0.53894048]),
 'std_score_time': array([0.19813256]),
 'param_class_weight': masked_array(data=['balanced'],
              mask=[False],
        fill_value=np.str_('?'),
             dtype=object),
 'params': [{'class_weight': 'balanced'}],
 'split0_test_roc_auc': array([0.79114307]),
 'split1_test_roc_auc': array([0.79393578]),
 'split2_test_roc_auc': array([0.81055336]),
 'split3_test_roc_auc': array([0.79505963]),
 'split4_test_roc_auc': array([0.80718765]),
 'mean_test_roc_auc': array([0.7995759]),
 'std_test_roc_auc': array([0.0077687]),
 'rank_test_roc_auc': array([1], dtype=int32),
 'split0_train_roc_auc': array([1.]),
 'split1_train_roc_auc': array([1.]),
 'split2_train_roc_auc': array([1.]),
 'split3_train_roc_auc': array([1.]),
 'split4_train_roc_auc': array([1.]),
 'mean_train_roc_auc': array([1.]),
 'std_train_roc_auc': array([0.]),
 'split0_test_accuracy': array([0.7036

In [None]:
cv_results:dict = best_model.cv_results_

print("Best Parameters:\n",best_model.best_params_,'\n')

results = []

for scoring in scorings:
    
    mean_test_roc_label = f"mean_test_{scoring}"

    mean_test_score_list = cv_results[f"mean_test_{scoring}"]

    best_test_position = np.argmin(best_model.cv_results_["rank_test_roc_auc"])

    mean_train_roc_label = f"mean_train_{scoring}"

    mean_train_score_list = cv_results[f"mean_train_{scoring}"]

    results.append({"Scoring": scoring, "Mean Test Score": mean_test_score_list[best_test_position], "Mean Train Score": mean_train_score_list[best_test_position]})

df_results = pd.DataFrame(results)

df_results

Best Parameters:
 {'class_weight': 'balanced'} 



Unnamed: 0,Scoring,Mean Test Score,Mean Train Score
0,roc_auc,0.799576,1.0
1,accuracy,0.715921,1.0
2,recall,0.762169,1.0
3,precision,0.716905,1.0
4,f1,0.738819,1.0


In [None]:
depths = [tree.get_depth() for tree in best_model.best_estimator_.estimators_]
print(np.max(depths))
print(np.min(depths))
print(np.mean(depths))

41
27
32.89


In [34]:
feature_importance = {
    name: value 
    for name,value in zip(best_model.best_estimator_.feature_names_in_,best_model.best_estimator_.feature_importances_)
}

feature_importance = dict(reversed(sorted(feature_importance.items(), key=lambda item: item[1])))
feature_names = list(feature_importance.keys())
print("Most Important:\n", feature_names[:10])
print("Least Important:\n", feature_names[-10:])


Most Important:
 ['d_gold', 'r_kills', 'd_kills', 'd_deaths', 'd_health', 'r_deaths', 'r1_y', 'r_gold', 'r_denies', 'r_health']
Least Important:
 ['r4_x', 'd3_x', 'd2_x', 'd_assists', 'r_towers_killed', 'd4_y', 'r_assists', 'game_time', 'r_level', 'd_level']
