I created a new notebook because the one with XGBoost required too much RAM. This one is built not to study the model, but so that it can be compared with the FBT in terms of F1 scores.

#NONOTEBOOK

In [6]:
# just to be sure, we will retrain the final model
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))

from itertools import product #for grid search
from sklearn.tree import DecisionTreeClassifier # for classification
from tqdm import tqdm # for progress bar
from sklearn.metrics import classification_report, f1_score # for evaluation
from utility.classification_utility import make_dataset_for_classification
from imblearn.over_sampling import RandomOverSampler # for oversampling
import json
def f1_macro(x, y):
    """Return the macro-averaged F1 score used for our evaluation.

    Thin wrapper around ``f1_score(x, y, average='macro')``; arguments are
    forwarded positionally, so ``x`` is the ground truth and ``y`` the
    predictions at every call site in this notebook.
    """
    return f1_score(x, y, average='macro')


cyc_path = '../dataset/cyclists_cleaned.csv'
races_path = '../dataset/races_cleaned.csv'
print('Loading data...')
dataset = make_dataset_for_classification(races_df=races_path, cyclists_df=cyc_path)

# Feature columns fed to the classifier.
TO_USE_COLS = [
    # over time
    'total_points', 'avg_points_per_race',
    'average_position', 'avg_speed_cyclist',
    'race_count',
    'average_position_var',
    # race related
    'profile',
    'startlist_quality', 'cyclist_age_rac', 'steepness',
    'partecipants_number',
    # cyclist related
    'bmi',
]

# Boundaries of the temporal split: training is [TRAIN_START, VAL_START),
# validation is [VAL_START, VAL_END).
# NOTE(review): 'date' is compared against ISO-formatted strings; confirm the
# column is a datetime or an ISO date string so the comparison is chronological.
TRAIN_START = '1996-01-01'
VAL_START = '2019-01-01'
VAL_END = '2022-01-01'


def _rows_in_date_window(df, start, end):
    """Return the rows of ``df`` whose 'date' lies in the half-open window [start, end)."""
    return df[(df['date'] >= start) & (df['date'] < end)]


# Training split: target column and selected features from the same rows.
tr_rows = _rows_in_date_window(dataset, TRAIN_START, VAL_START)
tr_out = tr_rows['target']
tr_data = tr_rows[TO_USE_COLS]

# Validation split, built the same way on the later window.
val_rows = _rows_in_date_window(dataset, VAL_START, VAL_END)
val_out = val_rows['target']
val_data = val_rows[TO_USE_COLS]

Loading data...
100.00%  


In [7]:
# Show the names of the parameters exposed by a default DecisionTreeClassifier.
default_tree_params = DecisionTreeClassifier().get_params()
default_tree_params.keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'random_state', 'splitter'])

In [8]:
# Class balance of the validation target (number of rows per label).
val_class_counts = val_out.value_counts()
val_class_counts

False    37243
True      6237
Name: target, dtype: int64

In [9]:
# Candidate values for each hyperparameter; every combination is evaluated.
max_depths = [5, 10, 15, 20]
min_samples_splits = [2, 5, 10, 25]
min_samples_leafs = [1, 2, 4, 8, 16, 32]
criterions = ['gini', 'entropy']
class_weights = ['balanced', {0: 1, 1: 2}, {0: 1, 1: 4}, {0: 1, 1: 6}]

# For a quick smoke test, temporarily shrink the lists above, e.g.
# max_depths=[5, 10], min_samples_splits=[2, 5], min_samples_leafs=[1, 2],
# criterions=['entropy'], class_weights=[{0: 1, 1: 4}]  (8 combinations).

# Names are listed in the same order as the lists passed to product(), so each
# combination can be zipped straight into DecisionTreeClassifier keyword arguments.
param_names = ('max_depth', 'min_samples_split', 'min_samples_leaf', 'criterion', 'class_weight')
params = list(product(max_depths, min_samples_splits, min_samples_leafs, criterions, class_weights))

results = []
progress = tqdm(params, total=len(params), desc='Grid search', colour='green', smoothing=0.1)
for combo in progress:
    hyper = dict(zip(param_names, combo))

    # Fixed seed so repeated runs of the same configuration give the same tree.
    clf = DecisionTreeClassifier(random_state=42, **hyper)
    clf.fit(tr_data, tr_out)
    tr_pred = clf.predict(tr_data)
    val_pred = clf.predict(val_data)

    # One record per configuration: hyperparameters first, then the scores.
    record = dict(hyper)
    record['tr_f1_macro'] = f1_macro(tr_out, tr_pred)
    record['val_f1_macro'] = f1_macro(val_out, val_pred)
    record['tr_report'] = classification_report(tr_out, tr_pred, output_dict=True)
    record['val_report'] = classification_report(val_out, val_pred, output_dict=True)
    results.append(record)

# Persist every configuration's scores so they can be reloaded without retraining.
with open('DTree_results.json', 'w') as out_file:
    json.dump(results, out_file, indent=4)

Grid search: 100%|[32m██████████[0m| 8/8 [00:24<00:00,  3.08s/it]


In [11]:
# Reload the grid-search results from disk. The file must be opened for
# reading: mode 'w' would truncate the saved results and make json.load raise
# io.UnsupportedOperation because the handle is not readable.
with open('DTree_results.json', 'r') as f:
    results = json.load(f)

# Pick the configuration with the highest macro-F1 on the validation set.
# Note: after the JSON round trip, dict-valued class_weight entries have string
# keys ('0', '1'); convert them back to ints before reusing them in a classifier.
best = max(results, key=lambda x: x['val_f1_macro'])
best

{'max_depth': 5,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'criterion': 'entropy',
 'class_weight': {'0': 1, '1': 4},
 'tr_f1_macro': 0.6332986370679715,
 'val_f1_macro': 0.6568971047681522}