In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

import metrics
import evaluation

In [2]:
data = pd.read_csv('./data_preprocess/tran.csv')
features = np.loadtxt('./out/features_forward.txt', dtype='str')
y = data['Positivo']
data = data[features]

In [3]:
model_params = {
    'n_estimators': (100, 250, 500, 1000),
    'max_depth': (3, 5, 7, 9, 11),
    'max_features': (2, 4, 5)
}

params_values = [*model_params.values()]

In [13]:
def make_combinations(lists, combinations=None):
    if combinations is None:
        combinations = [[li,] for li in lists.pop(0)]
    else:
        list_i = lists.pop(0)
        new_combinations = []
        for li in list_i:
            for combination in combinations:
                new_combinations.append([ *combination, li])
                
        combinations = new_combinations
        
    if lists:
        combinations = make_combinations(lists, combinations)
    return combinations

combinations = make_combinations(params_values.copy())

options_parms = [dict(zip(model_params.keys(), comb)) for comb in combinations]

In [22]:
best_params = None
best_score = None

for params in options_parms:
    model = RandomForestClassifier(**params)
    ytrue, ypred = evaluation.predict_model_loo(model, data, y)
    score = metrics.balanced_accuracy(ytrue, ypred)
    
    if best_params is None:
        best_params = params
        best_score = score
    elif score > best_score:
        best_params = params
        best_score = score

110it [00:09, 11.46it/s]
110it [00:23,  4.65it/s]
110it [00:46,  2.34it/s]
110it [01:33,  1.18it/s]
110it [00:09, 11.41it/s]
110it [00:23,  4.61it/s]
110it [00:47,  2.33it/s]
110it [01:34,  1.17it/s]
110it [00:09, 11.38it/s]
110it [00:23,  4.61it/s]
110it [00:47,  2.31it/s]
110it [01:34,  1.17it/s]
110it [00:09, 11.32it/s]
110it [00:23,  4.60it/s]
110it [00:47,  2.32it/s]
110it [01:34,  1.16it/s]
110it [00:09, 11.35it/s]
110it [00:23,  4.60it/s]
110it [00:47,  2.32it/s]
110it [01:34,  1.17it/s]
110it [00:09, 11.18it/s]
110it [00:24,  4.49it/s]
110it [00:48,  2.27it/s]
110it [01:36,  1.14it/s]
110it [00:10, 10.99it/s]
110it [00:24,  4.45it/s]
110it [00:49,  2.24it/s]
110it [01:37,  1.12it/s]
110it [00:10, 10.95it/s]
110it [00:24,  4.46it/s]
110it [00:49,  2.24it/s]
110it [01:37,  1.13it/s]
110it [00:09, 11.02it/s]
110it [00:24,  4.46it/s]
110it [00:49,  2.23it/s]
110it [01:38,  1.12it/s]
110it [00:10, 10.98it/s]
110it [00:24,  4.46it/s]
110it [00:49,  2.24it/s]
110it [01:37,  1.12it/s]


In [23]:
best_params

{'n_estimators': 1000, 'max_depth': 5, 'max_features': 2}

In [24]:

model = RandomForestClassifier(**best_params)
ytrue, ypred = evaluation.predict_model_loo(model, data, y, proba=True)
score = metrics.balanced_accuracy(ytrue, ypred)

score

110it [01:35,  1.16it/s]


0.928235294117647

In [25]:
metrics.confusion_matrix(ytrue, ypred)

array([[83.,  3.],
       [ 2., 22.]])