In [1]:
import warnings
warnings.filterwarnings("ignore")

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.io import arff
from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn import model_selection

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, recall_score, roc_curve, auc


RANDOM = 51

Import dos dados


In [31]:
# Conjunto de treino
X_train = pd.read_csv('../data/processed/X_train.csv')
Y_train = pd.read_csv('../data/processed/Y_train.csv')

Y_train['class'] = Y_train['class'].apply(lambda val: 1 if val == 'UP' else 0)

# Conjunto de validação
X_val = pd.read_csv('../data/processed/X_val.csv')
Y_val = pd.read_csv('../data/processed/Y_val.csv')


Y_val['class'] = Y_val['class'].apply(lambda val: 1 if val == 'UP' else 0)

# Conjunto de teste
X_test = pd.read_csv('../data/processed/X_test.csv')
Y_test = pd.read_csv('../data/processed/Y_test.csv')

Y_test['class'] = Y_test['class'].apply(lambda val: 1 if val == 'UP' else 0)


In [30]:
Y_train

Unnamed: 0,class
0,UP
1,UP
2,UP
3,DOWN
4,DOWN
...,...
19516,UP
19517,DOWN
19518,DOWN
19519,DOWN


# Busca de Hiperparâmetros

- Randomized Grid Search
  - 5 K-fold
  - Devemos escolher o modelo conforme: 
    - recall ou F1-score
  
Utilizar n_jobs=-1

In [38]:
def search_params(model, params: dict) -> dict:

    dict_results = {}

    #numero de folds atraves do metodo k-fold
    kfold = model_selection.StratifiedKFold(n_splits=5,random_state=RANDOM, shuffle=True) # Verificar se fazer o shuffle faz sentido, acho que nao

    search_model = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=5,
        cv=3,
        random_state=51,
        n_jobs=-1
    )

    #treinamento do Random Forest em dez folds
    results = model_selection.cross_val_score(search_model, X_train, Y_train, cv=kfold)
    
    model_rf = search_model.fit(X_train,Y_train)

    # Métricas de Treinamento
    Y_pred_train = model_rf.predict(X_train)
    
    train_F1_score = f1_score(Y_train, Y_pred_train)   
    train_recall = recall_score(Y_train, Y_pred_train)


    fpr, tpr, _ = roc_curve(Y_train, Y_pred_train)
    train_roc_auc = auc(fpr, tpr)

    # Métricas de Teste

    dict_results['train'] = [train_F1_score, train_recall, train_roc_auc]

    Y_pred_test = model_rf.predict(X_test)
    
    test_F1_score = f1_score(Y_test, Y_pred_test)   
    test_recall = recall_score(Y_test, Y_pred_test)


    fpr, tpr, _ = roc_curve(Y_test, Y_pred_test)
    test_roc_auc = auc(fpr, tpr)

    dict_results['test'] = [test_F1_score, test_recall, test_roc_auc]


    print("Random Forest folds:", results, "\nMedia treinamento: " ,results.mean())
    print("Treinamento: ", model_rf.score(X_train,Y_train))
    print("Teste: ", model_rf.score(X_test,Y_test))

    print(dict_results)

    return dict_results
    





In [39]:
n_runs = 12  # você pode alterar conforme necessário
best_scores_dt = []
best_params_dt = []

param_distributions_dt = {
    'max_depth': [1, 2, 5, 10, 50, 100],
    'min_samples_split': [2, 3, 5, 7, 10, 20, 50, 100], # minimum number of samples required to split an internal node:
    'min_samples_leaf': [1, 2, 3, 5, 7, 20, 50, 100]    #  minimum number of samples required to be at a leaf node
}


print(search_params(DecisionTreeClassifier(random_state=42), param_distributions_dt))

Random Forest folds: [0.85352113 0.86116803 0.84964139 0.84349385 0.84938525] 
Media treinamento:  0.8514419302701455
Treinamento:  0.9833000358588188
Teste:  0.8546845124282982
{'train': [0.9798541589420343, 0.9719259531690573, np.float64(0.9816951131561583)], 'test': [0.8222222222222222, 0.8236672524897481, np.float64(0.8498635010934813)]}
{'train': [0.9798541589420343, 0.9719259531690573, np.float64(0.9816951131561583)], 'test': [0.8222222222222222, 0.8236672524897481, np.float64(0.8498635010934813)]}
