# Rodar algoritmos de Machine Learning

In [1]:
import sys
import os
import pandas as pd

# Absolute path of the parent of the current working directory.
# NOTE(review): assumes the notebook is run from a folder directly below the
# project root, so that the `src` package becomes importable — confirm.
project_root = os.path.abspath(os.pardir)

# Register the path only once so re-running the cell does not add duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

# Importa Classes de objetos
from src.etl.extract import extract_csv_processed
from src.models.pipeline_classification import pipeline_classification
from src.models.pipeline_regression import pipeline_regression
from src.models.evaluation.cross_validation import avaliar_cross_validation, avaliar_modelo, executar_random_search
from src.models.classification.param_distributions import get_classification_param_distributions
from src.visualization.plot_roc import plot_roc_curve

#### Extrair dados para o uso do modelo

In [None]:
# Load the processed CSV through the project's ETL helper.
input_path = 'arquivo_modelo.csv'
df = extract_csv_processed(input_path)

# Preview only: the sorted result is displayed, it is not assigned back to `df`.
df.sort_values(by='col')

#### Preparação para uso do pipeline de classificação e regressão

In [None]:
# Example binning schemes: n bin edges define n - 1 intervals, one label each.

# Coarse scheme: four edges -> three classes (0, 1, 2).
bins = [2.5, 5.5, 6.5, 8.5]
labels = list(range(len(bins) - 1))

# Fine scheme: seven unit-spaced edges from 2.5 to 8.5 -> six classes (0..5).
bins_1 = [2.5 + step for step in range(7)]
labels_1 = list(range(len(bins_1) - 1))

In [None]:
# Bin the raw 'col' values into the six classes (0-5) defined by bins_1 / labels_1.
# NOTE(review): the previous comment described a three-class scheme
# (0 = low, 1 = medium, 2 = high quality), which matches `bins` / `labels`
# rather than `bins_1` / `labels_1` — confirm which scheme is intended.
binned_target = pd.cut(
    df['col'],
    bins=bins_1,
    labels=labels_1,
    right=True,           # intervals are closed on the right edge
    include_lowest=True,  # the first interval also includes its left edge
)
# Values outside the outermost bin edges come back from pd.cut as NaN.
df['target'] = binned_target

# Remove the raw column now that its binned version is stored in 'target'.
df.drop(columns=['col'], inplace=True)

# Preview only: the sorted result is displayed, `df` itself is not reordered.
df.sort_values(by='target')

### **Modelo de Classificação**

In [None]:
# Classification baseline: 'tree_classifier' with scale_type='standard'.
# NOTE(review): 'avarage' is the keyword spelling used by this call; it is kept
# as-is because the pipeline's signature is not visible here.
tree_clf_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='tree_classifier',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
    avarage='weighted',
)
results_tree = pipeline_classification(**tree_clf_kwargs)

In [None]:
# Logistic regression run with scale_type='standard' and default model parameters
# (custom_params=None).
logreg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='logistic_regression',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
    avarage='weighted',
)
results_lr = pipeline_classification(**logreg_kwargs)

In [None]:
# XGBoost classification run; no scaling is requested (scale_type=None).
xgb_clf_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='xgboost',
    custom_params=None,
    scale_type=None,
    test_size=0.2,
    return_data=True,
    avarage='weighted',
)
results_xgb = pipeline_classification(**xgb_clf_kwargs)

In [None]:
# LightGBM classification run; no scaling is requested (scale_type=None).
lgbm_clf_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='lightgbm',
    custom_params=None,
    scale_type=None,
    test_size=0.2,
    return_data=True,
    avarage='weighted',
)
results_lgm = pipeline_classification(**lgbm_clf_kwargs)

In [None]:
# Random forest classification run; no scaling is requested (scale_type=None).
# The result of this run is the one unpacked later in the validation section.
rf_clf_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='random_forest',
    custom_params=None,
    scale_type=None,
    test_size=0.2,
    return_data=True,
    avarage='weighted',
)
results_rf = pipeline_classification(**rf_clf_kwargs)

In [None]:
# CatBoost classification run; no scaling is requested (scale_type=None).
catboost_clf_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='catboost',
    custom_params=None,
    scale_type=None,
    test_size=0.2,
    return_data=True,
    avarage='weighted',
)
results_cat = pipeline_classification(**catboost_clf_kwargs)

In [None]:
# SVM classification run with scale_type='standard'.
svm_clf_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='svm_classifier',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
    avarage='weighted',
)
results_svc = pipeline_classification(**svm_clf_kwargs)

# Validações do modelo de Classificação

In [91]:
# Unpack the random-forest pipeline result: the 'train_model' entry holds the
# model followed by the train/test splits of features and target.
rf_bundle = results_rf['train_model']
model_rf, X_train, X_test, y_train, y_test = rf_bundle

In [92]:
# Look up the hyperparameter search space for the chosen model name.
# NOTE(review): presumably a mapping of parameter names to distributions/lists
# suitable for a randomized search — confirm in param_distributions.
model_name = 'random_forest'
param_grid_rfc = get_classification_param_distributions(model_name)

In [93]:
# Run the project's random-search helper on the training split with cv=10.
# The recorded output of this cell reports 10 folds x 10 candidates = 100 fits.
busca = executar_random_search(model_rf, param_grid_rfc, X_train, y_train, cv=10)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
# Cross-validate the random-search result `busca` on the training split (cv=5).
# NOTE(review): if `busca` is an unfitted-on-clone search object this re-runs the
# search inside every fold (nested CV) — confirm that cost is intended.
avaliar_cross_validation(busca, X_train, y_train, cv=5)

In [None]:
# Evaluate the random-search result `busca` on the held-out test split.
avaliar_modelo(busca, X_test, y_test)

In [None]:
# Evaluate model_rf (the model returned by the pipeline) on the same test split,
# for comparison with the random-search result evaluated above.
avaliar_modelo(model_rf, X_test, y_test)

In [None]:
# Score model_rf on the training split first, then on the test split; a large
# gap between the two numbers is a sign of overfitting.
train_score, test_score = (
    model_rf.score(X_part, y_part)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)
print(f'Treino: {train_score:.4f} | Teste: {test_score:.4f}')

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np

# Learning curve for model_rf over the whole dataset: the target is the
# 'target' column and the features are every other column of `df`.
y = df['target']
X = df.drop('target', axis=1)
train_sizes, train_scores, test_scores = learning_curve(model_rf, X, y, cv=5)

# learning_curve returns one row of scores per training size and one column
# per CV fold, so averaging over axis 1 gives one point per training size.
for fold_scores, curve_label in ((train_scores, 'Treino'), (test_scores, 'Validação')):
    plt.plot(train_sizes, np.mean(fold_scores, axis=1), label=curve_label)
plt.legend()
plt.show()


### **Modelo de Regressão**

In [None]:
# Regression baseline: 'linear_regression' with scale_type='standard'.
# NOTE(review): if the cells are run in order, df['target'] here holds the class
# labels produced by pd.cut (the raw 'col' column was dropped), so this fits a
# regressor on the binned label — confirm that this is intended.
linreg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='linear_regression',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
)
results_lrr = pipeline_regression(**linreg_kwargs)

In [None]:
# Random forest regression run with scale_type='standard'.
rf_reg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='random_forest',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
)
results_rfr = pipeline_regression(**rf_reg_kwargs)

In [None]:
# XGBoost regression run with scale_type='standard'.
xgb_reg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='xgboost',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
)
results_xgbr = pipeline_regression(**xgb_reg_kwargs)

In [None]:
# LightGBM regression run with scale_type='standard'.
lgbm_reg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='lightgbm',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
)
results_lgmr = pipeline_regression(**lgbm_reg_kwargs)

In [None]:
# CatBoost regression run with scale_type='standard'.
catboost_reg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='catboost',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
)
results_catr = pipeline_regression(**catboost_reg_kwargs)

In [None]:
# Gradient boosting regression run with scale_type='standard'.
gboost_reg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='gradient_boosting',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
)
results_gbgr = pipeline_regression(**gboost_reg_kwargs)

In [None]:
# Extra-trees regression run with scale_type='standard'.
extra_trees_reg_kwargs = dict(
    data_path=df,
    target_column='target',
    model_name='extra_trees',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
)
results_etr = pipeline_regression(**extra_trees_reg_kwargs)