# Rodar algortimos dde Machine Learning

In [1]:
import sys
import os
import pandas as pd

# Caminho absoluto para a pasta do projeto (nível acima de 'src')
project_root = os.path.abspath("..")

if project_root not in sys.path:
    sys.path.append(project_root)

# Importa Classes de objetos
from src.etl.extract import extract_csv_processed
from src.models.pipeline_classification import pipeline_classification
from src.models.pipeline_regression import pipeline_regression
from src.models.classification.train_classification_model import train_model

# Diretorios
from src.config import DATA_PROCESSED

#### Extrair dados para o uso do modelo

In [2]:
input_path = 'arquivos_exploratorio.csv'
df = extract_csv_processed(input_path)
df

Unnamed: 0,acidez_fixa,acidez_volatil,acido_citrico,acucar_residual,cloretos,dioxido_enxofre_livre,dioxido_enxofre_total,densidade,ph,sulfatos,alcool,qualidade
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


#### Prepação para uso do pipeline de classificação e regressão

In [3]:
bins =[2.5, 5.5, 6.5, 8.5]
labels = [0, 1, 2]

In [4]:
# catagorizando as coluna qualidade para 3 categoria 0 = Qualidade Baixa, 1 = Qualidade média, 2 = Qualidade Alta
df['target'] = pd.cut(
    df['qualidade'],
    bins=bins,
    labels= labels,
    include_lowest=True,
    right=True
)
df.drop('qualidade', axis=1, inplace=True)
df.sort_values('target')

Unnamed: 0,acidez_fixa,acidez_volatil,acido_citrico,acucar_residual,cloretos,dioxido_enxofre_livre,dioxido_enxofre_total,densidade,ph,sulfatos,alcool,target
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1261,6.3,1.020,0.00,2.0,0.083,17.0,24.0,0.99437,3.59,0.55,11.2,0
1260,8.6,0.635,0.68,1.8,0.403,19.0,56.0,0.99632,3.02,1.15,9.3,0
610,8.8,0.240,0.54,2.5,0.083,25.0,57.0,0.99830,3.39,0.54,9.2,0
611,13.2,0.380,0.55,2.7,0.081,5.0,16.0,1.00060,2.98,0.54,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
903,6.8,0.590,0.06,6.0,0.060,11.0,18.0,0.99620,3.41,0.59,10.8,2
904,6.8,0.590,0.06,6.0,0.060,11.0,18.0,0.99620,3.41,0.59,10.8,2
1177,7.1,0.660,0.00,2.4,0.052,6.0,11.0,0.99318,3.35,0.66,12.7,2
898,8.3,0.310,0.39,2.4,0.078,17.0,43.0,0.99444,3.31,0.77,12.5,2


#### **Modelo de Classificação**

In [5]:
# Base Line de Classificação
results_c = pipeline_classification(
    data_path = df,
    target_column = 'target',
    model_name = 'tree_classifier',
    custom_params =None,
    scale_type='standard',
    test_size=0.2,
    return_data=True,
    avarage='weighted'
)

Iniciando pipeline de classificação com modelo: tree_classifier
Modelo tree_classifier criado com sucesso!

Métricas:
  Accuracy: 0.6094
  Precision: 0.6108
  Recall: 0.6094
  F1-score: 0.6100
  Confusion Matrix:
    [93, 41, 7]
    [39, 76, 17]
    [6, 15, 26]
✅ Modelo salvo em: ../models_storage\tree_classifier_model.pkl
✅ Modelo salvo em: ../models_storage\tree_classifier_scaler.pkl


#### **Modelo de Regressão**

In [None]:
# Base Line de regressão
results_r = pipeline_regression(
    data_path=input_path,
    target_column='qualidade',
    custom_params=None,
    scale_type='standard',
    test_size=0.2,
    return_data=True
)

Iniciando pipeline de regressão com modelo: linear_regression
Modelo linear_regression criado com sucesso!
Métricas:
  MAE: 0.5035
  MSE: 0.3900
  R2: 0.4032
  RMSE: 0.6245
  MAPE: 8.9911
✅ Modelo salvo em: models_storage\linear_regression_model.pkl
✅ Modelo salvo em: models_storage\linear_regression_scaler.pkl


In [6]:
from sklearn.tree import  DecisionTreeClassifier
model = DecisionTreeClassifier()
res = train_model(model= model, X=df.drop('target', axis=1), y=df['target'], return_data=True)

In [8]:
modelo, X_train, X_test, y_train, y_test = res

In [9]:
X_train

Unnamed: 0,acidez_fixa,acidez_volatil,acido_citrico,acucar_residual,cloretos,dioxido_enxofre_livre,dioxido_enxofre_total,densidade,ph,sulfatos,alcool
493,8.7,0.690,0.31,3.0,0.086,23.0,81.0,1.00020,3.48,0.74,11.6
354,6.1,0.210,0.40,1.4,0.066,40.5,165.0,0.99120,3.25,0.59,11.9
342,10.9,0.390,0.47,1.8,0.118,6.0,14.0,0.99820,3.30,0.75,9.8
834,8.8,0.685,0.26,1.6,0.088,16.0,23.0,0.99694,3.32,0.47,9.4
705,8.4,1.035,0.15,6.0,0.073,11.0,54.0,0.99900,3.37,0.49,9.9
...,...,...,...,...,...,...,...,...,...,...,...
1130,9.1,0.600,0.00,1.9,0.058,5.0,10.0,0.99770,3.18,0.63,10.4
1294,8.2,0.635,0.10,2.1,0.073,25.0,60.0,0.99638,3.29,0.75,10.9
860,7.2,0.620,0.06,2.7,0.077,15.0,85.0,0.99746,3.51,0.54,9.5
1459,7.9,0.200,0.35,1.7,0.054,7.0,15.0,0.99458,3.32,0.80,11.9


In [11]:
y_train

493     1
354     1
342     1
834     0
705     0
       ..
1130    1
1294    1
860     0
1459    2
1126    1
Name: target, Length: 1279, dtype: category
Categories (3, int64): [0 < 1 < 2]

In [10]:
X_test

Unnamed: 0,acidez_fixa,acidez_volatil,acido_citrico,acucar_residual,cloretos,dioxido_enxofre_livre,dioxido_enxofre_total,densidade,ph,sulfatos,alcool
803,7.7,0.560,0.08,2.50,0.114,14.0,46.0,0.99710,3.24,0.66,9.6
124,7.8,0.500,0.17,1.60,0.082,21.0,102.0,0.99600,3.39,0.48,9.5
350,10.7,0.670,0.22,2.70,0.107,17.0,34.0,1.00040,3.28,0.98,9.9
682,8.5,0.460,0.31,2.25,0.078,32.0,58.0,0.99800,3.33,0.54,9.8
1326,6.7,0.460,0.24,1.70,0.077,18.0,34.0,0.99480,3.39,0.60,10.6
...,...,...,...,...,...,...,...,...,...,...,...
1259,6.8,0.640,0.00,2.70,0.123,15.0,33.0,0.99538,3.44,0.63,11.3
1295,6.6,0.630,0.00,4.30,0.093,51.0,77.5,0.99558,3.20,0.45,9.5
1155,8.3,0.600,0.25,2.20,0.118,9.0,38.0,0.99616,3.15,0.53,9.8
963,8.8,0.270,0.39,2.00,0.100,20.0,27.0,0.99546,3.15,0.69,11.2


In [12]:
y_test

803     1
124     0
350     1
682     0
1326    1
       ..
1259    1
1295    0
1155    0
963     1
704     0
Name: target, Length: 320, dtype: category
Categories (3, int64): [0 < 1 < 2]

In [13]:
modelo

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0
