## Imports

In [None]:
import sys
from pathlib import Path
PROJECT_DIR = Path.cwd().parent
sys.path.append(str(PROJECT_DIR))

# basics
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import joblib

# models
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    make_scorer,
    mean_squared_error
)
from sklearn.model_selection import (
    train_test_split
)

# utils
import os
from ppm.nodes.metrics_calculator import (
    metrics_calculate,
    show_results
)
from ppm.nodes.process_outputs import (
    ranking_output
)
from ppm.nodes.cross_validation import (
    cross_validation
)
from ppm.nodes.tunning_model import (
    perform_grid_search
)
from ppm.nodes.plots import (
    scatterplot_yx
)

## Parameters

In [None]:
# Root of the data tree, expressed relative to the current working directory.
path_root = os.path.join("..", "data")

# Layer directories.
path_primary = os.path.join(path_root, "03_primary")
path_model = os.path.join(path_root, "04_model")
path_encoders = os.path.join(path_model, "encoders")
path_model_final = os.path.join(path_model, "model")

# Files living in the primary layer.
file_path_input_data = os.path.join(path_primary, "data_input.csv")
file_path_not_outliers = os.path.join(path_primary, "data_not_outliers.csv")
file_path_metrics_features_test = os.path.join(path_primary, "features_test_metrics.json")
file_path_metrics_features_selected = os.path.join(path_primary, "features_selected.json")

# Artifacts of the random-forest tuning step.
file_path_best_params = os.path.join(path_model, "best_params.json")
file_path_best_model = os.path.join(path_model, "model_tunned.joblib")

# Artifacts of the pycaret pipeline. Paths containing "{}" are templates that
# are completed with str.format(<step name>) when the file is written.
file_path_encoder_obj = os.path.join(path_encoders, "{}_encoder.joblib")
file_path_encoder_order = os.path.join(path_encoders, "encoders_orders.csv")
file_path_pycared_model = os.path.join(path_model_final, "{}.joblib")

## Read dataset

In [None]:
# Load the modelling table; the first CSV column is used as the index.
# Swap in file_path_not_outliers here to use the outlier-filtered file instead.
data_input = pd.read_csv(file_path_input_data, index_col=0)

In [None]:
# Load the list of selected feature names from its JSON file.
# `features_selected` is required further down to build X, so a missing file
# is reported here with a clear message instead of surfacing later as a
# NameError on an undefined variable.
if not os.path.exists(file_path_metrics_features_selected):
    raise FileNotFoundError(
        "Selected-features file not found: "
        f"{file_path_metrics_features_selected}"
    )
with open(file_path_metrics_features_selected, 'r') as json_file:
    # The list is stored under the "features_selected" key.
    features_selected = json.load(json_file)["features_selected"]

In [None]:
# Start from a seeded default and replace it with the stored hyperparameters
# (saved under the "params" key) when the JSON file is already on disk.
best_params = {"random_state": 42}
if os.path.exists(file_path_best_params):
    with open(file_path_best_params, 'r') as params_file:
        best_params = json.load(params_file)["params"]

## Train model

In [None]:
# Name of the column to predict (kept as a one-element list).
target = ["price"]

# Identifier columns.
cols_id_drop = ["cd_setor", "ID"]

# Identifier columns followed by the target column.
cols_drop = [*cols_id_drop, *target]

In [None]:
# Feature matrix: only the columns listed in features_selected.
X = data_input.loc[:, features_selected]

# Target vector: the single column named in `target`.
y = data_input.loc[:, target[0]]

In [None]:
# Hold out 20% of the rows for testing; the split is seeded with the same
# random_state that is passed to the model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=best_params["random_state"],
)

# Build the random forest from the loaded/default hyperparameters and fit it
# on the training split.
model = RandomForestRegressor(**best_params)
model.fit(X_train, y_train)

### Metrics

In [None]:
# Each argument list is [true values, model predictions] for one split.
args_train = [y_train.values, model.predict(X_train)]
args_preds = [y_test.values, model.predict(X_test)]

# Compute the metrics for both splits first ...
metrics_train = metrics_calculate(args_train)
metrics_pred = metrics_calculate(args_preds)

# ... then pass them through show_results with the split label.
metrics_train = show_results(metrics_train, "train")
metrics_test = show_results(metrics_pred, "test")

# Merge both results into one mapping (presumably dicts, given the `|` merge
# -- confirm against show_results).
metrics_results = metrics_train | metrics_test

In [None]:
# Run ranking_output on the [y_true, y_pred] pair of each split and keep the
# result under the split name.
data_values = {
    split_name: ranking_output(split_args)
    for split_name, split_args in (("train", args_train), ("test", args_preds))
}

## Cross validation

In [None]:
# Cross-validate the fitted random forest on the full feature matrix and
# target; the helper takes a list of models, so the model is wrapped in one.
scores = cross_validation(X, y, [model])

## Tuning

In [None]:
# Hyperparameter grid explored by the grid search.
parameters = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 3],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 3],
    # 1 (int) considers a single feature per split; 1.0 (float) considers all
    # features. 1.0 replaces the former 'auto' alias, which meant the same
    # thing for regressors but was deprecated in scikit-learn 1.1 and removed
    # in 1.3, where it makes every fit in the search fail.
    'max_features': [1, 1.0],
    "random_state": [42]
}

# Unfitted estimator handed to the search.
rf = RandomForestRegressor()

# Score candidates by mean squared error; greater_is_better=False tells
# scikit-learn that lower values are better.
# make_scorer and mean_squared_error come from sklearn.metrics.
scorer = make_scorer(
    mean_squared_error,
    greater_is_better = False
)

# Search on the training split only, with 5-fold cross-validation.
best_model, best_params = perform_grid_search(
    X_train,
    y_train,
    rf,
    parameters,
    scorer,
    cv = 5
)
best_params

In [None]:
# Wrap the tuned hyperparameters under the "params" key, the layout used by
# the best-params JSON file.
best_model_params = dict(params=best_params)

In [None]:
# Persist the tuned hyperparameters as JSON, overwriting any previous file.
with open(file_path_best_params, 'w') as params_file:
    json.dump(best_model_params, params_file)

In [None]:
# Serialize the model returned by the grid search to disk.
joblib.dump(best_model, file_path_best_model)

## Plots

In [None]:
# Rebuild the [y_true, y_pred] pairs from the ranking output of each split.
args_train = [data_values["train"][key] for key in ("y_true", "y_pred")]
args_preds = [data_values["test"][key] for key in ("y_true", "y_pred")]

# Figure layout: one row, two columns, 20x8 figure size.
fig_args = dict(figsize=(20, 8), nrows=1, ncols=2)

fig_scatter_xy = scatterplot_yx(args_train, args_preds, fig_args)

## Select best model (pycaret)

In [None]:
# Configure the pycaret experiment on every column except the identifier
# columns, using 5 cross-validation folds and no setup output.
regression_setup = setup(
    data=data_input.drop(columns=cols_id_drop),
    target=target[0],
    fold=5,
    verbose=False,
)

# Selection chain: compare candidates, tune the winner, finalize it, then
# generate predictions with the finalized model.
best_model = compare_models()
tuned_model = tune_model(best_model)
final_model = finalize_model(tuned_model)
predictions = predict_model(final_model)

evaluate_model(final_model)

# NOTE(review): pull() is kept as the last call; it presumably returns the
# most recent score grid pycaret displayed -- confirm which table that is.
df_metrics = pull()

In [None]:
# Draw pycaret's plot for the finalized model; no `plot` argument is passed,
# so the library's default plot type is used.
plot_model(final_model)

## Pipeline (encoders)

In [None]:
# Persist every step of the finalized pipeline except the last one (the last
# step is saved separately as the model), then write an index CSV that
# records the steps in pipeline order together with their file paths.

# exist_ok=True creates the directory when needed without a separate
# check-then-create step.
os.makedirs(path_encoders, exist_ok=True)

encoders_order = []
for encoder_name, encoder_obj in final_model.steps[:-1]:
    # Fill the "{}" placeholder once and reuse the path for both the dump
    # and the index row.
    encoder_path = file_path_encoder_obj.format(encoder_name)
    joblib.dump(encoder_obj, encoder_path)
    encoders_order.append([encoder_name, encoder_path])

# Columns: step name and the path of its serialized object.
encoders_order = pd.DataFrame(encoders_order, columns=["name", "url"])
encoders_order.to_csv(file_path_encoder_order, index=False)

## Pipeline (tuned pycaret model)

In [None]:
# Persist the last step of the finalized pipeline, using the step name as
# the file name.

# The target directory is not created anywhere else in this notebook, so
# make sure it exists before writing into it.
os.makedirs(path_model_final, exist_ok=True)

# The last pipeline step is a (name, object) pair.
model_name, model_obj = final_model.steps[-1]
joblib.dump(model_obj, file_path_pycared_model.format(model_name))