## Imports

In [1]:
import sys
from pathlib import Path
PROJECT_DIR = Path.cwd().parent
sys.path.append(str(PROJECT_DIR))

# basics
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import joblib

# feature_importance
import shap

# viz
import matplotlib.pyplot as plt

# models
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)
from sklearn.model_selection import StratifiedKFold

# metrics
from sklearn.metrics import (
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
    median_absolute_error,
    make_scorer
)

# utils
import os
from ppm.nodes.metrics_calculator import (
    metrics_calculate,
    show_results
)
from ppm.nodes.process_outputs import (
    ranking_output
)
from ppm.nodes.cross_validation import (
    cross_validation
)
from ppm.nodes.tunning_model import (
    perform_grid_search
)
from ppm.nodes.plots import (
    scatterplot_yx
)

  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

## Parameters

In [2]:
# Directory layout: everything lives under ../data, relative to this notebook.
path_root = os.path.join("..", "data")
path_primary = os.path.join(path_root, "03_primary")
path_model = os.path.join(path_root, "04_model")
path_encoders = os.path.join(path_model, "encoders")
path_model_final = os.path.join(path_model, "model")

# Files located in the primary layer.
file_path_input_data = os.path.join(path_primary, "data_input.csv")
file_path_not_outliers = os.path.join(path_primary, "data_not_outliers.csv")
file_path_metrics_features_test = os.path.join(path_primary, "features_test_metrics.json")
file_path_metrics_features_selected = os.path.join(path_primary, "features_selected.json")

# Files located in the model layer.
file_path_best_params = os.path.join(path_model, "best_params.json")
file_path_best_model = os.path.join(path_model, "model_tunned.joblib")

# Templates: the "{}" placeholder is filled with a pipeline step name via
# str.format() right before the file is written.
file_path_encoder_obj = os.path.join(path_encoders, "{}_encoder.joblib")
file_path_encoder_order = os.path.join(path_encoders, "encoders_orders.csv")
file_path_pycared_model = os.path.join(path_model_final, "{}.joblib")

## Read dataset

In [3]:
# Load the modelling table; the first CSV column becomes the index.
# Alternative source kept from the original cell: file_path_not_outliers.
data_input = pd.read_csv(file_path_input_data, index_col=0)

In [4]:
# Load the list of feature names that is used to build X further down.
# Fail fast with an explicit message when the file is missing: silently
# skipping the load leaves `features_selected` undefined on a fresh kernel,
# which only surfaces later as an unrelated-looking NameError.
if not os.path.exists(file_path_metrics_features_selected):
    raise FileNotFoundError(
        f"Selected-features file not found: {file_path_metrics_features_selected}"
    )

with open(file_path_metrics_features_selected, 'r') as json_file:
    # The JSON document stores the list under the "features_selected" key.
    features_selected = json.load(json_file)["features_selected"]

In [5]:
# Start from the fallback configuration (only a fixed seed) and replace it
# with the persisted hyper-parameters when a previous tuning run saved them.
best_params = {"random_state": 42}
if os.path.exists(file_path_best_params):
    with open(file_path_best_params, 'r') as json_file:
        # The file stores the parameters under the "params" key.
        best_params = json.load(json_file)["params"]

## Train model

In [6]:
# Column roles: the regression target and the identifier columns.
target = ["price"]
cols_id_drop = ["cd_setor", "ID"]

# Identifiers first, then the target.
cols_drop = [*cols_id_drop, *target]

In [7]:
# Feature matrix: only the columns listed in `features_selected`.
X = data_input.loc[:, features_selected]

# Target vector: the single column named by the first entry of `target`.
y = data_input.loc[:, target[0]]

In [8]:
# 80/20 hold-out split, seeded with the same random_state the forest uses.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=best_params["random_state"],
)

# fit() returns the estimator itself, so the fitted forest is bound directly.
model = RandomForestRegressor(**best_params).fit(X_train, y_train)
model

### Metrics

In [9]:
# Pair the observed target with the model predictions for each split:
# [y_true, y_pred].
args_train = [y_train.values, model.predict(X_train)]
args_preds = [y_test.values, model.predict(X_test)]

metrics_train = metrics_calculate(args_train)
metrics_pred = metrics_calculate(args_preds)

# Report train first, then test.
for split_label, split_metrics in (("train", metrics_train), ("test", metrics_pred)):
    show_results(split_metrics, split_label)

-------- [ train ] ----------
r2 : 0.909
mape : 0.1057
rmse : 58700.0641
mse : 3445697529.233
mae : 37380.8003
median_ae : 23620.5629
correlation : 0.9559
size : 697
-------- [ test ] ----------
r2 : 0.7777
mape : 0.1915
rmse : 82603.9244
mse : 6823408319.1853
mae : 60570.3053
median_ae : 49543.8062
correlation : 0.8842
size : 175


In [10]:
# Run ranking_output on the [y_true, y_pred] pair of each split and keep the
# result under the split name.
data_values = {}
for split_name, split_args in {"train": args_train, "test": args_preds}.items():
    data_values[split_name] = ranking_output(split_args)

## Cross validation

In [None]:
# Hand the full feature matrix and target to the project's cross-validation
# helper, together with the list of estimators to evaluate (just the forest).
models_to_validate = [model]
scores = cross_validation(X, y, models_to_validate)

## Tuning

In [None]:
# Hyper-parameter grid explored for the random forest.
# max_features: the integer 1 means "one feature per split", the float 1.0
# means "all features". 1.0 replaces the legacy 'auto' alias, which meant the
# same thing for RandomForestRegressor but was deprecated in scikit-learn 1.1
# and removed in 1.3 (where it raises an invalid-parameter error).
parameters = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 3],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [1, 1.0],
    "random_state": [42]
}

rf = RandomForestRegressor()

# MSE is a loss, so it is flagged as "lower is better" for the search.
scorer = make_scorer(
    mean_squared_error,
    greater_is_better = False
)

# Search on the training split only, with 5-fold cross-validation.
best_model, best_params = perform_grid_search(
    X_train,
    y_train,
    rf,
    parameters,
    scorer,
    cv = 5
)
best_params

In [None]:
# Wrap the tuned parameters under the "params" key, the layout in which they
# are written to and read back from the best-params JSON file.
best_model_params = dict(params=best_params)

In [None]:
# Persist the tuned hyper-parameters as JSON.
# Create the destination folder first: on a first run (no best-params file
# yet) the model directory may not exist and open(..., 'w') would fail.
os.makedirs(os.path.dirname(file_path_best_params) or ".", exist_ok=True)

with open(file_path_best_params, 'w') as json_file:
    json.dump(best_model_params, json_file)

In [None]:
# Persist the estimator returned by the grid search.
# Create the destination folder first so the dump does not fail on a fresh
# checkout where the model directory is missing.
os.makedirs(os.path.dirname(file_path_best_model) or ".", exist_ok=True)

joblib.dump(
    best_model,
    file_path_best_model
)

## Plots

In [None]:
# Rebuild the [y_true, y_pred] pairs from the ranking_output results of each
# split.
train_values, test_values = data_values["train"], data_values["test"]
args_train = [train_values["y_true"], train_values["y_pred"]]
args_preds = [test_values["y_true"], test_values["y_pred"]]

# Figure layout: one row, two columns.
fig_args = dict(figsize=(20, 8), nrows=1, ncols=2)

fig_scatter_xy = scatterplot_yx(args_train, args_preds, fig_args)

## Select best model (pycaret)

In [None]:
# Initialise the PyCaret experiment on every column except the identifiers.
# session_id seeds the experiment so the split, model comparison and tuning
# are reproducible across kernel restarts (42 matches the seed used for the
# random forest above).
regression_setup = setup(
    data = data_input.drop(cols_id_drop, axis = 1),
    target = target[0],
    verbose = False,
    fold = 5,
    session_id = 42
)

# Use a dedicated name here: `best_model` already holds the grid-search
# estimator from the tuning section and must not be clobbered, otherwise
# re-running the "save tuned model" cell would persist the wrong object.
best_pycaret_model = compare_models()
tuned_model = tune_model(best_pycaret_model)

# Score the hold-out split BEFORE finalizing. finalize_model() refits on the
# whole dataset (hold-out included), so hold-out predictions taken afterwards
# are optimistic and misleading.
predictions = predict_model(tuned_model)
# pull() returns the most recently displayed score grid (the hold-out metrics).
df_metrics = pull()

# Refit on all available data; this is the pipeline persisted further down.
final_model = finalize_model(tuned_model)

evaluate_model(final_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,67774.194,9374241830.1535,96462.7796,0.7249,0.2514,0.2046,10.086
gbr,Gradient Boosting Regressor,68426.6461,9632582114.3378,97753.6715,0.7177,0.2541,0.2039,4.62
lightgbm,Light Gradient Boosting Machine,70161.9852,10182913180.9742,100320.3286,0.7023,0.2622,0.2093,2.012
et,Extra Trees Regressor,69359.8814,10384674741.6827,100974.3005,0.6972,0.264,0.2098,8.592
ada,AdaBoost Regressor,87648.5569,12800065979.9492,112997.8183,0.6217,0.3244,0.3058,2.554
dt,Decision Tree Regressor,84760.4505,15406854405.7728,123711.8491,0.5475,0.3245,0.2564,0.526
en,Elastic Net,88459.5812,15761700659.2,125029.6969,0.5359,0.4248,0.2974,0.92
knn,K Neighbors Regressor,95403.6312,17132496691.2,130841.3312,0.4948,0.3414,0.2916,0.332
llar,Lasso Least Angle Regression,107999.6109,20670757273.6,143647.1781,0.3949,0.3853,0.3309,0.39
br,Bayesian Ridge,108978.7484,20795758182.4,144017.6375,0.3933,0.3748,0.3383,0.71


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Draw PyCaret's default plot (no `plot` argument is passed) for the
# finalized pipeline.
plot_model(final_model)

## Pipeline (encoders)

In [None]:
# Persist every step of the finalized pipeline except the last one (the last
# step is saved separately in the next section), and record the order in
# which the steps appear so they can be re-applied in sequence.
os.makedirs(path_encoders, exist_ok=True)  # no-op when the folder exists

encoder_records = []
for encoder_name, encoder_obj in final_model.steps[:-1]:
    # Format the destination once and reuse it for the dump and the index.
    encoder_path = file_path_encoder_obj.format(encoder_name)
    joblib.dump(encoder_obj, encoder_path)
    encoder_records.append([encoder_name, encoder_path])

# One row per step, in pipeline order: step name and the file it was saved to.
encoders_order = pd.DataFrame(encoder_records, columns = ["name", "url"])
encoders_order.to_csv(file_path_encoder_order, index = False)

## Pipeline (pycaret model tuned)

In [None]:
# Persist the last step of the finalized pipeline (the estimator itself).
# The destination folder is created first: nothing else in this notebook
# creates it, so the dump would fail on a fresh checkout.
os.makedirs(path_model_final, exist_ok=True)

# Each pipeline step is a (name, object) pair; the name fills the "{}" in the
# file-name template.
model_name, model_obj = final_model.steps[-1]
joblib.dump(
    model_obj,
    file_path_pycared_model.format(model_name)
)