## Imports

In [None]:
# basics
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import joblib

# feature_importance
import shap

# viz
import matplotlib.pyplot as plt

# models
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)
from sklearn.model_selection import StratifiedKFold

# metrics
from sklearn.metrics import (
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
    median_absolute_error,
    make_scorer
)

# utils
import os

## Parameters

In [None]:
# --- Folders -------------------------------------------------------------
# Everything is resolved relative to the notebook location, under ../data.
path_root = os.path.join("..", "data")
path_primary = os.path.join(path_root, "03_primary")
path_model = os.path.join(path_root, "04_model")
path_encoders = os.path.join(path_model, "encoders")
path_model_final = os.path.join(path_model, "model")

# --- Files under 03_primary ----------------------------------------------
file_path_input_data = os.path.join(path_primary, "data_input.csv")
file_path_not_outliers = os.path.join(path_primary, "data_not_outliers.csv")
file_path_metrics_features_test = os.path.join(
    path_primary, "features_test_metrics.json"
)
file_path_metrics_features_selected = os.path.join(
    path_primary, "features_selected.json"
)

# --- Files under 04_model ------------------------------------------------
file_path_best_params = os.path.join(path_model, "best_params.json")
file_path_best_model = os.path.join(path_model, "model_tunned.joblib")

# Templates: "{}" is filled with the pipeline step name via str.format().
file_path_encoder_obj = os.path.join(path_encoders, "{}_encoder.joblib")
file_path_encoder_order = os.path.join(path_encoders, "encoders_orders.csv")
file_path_pycared_model = os.path.join(path_model_final, "{}.joblib")

## Read dataset

In [None]:
# Load the modelling table; the first CSV column becomes the index.
# Alternative source that can be swapped in here: file_path_not_outliers.
data_input = pd.read_csv(file_path_input_data, index_col=0)

In [None]:
# Load the list of feature names stored under the "features_selected" key.
# X is built from this list further down, so a missing file is reported
# here with the offending path instead of surfacing later as a NameError.
try:
    with open(file_path_metrics_features_selected, 'r') as json_file:
        features_selected = json.load(json_file)["features_selected"]
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Selected-features file not found: "
        f"{file_path_metrics_features_selected}. "
        "It is required to build the training matrix."
    ) from err

In [None]:
# Start from the default configuration (seed only) and replace it with the
# previously tuned parameters when a best_params.json file is available.
best_params = {"random_state": 42}
if os.path.exists(file_path_best_params):
    with open(file_path_best_params, 'r') as params_file:
        best_params = json.load(params_file)["params"]

## Train model

In [None]:
# Name of the column to predict (kept as a one-element list).
target = ["price"]
# Identifier columns that must not be used as model inputs.
cols_id_drop = ["cd_setor", "ID"]
# Identifiers followed by the target.
cols_drop = [*cols_id_drop, *target]

In [None]:
# Feature matrix: only the selected feature columns.
X = data_input.loc[:, features_selected]
# Target vector: the single target column.
y = data_input.loc[:, target[0]]

In [None]:
# 80/20 hold-out split, seeded with the same random_state as the model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=best_params["random_state"],
)

# Fit a random forest configured with the current best parameters.
rf_model = RandomForestRegressor(**best_params)
rf_model.fit(X_train, y_train)

### Metrics

In [None]:
# (y_true, y_pred) pairs for the training and the hold-out partitions.
args_train = [y_train.values, rf_model.predict(X_train)]
args_preds = [y_test.values, rf_model.predict(X_test)]

# Metric name -> scoring function. "rmse" maps to the MSE function and the
# square root is applied when the value is stored below.
basics_metrics = dict(
    r2=r2_score,
    rmse=mean_squared_error,
    mape=mean_absolute_percentage_error,
    mse=mean_squared_error,
    mae=mean_absolute_error,
    median_ae=median_absolute_error,
)

# Each report starts with the true/predicted correlation and the sample size.
metrics_train = {
    "correlation": np.corrcoef(*args_train)[0, 1],
    "size_train": len(args_train[0]),
}
metrics_pred = {
    "correlation": np.corrcoef(*args_preds)[0, 1],
    "size_test": len(args_preds[0]),
}

for name, content in basics_metrics.items():
    train_value = content(*args_train)
    test_value = content(*args_preds)
    if name == "rmse":
        train_value = np.sqrt(train_value)
        test_value = np.sqrt(test_value)
    metrics_train[name] = train_value
    metrics_pred[name] = test_value

# Print both reports, rounded to four decimals.
for label, report in (("train", metrics_train), ("test", metrics_pred)):
    print(f'-------- [ {label} ] ----------')
    for metric, result in report.items():
        print(f"{metric} : {round(result, 4)}")

In [None]:
# Build, for each partition, a table with the true and predicted values plus
# their descending ranks (0 = largest value).
data_values = {}
# zip() yields (name, values) pairs, so exactly two names are unpacked.
for name, arg in zip(['train', 'test'], [args_train, args_preds]):
    # arg is [y_true, y_pred]; transpose so each row is one observation.
    content = pd.DataFrame(arg, index=["y_true", "y_pred"]).T

    # After sorting in descending order the row position is the rank.
    content = content.sort_values("y_true", ascending=False)
    content["rank_true"] = np.arange(len(content))

    content = content.sort_values("y_pred", ascending=False)
    content["rank_pred"] = np.arange(len(content))

    # Shuffle the rows and discard the old index.
    content = content.sample(frac=1).reset_index(drop=True)
    data_values[name] = content

## Cross validation

In [None]:
# K-fold cross validation of the random forest on the full (X, y) data.
n_folds = 5

# StratifiedKFold needs discrete class labels and rejects a continuous
# regression target. The target is therefore bucketed into quantile bins that
# are used ONLY to stratify the folds; the model still trains on the raw y.
n_strata = 10
y_strata = pd.qcut(y, q=n_strata, labels=False, duplicates="drop")

stratified_kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=best_params["random_state"],
)

# Metric name -> [scoring function, per-fold results].
# "rmse" stores sqrt(MSE); every key appears exactly once so that "mae" is
# really the mean absolute error.
scores = {
    "rmse": [mean_squared_error, []],
    "r2": [r2_score, []],
    "mape": [mean_absolute_percentage_error, []],
    "mse": [mean_squared_error, []],
    "median_ae": [median_absolute_error, []],
    "mae": [mean_absolute_error, []],
}

X_values, y_values = X.values, y.values
fold_iterator = tqdm(stratified_kfold.split(X_values, y_strata), total=n_folds)
for train_index, test_index in fold_iterator:
    # Fold-local names: the hold-out split (X_train, X_test, y_train, y_test)
    # created earlier must stay intact for the cells that follow.
    X_fold_train, X_fold_test = X_values[train_index], X_values[test_index]
    y_fold_train, y_fold_test = y_values[train_index], y_values[test_index]

    # A fresh estimator per fold, configured like the hold-out model.
    cv_model = RandomForestRegressor(**best_params)
    cv_model.fit(X_fold_train, y_fold_train)

    y_pred = cv_model.predict(X_fold_test)

    args = [y_fold_test, y_pred]
    for i, content in scores.items():
        value = content[0](*args)
        if i == "rmse":
            value = np.sqrt(value)
        content[1].append(value)

# Report mean and standard deviation across folds for every metric.
for i, content in scores.items():
    text_mean = f"{i}: {round(np.mean(content[1]), 5)}"
    text_std = f"{i}_std: {round(np.std(content[1]), 5)}"
    rule = "-" * max((len(text_mean), len(text_std)))
    print(rule)
    print(text_mean)
    print(text_std)
    print(rule)

## Tuning

In [None]:
# Hyper-parameter grid explored by the exhaustive search.
parameters = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 3],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 3],
    # 1 (int) = a single feature per split; 1.0 (float) = all features.
    # 1.0 replaces the former 'auto' option, which meant "all features" for
    # RandomForestRegressor and was removed from scikit-learn in 1.3.
    'max_features': [1, 1.0],
    "random_state": [42]
}

rf = RandomForestRegressor()

# greater_is_better=False makes the scorer return the NEGATED MSE, so that
# GridSearchCV can maximise it.
scorer = make_scorer(mean_squared_error,
                     greater_is_better=False)

grid_search = GridSearchCV(
    estimator = rf,
    param_grid = parameters,
    scoring = scorer,
    cv = 5
)

# Warnings raised while fitting the candidates are silenced on purpose.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Melhores parâmetros encontrados:")
for param, value in best_params.items():
    print(f"{param}: {value}")

# best_score_ holds the negated MSE (see scorer above); flip the sign so the
# printed value is the actual mean squared error.
best_score = -grid_search.best_score_
print(f"Melhor resultado de erro quadrático médio: {best_score}")

In [None]:
# Wrap the tuned parameters under the "params" key, the layout expected when
# best_params.json is read back.
best_model_params = dict(params=best_params)

In [None]:
# Persist the tuned parameters as JSON.
# The destination folder is created first so the very first run (before any
# model artefact exists) does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(file_path_best_params), exist_ok=True)
with open(file_path_best_params, 'w') as json_file:
    json.dump(best_model_params, json_file)

In [None]:
# Serialise the tuned estimator found by the grid search.
joblib.dump(value=best_model, filename=file_path_best_model)

## Plots

In [None]:
# One panel per partition: true value on x, predicted value on y.
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

target_name = target[0]
for ax, (name, content) in zip(axes, data_values.items()):
    ax.plot(content["y_true"], content["y_pred"], '*')
    ax.grid()
    ax.set_title(name.title())
    ax.set_xlabel(f"True {target_name}")
    ax.set_ylabel(f"Predict {target_name}")

## Select best model (pycaret)

In [None]:
# Initialise the pycaret regression experiment on the input data without the
# identifier columns, predicting the target column with 5 folds.
regression_setup = setup(
    data = data_input.drop(cols_id_drop, axis = 1),
    target = target[0], 
    verbose = False, 
    fold = 5
)

# NOTE: this rebinds `best_model`, which previously held the estimator found
# by the grid search in the tuning cell.
# Presumably compare_models() returns the best-ranked candidate — confirm
# against the pycaret documentation.
best_model = compare_models()
tuned_model = tune_model(best_model)
final_model = finalize_model(tuned_model)
# Called without a `data` argument — TODO confirm which data pycaret scores
# in that case.
predictions = predict_model(final_model)

evaluate_model(final_model)
# NOTE(review): pull() presumably returns the most recent pycaret score grid,
# so the result depends on the order of the calls above — confirm.
df_metrics = pull()

In [None]:
# Plot diagnostics for the finalized pycaret model; no `plot` argument is
# passed, so pycaret's default plot type is used.
plot_model(final_model)

## Pipeline (encoders)

In [None]:
# Save every step of the finalized pipeline except the last one, and record
# the step names with their file locations in pipeline order.
if not os.path.exists(path_encoders):
    os.makedirs(path_encoders)

preprocessing_steps = final_model.steps[:-1]
encoder_rows = []
for step_name, step_obj in preprocessing_steps:
    step_file = file_path_encoder_obj.format(step_name)
    joblib.dump(step_obj, step_file)
    encoder_rows.append([step_name, step_file])

# One row per saved step: its name and the path it was written to.
encoders_order = pd.DataFrame(encoder_rows, columns=["name", "url"])
encoders_order.to_csv(file_path_encoder_order, index=False)

## Pipeline (pycaret model tuned)

In [None]:
# Save the last step of the finalized pipeline under its step name.
# The output folder is created first (as is done for the encoders folder) so
# the dump does not fail when data/04_model/model does not exist yet.
os.makedirs(path_model_final, exist_ok=True)

model_name, model_obj = final_model.steps[-1]
joblib.dump(
    model_obj,
    file_path_pycared_model.format(model_name)
)