## Imports

In [None]:
import os
import sys
from pathlib import Path
PROJECT_DIR = Path.cwd().parent
sys.path.append(str(PROJECT_DIR))

# basics
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import joblib

# feature_importance
import shap

# viz
import matplotlib.pyplot as plt

# models
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)
from sklearn.model_selection import StratifiedKFold

# metrics
from sklearn.metrics import (
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
    median_absolute_error,
    make_scorer
)

# utils
from ppm.nodes.metrics_calculator import (
    metrics_calculate,
    show_results
)
from ppm.nodes.cross_validation import (
    cross_validation
)
from ppm.nodes.process_outputs import (
    ranking_output
)
from ppm.nodes.feature_explainer import (
    tree_explainer,
    shap_explainer
)
from ppm.nodes.plots import (
    plot_feature_importance,
    plot_predictions,
    plot_true_vs_pred
)

## Parameters

In [None]:
# Root of the data tree, relative to the notebook's working directory.
path_root = os.path.join("..", "data")

# Layer directories below the data root.
path_primary = os.path.join(path_root, "03_primary")
path_model = os.path.join(path_root, "04_model")

# Sub-directories of the model layer: serialized encoders and model artifacts.
path_encoders = os.path.join(path_model, "encoders")
path_model_final = os.path.join(path_model, "model")

# Files in the primary layer.
file_path_input_data = os.path.join(path_primary, "data_input.csv")
file_path_metrics_features_test = os.path.join(path_primary, "features_test_metrics.json")
file_path_metrics_features_selected = os.path.join(path_primary, "features_selected.json")
file_path_not_outliers = os.path.join(path_primary, "data_not_outliers.csv")

# Files in the model layer.
file_path_best_params = os.path.join(path_model, "best_params.json")
file_path_encoder_order = os.path.join(path_encoders, "encoders_orders.csv")

## Read content

In [None]:
# Load the modelling table; the first CSV column becomes the DataFrame index.
data_input = pd.read_csv(
    file_path_input_data, # candidate inputs: file_path_input_data or file_path_not_outliers
    index_col = 0
)

In [None]:
# Table listing the saved encoders; each of its rows is unpacked below as
# (encoder file name, encoder location) when the encoders are loaded.
encoders_order = pd.read_csv(
    file_path_encoder_order
)

In [None]:
# Load every encoder listed in `encoders_order`, preserving the row order.
# Each row is a (file name, location) pair; the dictionary key is the file
# name up to its first "." and the value is the object joblib deserializes
# from the given location.
encoders = {
    file_name.split(".")[0]: joblib.load(file_location)
    for file_name, file_location in encoders_order.values
}

In [None]:
# read model pycaret
# Load every serialized artifact found in `path_model_final`, keyed by the
# file name up to its first ".".
#
# Hidden entries (e.g. ".gitkeep", ".ipynb_checkpoints") and sub-directories
# are skipped: they are not joblib dumps and would make `joblib.load` fail.
# The listing is sorted because `os.listdir` returns entries in arbitrary
# order, which would otherwise make the winner of a key collision
# (e.g. "a.pkl" vs "a.joblib") platform dependent.
#
# NOTE(review): joblib.load unpickles arbitrary Python objects; only point
# this directory at artifacts produced by this project.
model = {}
for model_name in sorted(os.listdir(path_model_final)):
    model_file = os.path.join(path_model_final, model_name)
    if model_name.startswith(".") or not os.path.isfile(model_file):
        continue
    model[model_name.split(".")[0]] = joblib.load(model_file)

In [None]:
# Load the tuned hyper-parameters when the JSON file exists; otherwise start
# from an empty parameter set.
if os.path.exists(file_path_best_params):
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path_best_params, "r", encoding="utf-8") as json_file:
        best_params = json.load(json_file)["params"]
else:
    best_params = {}

# Later cells index best_params["random_state"] (train/test split and cross
# validation). Guarantee the key in both branches so a params file that does
# not store a seed falls back to the same default (42) instead of raising
# KeyError further down.
best_params.setdefault("random_state", 42)

In [None]:
# Select the artifact that was loaded from the file whose name, up to its
# first ".", is "actual_estimator"; raises KeyError if no such file was loaded.
estimator = model["actual_estimator"]

## Train/test split

In [None]:
# Column-name groups, kept as lists so they can be concatenated and passed
# straight to DataFrame.drop / DataFrame[...].
target = ["price"]            # column to predict
cd_setor_drop = ["cd_setor"]  # column removed from the features
id_drop = ["ID"]              # row identifier, carried along with X and y

# Columns removed from the feature matrix (the identifier is kept).
cols_drop = [*cd_setor_drop, *target]

In [None]:
# Features: every column except those in `cols_drop`; the ID column is kept
# here and removed right before fitting / predicting.
X = data_input.drop(columns=cols_drop)

# Targets: the ID column followed by the target column(s).
y = data_input[[*id_drop, *target]]

In [None]:
# Hold out 20% of the rows for testing; the seed is taken from `best_params`
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=best_params["random_state"],
)

## Apply encoders

In [None]:
# Run the loaded encoders, in dictionary order, over copies of the train and
# test features. For each encoder and each split the ID column is set aside,
# the remaining columns go through `transform`, and the IDs are re-inserted
# as the first column of the result, which then feeds the next encoder.
# NOTE(review): none of the later cells visible in this notebook read
# `encoded_data` - confirm whether the model is meant to consume it.
id_column = id_drop[0]
encoded_data = {"train": X_train.copy(), "test": X_test.copy()}

for encoder_name, encoder_content in encoders.items():
    print("--- encoder: [{}]".format(encoder_name))
    for split_name in list(encoded_data):
        split_frame = encoded_data[split_name]
        split_ids = split_frame[id_column].values
        transformed = encoder_content.transform(
            split_frame.drop(columns=id_column)
        )
        transformed.insert(0, id_column, split_ids)
        encoded_data[split_name] = transformed

## Fit model

In [None]:
# Fit the estimator on the training features with the ID column removed.
#
# The target is passed as a 1-D Series rather than a one-column DataFrame:
# scikit-learn expects y of shape (n_samples,) for a single target, and a
# column vector either triggers a DataConversionWarning or, for some
# estimators, yields 2-D predictions that no longer line up with the 1-D
# ground truth used when computing the metrics.
#
# NOTE(review): the features used here are the raw `X_train`, not
# `encoded_data["train"]` produced by the encoder step - confirm which one
# the model is meant to be trained on.
estimator.fit(
    X_train.drop(id_drop, axis = 1),
    y_train[target[0]]
)

## Metrics

In [None]:
def _split_arguments(features, targets):
    """Return ``[observed target values, estimator predictions]`` for a split.

    The ID column is removed from both frames; the observed values are the
    first remaining target column as a 1-D array.
    """
    observed = targets.drop(id_drop, axis=1).T.values[0]
    predicted = estimator.predict(features.drop(id_drop, axis=1))
    return [observed, predicted]


# Ground truth / prediction pairs for each split.
args_train = _split_arguments(X_train, y_train)
args_preds = _split_arguments(X_test, y_test)

# Raw metrics per split, computed by the project's metrics helper.
metrics_train = metrics_calculate(args_train)
metrics_pred = metrics_calculate(args_preds)

# Labelled results per split, then merged into a single mapping.
metrics_train = show_results(metrics_train, "train")
metrics_test = show_results(metrics_pred, "test")
metrics_results = metrics_train | metrics_test

In [None]:
# Output of the project's `ranking_output` helper for each split, keyed by
# split name ("train" first, then "test").
data_values = {
    split_name: ranking_output(split_args)
    for split_name, split_args in (("train", args_train), ("test", args_preds))
}

## Cross validation

In [None]:
# 5-split cross validation of the estimator over the full data set (ID column
# removed), using the same seed as the train/test split.
cv_results = cross_validation(
    X=X.drop(columns=id_drop).copy(),
    y=y.drop(columns=id_drop).copy(),
    model=[estimator],
    cv_splits=5,
    random_state=best_params["random_state"],
    show=True,
)

## Feature importances

In [None]:
# Feature importances from the project's tree explainer, requested for 10
# features; feature names are the columns of X without the ID column.
feature_importances, features_selected = tree_explainer(
    model=[estimator],
    features_names=[*X.drop(columns=id_drop).columns],
    n_features=10,
)

In [None]:
# SHAP explanation of the estimator on the test features (ID column removed).
shap_explainer([estimator], X_test.drop(columns=id_drop))

In [None]:
# Plot the importances computed by the tree explainer; the second positional
# argument is 10, matching the number of features requested from it.
fig_tree_explainer = plot_feature_importance(feature_importances, 10)

## Plots

In [None]:
# Pass the per-split outputs, the merged train/test metrics and the target
# column list to the project's `plot_predictions` helper; its return value is
# kept in `fig_predicts`.
fig_predicts = plot_predictions(
    data_values,
    metrics_results,
    target,
    figsize = (18, 8)
)

In [None]:
# Pass the per-split outputs to the project's `plot_true_vs_pred` helper with
# a dashed line style; its return value is kept in `fig_true_vs_pred`.
# NOTE(review): the meaning of the two positional `None` arguments is defined
# by `plot_true_vs_pred` and is not visible here - confirm against its signature.
fig_true_vs_pred = plot_true_vs_pred(
    data_values,
    None,
    None,
    figsize = (18, 6),
    linestyle = 'dashed'
)