## Imports

In [None]:
# basics
import numpy as np
import pandas as pd
import joblib
import shap

# model
from sklearn.decomposition import PCA
from sklearn.metrics import (
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
    median_absolute_error
)

# viz
import matplotlib.pyplot as plt
import seaborn as sns

# utils
import os
from tqdm import tqdm

## Parameters

In [None]:
# Number of leading rows of the input CSV that are kept for scoring
# (passed to DataFrame.head() when data_input is read).
n_samples_oos = 2

In [None]:
# Root folders: the rows to score are read from ../oos, while the fitted
# encoders and the model are read from ../data.
path_root_oos = os.path.join("..", "oos")  # data or oos
path_root_data = os.path.join("..", "data")

# Stage folders below each root.
path_primary = os.path.join(path_root_oos, "03_primary")
path_model = os.path.join(path_root_data, "04_model")
path_encoders = os.path.join(path_model, "encoders")
path_model_final = os.path.join(path_model, "model")

# Files read by this notebook.
file_path_input_data = os.path.join(path_primary, "data_input.csv")
file_path_encoder_order = os.path.join(path_encoders, "encoders_orders.csv")

## Read

In [None]:
# Table listing the fitted encoders; each row is later unpacked as
# (encoder file name, location passed to joblib.load), and the encoders are
# applied in row order.
encoders_order = pd.read_csv(file_path_encoder_order)

In [None]:
# read model pycaret
# Load every artifact stored in path_model_final, keyed by file name up to
# the first dot, then pick the estimator used for predictions.
# NOTE(security): joblib.load unpickles arbitrary Python objects; only point
# path_model_final at artifacts produced by a trusted pipeline.
model = {}
# sorted() makes the load order (and therefore which file wins when two
# files share the same stem) deterministic; os.listdir order is arbitrary.
for model_name in sorted(os.listdir(path_model_final)):
    model_file = os.path.join(path_model_final, model_name)
    # Hidden entries (.gitkeep, .ipynb_checkpoints, ...) and sub-folders are
    # not model artifacts; joblib.load would raise on them.
    if model_name.startswith(".") or not os.path.isfile(model_file):
        continue
    model[model_name.split(".")[0]] = joblib.load(model_file)

if "actual_estimator" not in model:
    # Same exception type as the plain lookup, but says where it looked.
    raise KeyError(
        f"'actual_estimator' not found in {path_model_final!r}; "
        f"loaded artifacts: {sorted(model)}"
    )
estimator = model["actual_estimator"]

In [None]:
# read encoders
# Build {encoder name without extension: fitted encoder}, keeping the row
# order of encoders_order (dicts preserve insertion order, and the encoders
# are applied in that order further down).
encoders = {
    stored_name.split(".")[0]: joblib.load(stored_location)
    for stored_name, stored_location in encoders_order.values
}

In [None]:
# Read the input table (first CSV column becomes the index) and keep only
# the first n_samples_oos rows.
# Source options noted by the author: file_path_input_data, file_path_not_outliers
data_input = pd.read_csv(file_path_input_data, index_col=0).head(n_samples_oos)

## Filter columns

In [None]:
# Column groups, kept as lists so they can be concatenated and passed
# straight to DataFrame.drop / DataFrame.__getitem__.
target = ["price"]
cd_setor_drop = ["cd_setor"]
id_drop = ["ID"]

# Columns removed from the feature matrix (the ID column is kept for now).
cols_drop = [*cd_setor_drop, *target]

In [None]:
# Features: everything except the dropped columns (ID is still present).
X = data_input.drop(columns=cols_drop)

# Ground truth: ID alongside the target column.
y = data_input[[*id_drop, *target]]

## Encoder process

In [None]:
# Apply every fitted encoder, in order, to each frame in encoded_data.
# The ID column is set aside before each transform and re-inserted as the
# first column afterwards, so the encoders never see it.
id_column = id_drop[0]
encoded_data = {"oos": X.copy()}
for encoder_name, encoder_content in encoders.items():
    print("--- encoder: [{}]".format(encoder_name))
    for type_name in list(encoded_data):
        current_frame = encoded_data[type_name]
        id_row = current_frame[id_column].values
        transformed = encoder_content.transform(
            current_frame.drop(columns=id_column)
        )
        transformed.insert(0, id_column, id_row)
        encoded_data[type_name] = transformed

## Predictions

In [None]:
# Independent copy of the encoded "oos" frame; it still carries the ID
# column, which is dropped before predicting and explaining.
X_encoded = encoded_data["oos"].copy()

In [None]:
# The ID column is an identifier, not a model input: remove it first.
y_pred = estimator.predict(X_encoded.drop(columns=id_drop))

## Metrics

In [None]:
# Ground truth and predictions, in (y_true, y_pred) order.
type_of = "oos"
args_preds = [y[target[0]].values, y_pred]

# Metric name -> callable taking (y_true, y_pred).
# "rmse" maps to mean_squared_error; the square root is applied below.
basics_metrics = {
    "r2": r2_score,
    "rmse": mean_squared_error,
    "mape": mean_absolute_percentage_error,
    "mse": mean_squared_error,
    "mae": mean_absolute_error,
    "median_ae": median_absolute_error,
}

# Start with the quantities that are not plain sklearn metrics.
metrics_pred = {
    "correlation": np.corrcoef(args_preds[0], args_preds[1])[0, 1],
    f"size_{type_of}": len(args_preds[0]),
}

for metric_name, metric_fn in basics_metrics.items():
    score = metric_fn(*args_preds)
    metrics_pred[metric_name] = np.sqrt(score) if metric_name == "rmse" else score

# Report, then store the metrics under the split name.
metrics_results = {}
print(f'-------- [ {type_of} ] ----------')
for metric, result in metrics_pred.items():
    print(f"{metric} : {round(result, 4)}")
metrics_results[type_of] = metrics_pred

In [None]:
# Per split, build a table with ID, y_true, y_pred and the descending rank
# of each row by true value and by predicted value (0 = largest).
data_values = {}
splits_to_tabulate = [
    ("oos", X_encoded[id_drop[0]].values, args_preds),
]
for name, id_col, arg in splits_to_tabulate:
    content = pd.DataFrame(arg, index=["y_true", "y_pred"]).T
    content.insert(0, id_drop[0], id_col)

    for value_col, rank_col in (("y_true", "rank_true"), ("y_pred", "rank_pred")):
        # Sort descending, then number the rows 0..n-1 in that order; the
        # index is assigned positionally, not aligned on labels.
        content = content.sort_values(value_col, ascending=False)
        content[rank_col] = content.reset_index(drop=True).index

    # Shuffle the rows (unseeded, so the order changes between runs).
    content = content.sample(frac=1).reset_index(drop=True)
    data_values[name] = content

## Plots

In [None]:
# SHAP summary (violin) plot for the scored rows, computed on the model
# inputs only (ID column removed).
shap_features = X_encoded.drop(columns=id_drop)

explainer = shap.Explainer(estimator)
shap_values = explainer.shap_values(shap_features)

# SHAP's own color bar is switched off and the figure is not shown yet, so
# that a color bar labelled 'SHAP Value' can be added before displaying.
shap.summary_plot(
    shap_values,
    shap_features,
    plot_type="violin",
    color_bar=False,
    show=False,
)
plt.colorbar(label='SHAP Value')

plt.show()

In [None]:
# True-vs-predicted scatter, one panel per split, with the identity line
# (dashed red) as the perfect-prediction reference.
n_panels = len(data_values)
fig, axes = plt.subplots(1, n_panels, figsize=(20, 8))
if n_panels == 1:
    # plt.subplots returns a bare Axes for a single panel.
    axes = [axes]

panels = zip(axes, data_values.items(), metrics_results.values())
for ax, (name, content), metric_content in panels:
    y_true_col = content["y_true"]
    ax.plot(y_true_col, y_true_col, 'r', linewidth=2, linestyle='dashed')
    ax.plot(y_true_col, content["y_pred"], '*')
    ax.set_title(name.title() + r" | $R^{2}$ = " + "{:.5}".format(metric_content["r2"]))
    ax.set_xlabel(f"True {target[0]}")
    ax.set_ylabel(f"Predict {target[0]}")
    ax.grid()

plt.show()

In [None]:
# True and predicted values drawn as two lines per split (rows in the order
# stored in data_values). The title reports the sample size, the correlation
# between values and the correlation between ranks.
n_panels = len(data_values)
fig, axes = plt.subplots(1, n_panels, figsize=(18, 8))
if n_panels == 1:
    # plt.subplots returns a bare Axes for a single panel.
    axes = [axes]
for (name, content), ax in zip(data_values.items(), axes):
    ax.plot(content["y_true"].values, label="true")
    ax.plot(content["y_pred"].values, label="pred")

    # Correlations come from this panel's own frame (not a hard-coded "oos"
    # entry), and the columns are selected by exact name: the previous
    # regexes r"y_*" / r"rank*" match any column containing "y" / "ran".
    rho_values = content[["y_true", "y_pred"]].corr().values[0, 1]
    rho_rank = content[["rank_true", "rank_pred"]].corr().values[0, 1]

    ax.set_title(
        name
        + f" | $n$ = {len(content)}"
        + r" | $\rho_{values}$ = "
        + "{:.5}".format(rho_values)
        + r" | $\rho_{rank}$ = "
        + "{:.5}".format(rho_rank)
    )
    ax.grid()
    # Legend on every panel; plt.legend() would only reach the last axes.
    ax.legend()

In [None]:
# Display the per-row table for the "oos" split: ID, y_true, y_pred and
# both rank columns (rows are in shuffled order).
data_values['oos']