## Imports

In [32]:
# basics
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import joblib

# feature_importance
import shap

# viz
import matplotlib.pyplot as plt

# models
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)
from sklearn.model_selection import StratifiedKFold

# metrics
from sklearn.metrics import (
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
    median_absolute_error,
    make_scorer
)

# utils
import os

## Parameters

In [33]:
# Directory layout: every artefact lives under ../data (relative to the
# working directory the notebook is run from).
path_root = os.path.join("..", "data")
path_primary = os.path.join(path_root, "03_primary")
path_model = os.path.join(path_root, "04_model")
path_encoders = os.path.join(path_model, "encoders")
path_model_final = os.path.join(path_model, "model")

# Files stored in the 03_primary directory.
file_path_input_data = os.path.join(path_primary, "data_input.csv")
file_path_not_outliers = os.path.join(path_primary, "data_not_outliers.csv")
file_path_metrics_features_test = os.path.join(
    path_primary, "features_test_metrics.json"
)
file_path_metrics_features_selected = os.path.join(
    path_primary, "features_selected.json"
)

# Files stored in the 04_model directory tree.
file_path_best_params = os.path.join(path_model, "best_params.json")
file_path_encoder_order = os.path.join(path_encoders, "encoders_orders.csv")

## Read content

In [34]:
# Load the modelling table; the first CSV column becomes the index.
# Candidate inputs: file_path_input_data (used here) or file_path_not_outliers.
data_input = pd.read_csv(file_path_input_data, index_col=0)

In [36]:
# Table of persisted encoders; each row is later unpacked into an encoder
# name and the location that encoder is loaded from.
encoders_order = pd.read_csv(file_path_encoder_order)

In [37]:
# Load every persisted encoder, keyed by its name up to the first ".".
# Rows of encoders_order must unpack into exactly (name, location).
# NOTE(review): joblib.load unpickles arbitrary objects - only point this at
# trusted files, and presumably at files written by the same library versions.
encoders = {
    encoder_name.split(".")[0]: joblib.load(encoder_url)
    for encoder_name, encoder_url in encoders_order.values
}

In [38]:
# Load every file found in the final-model directory, keyed by the file name
# up to the first ".".
# NOTE(review): every directory entry is passed to joblib.load, so the folder
# is assumed to contain only joblib artefacts - confirm.
model = {
    model_name.split(".")[0]: joblib.load(
        os.path.join(path_model_final, model_name)
    )
    for model_name in os.listdir(path_model_final)
}

In [39]:
# Start from the fallback parameters and replace them with the "params" entry
# of the saved JSON file when that file is present.
best_params = {"random_state": 42}
if os.path.exists(file_path_best_params):
    with open(file_path_best_params, 'r') as params_file:
        best_params = json.load(params_file)["params"]

In [40]:
# Select the object loaded from the model file whose name (up to the first
# ".") is "actual_estimator"; it is the estimator fitted and evaluated below.
estimator = model["actual_estimator"]

## Train/test split

In [41]:
# Column roles in the input table.
target = ["price"]              # column to predict
cd_setor_drop = ["cd_setor"]    # column removed from the features
id_drop = ["ID"]                # identifier column, carried along with X and y

# Columns removed when building the feature matrix (the ID column is kept).
cols_drop = [*cd_setor_drop, *target]

In [42]:
# Feature matrix: every column except those in cols_drop (ID stays in).
X = data_input.drop(columns=cols_drop)

# Target frame: identifier column(s) followed by the target column(s).
y = data_input[[*id_drop, *target]]

In [43]:
# Hold out 20% of the rows for testing; the seed comes from best_params so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=best_params["random_state"],
)

## Encoding process

In [44]:
# Apply every loaded encoder, in dictionary order, to copies of the train and
# test features. The identifier column is set aside before each transform and
# re-inserted as the first column afterwards, so encoders never see it.
# NOTE(review): the later cells visible in this notebook keep using
# X_train / X_test, so encoded_data does not appear to be consumed - confirm.
# NOTE(review): the recorded run of this cell stopped with an AttributeError on
# SimpleImputer.keep_empty_features, presumably a mismatch between the library
# version that pickled the encoders and the one installed - confirm.
encoded_data = {"train": X_train.copy(), "test": X_test.copy()}

for encoder_name, encoder_content in encoders.items():
    print("--- encoder: [{}]".format(encoder_name))
    for type_name in list(encoded_data):
        current = encoded_data[type_name]
        id_row = current[id_drop[0]].values
        transformed = encoder_content.transform(
            current.drop(columns=id_drop[0])
        )
        # .insert requires the transform output to be a DataFrame.
        transformed.insert(0, id_drop[0], id_row)
        encoded_data[type_name] = transformed

--- encoder: [numerical_imputer]


AttributeError: 'SimpleImputer' object has no attribute 'keep_empty_features'

## Fit model

In [None]:
# Fit the estimator on the training features with the identifier removed.
# The target is passed as a 1-D Series instead of a one-column DataFrame:
# scikit-learn regressors expect a 1-D y for a single target and otherwise
# treat the input as a column vector (typically with a DataConversionWarning).
# NOTE(review): this fits on X_train, not on encoded_data["train"] produced by
# the encoding step - confirm that is intended.
estimator.fit(
    X_train.drop(id_drop, axis=1),
    y_train[target[0]],
)

## Metrics

In [None]:
# [y_true, y_pred] pairs for each split; the identifier column is removed
# before predicting and before extracting the target values.
args_train = [
    y_train.drop(id_drop, axis=1).T.values[0],
    estimator.predict(X_train.drop(id_drop, axis=1)),
]
args_preds = [
    y_test.drop(id_drop, axis=1).T.values[0],
    estimator.predict(X_test.drop(id_drop, axis=1)),
]

# Metric name -> scoring callable. "rmse" maps to the MSE function; its square
# root is taken when the metric is evaluated.
basics_metrics = {
    "r2": r2_score,
    "rmse": mean_squared_error,
    "mape": mean_absolute_percentage_error,
    "mse": mean_squared_error,
    "mae": mean_absolute_error,
    "median_ae": median_absolute_error,
}


def _score_split(y_true, y_pred):
    """Evaluate every entry of basics_metrics on one (y_true, y_pred) pair."""
    return {
        metric_name: (
            np.sqrt(metric_fn(y_true, y_pred))
            if metric_name == "rmse"
            else metric_fn(y_true, y_pred)
        )
        for metric_name, metric_fn in basics_metrics.items()
    }


# Correlation and sample size come first, followed by the basic metrics.
metrics_train = {
    "correlation": np.corrcoef(*args_train)[0, 1],
    "size_train": len(args_train[0]),
    **_score_split(*args_train),
}
metrics_pred = {
    "correlation": np.corrcoef(*args_preds)[0, 1],
    "size_test": len(args_preds[0]),
    **_score_split(*args_preds),
}

# Report both splits and keep them together for the plotting cells.
metrics_results = {"train": metrics_train, "test": metrics_pred}
for split_name, split_metrics in metrics_results.items():
    print(f'-------- [ {split_name} ] ----------')
    for metric, result in split_metrics.items():
        print(f"{metric} : {round(result, 4)}")

In [None]:
# Build, for each split, a table with the identifier, the true and predicted
# values, and the descending rank of each (rank 0 = largest value).
data_values = {}
for name, id_col, arg in zip(
    ['train', 'test'],
    [X_train[id_drop[0]].values, X_test[id_drop[0]].values],
    [args_train, args_preds],
):
    # arg is [y_true, y_pred]; transpose so each row is one observation.
    content = pd.DataFrame(arg, index=["y_true", "y_pred"]).T
    content.insert(0, id_drop[0], id_col)

    # Sort once per column and number the rows; a second sort of the already
    # sorted frame would produce the same 0..n-1 sequence.
    for value_col, rank_col in (
        ("y_true", "rank_true"),
        ("y_pred", "rank_pred"),
    ):
        content = content.sort_values(value_col, ascending=False)
        content[rank_col] = np.arange(len(content))

    # Shuffle the rows with the notebook-wide seed so that reruns (and the
    # plots that depend on row order) are reproducible.
    content = content.sample(
        frac=1, random_state=best_params["random_state"]
    ).reset_index(drop=True)
    data_values[name] = content

## Cross validation

In [None]:
# K-fold cross validation of the estimator on the full data set, reporting the
# mean and standard deviation of each metric across folds.
from sklearn.base import clone

n_folds = 5
stratified_kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=best_params["random_state"],
)

# Metric name -> [scoring callable, per-fold values]. "rmse" stores the MSE
# callable and takes the square root when a fold is scored. Each key appears
# once: a repeated "mae" key would silently replace mean absolute error with
# median absolute error.
scores = {
    "rmse": [mean_squared_error, []],
    "r2": [r2_score, []],
    "mape": [mean_absolute_percentage_error, []],
    "mse": [mean_squared_error, []],
    "median_ae": [median_absolute_error, []],
    "mae": [mean_absolute_error, []],
}
score_validation = []  # not populated by the code visible in this notebook
_X = X.drop(id_drop, axis=1).copy()
_y = y.drop(id_drop, axis=1).copy()

# StratifiedKFold needs class labels, not a continuous target, so the folds
# are stratified on quantile bins of the target instead of its raw values.
# duplicates="drop" merges bins whose edges coincide.
n_strata = 10
_strata = pd.qcut(_y[target[0]], q=n_strata, labels=False, duplicates="drop")

for train_index, test_index in tqdm(
    stratified_kfold.split(_X, _strata),
    total=n_folds,
):
    X_cv_train, X_cv_test = _X.values[train_index], _X.values[test_index]
    # Flatten the one-column target so the estimator receives a 1-D y.
    y_cv_train = _y.values[train_index].ravel()
    y_cv_test = _y.values[test_index].ravel()

    # Fit an unfitted copy so the estimator trained on X_train (used by the
    # feature-importance and SHAP cells) is not overwritten by the last fold.
    fold_estimator = clone(estimator)
    fold_estimator.fit(X_cv_train, y_cv_train)

    y_cv_pred = fold_estimator.predict(X_cv_test)

    args = [y_cv_test, y_cv_pred]
    for i, content in scores.items():
        fold_value = content[0](*args)
        content[1].append(np.sqrt(fold_value) if i == "rmse" else fold_value)

for i, content in scores.items():
    text_mean = f"{i}: {round(np.mean(content[1]), 5)}"
    text_std = f"{i}_std: {round(np.std(content[1]), 5)}"
    ruler = "-" * max((len(text_mean), len(text_std)))
    print(ruler)
    print(text_mean)
    print(text_std)
    print(ruler)

## Feature importances

In [None]:
# Importance of each feature as exposed by the fitted estimator, largest
# first. "fe" holds the raw importance, "coefs_transform" the share of the
# total, and "features" the feature name.
feature_importances = pd.DataFrame(
    {"fe": estimator.feature_importances_},
    index=X_train.drop(id_drop, axis=1).columns,
)
feature_importances = feature_importances.sort_values("fe", ascending=False)

# Normalise so the importances sum to one.
feature_importances["coefs_transform"] = (
    feature_importances["fe"] / feature_importances["fe"].sum()
)

# Move the feature names out of the index into a regular column.
feature_importances = feature_importances.reset_index().rename(
    columns={"index": "features"}
)

In [None]:
# SHAP values of the fitted estimator on the test features (identifier
# column removed), shown as a violin summary plot.
explainer = shap.Explainer(estimator)
_X_test_features = X_test.drop(id_drop, axis=1)
shap_values = explainer.shap_values(_X_test_features)

# shap's own colour bar and its automatic show() are switched off so that a
# colour bar with a custom label can be added through pyplot before display.
shap.summary_plot(
    shap_values,
    _X_test_features,
    plot_type="violin",
    color_bar=False,
    show=False,
)
plt.colorbar(label='SHAP Value')

plt.show()

In [None]:
# Plot the normalised importance of the k most important features and label
# each point with the feature name.
k = 10
top_features = feature_importances.head(k)

fig, axes = plt.subplots(1, 1, figsize=(12, 8))
axes.plot(
    range(len(top_features)),
    top_features.coefs_transform,
    "*",
    c="black",
)

for x, (_, row) in enumerate(top_features.iterrows()):
    axes.annotate(
        row['features'],
        xy=(x, row['coefs_transform']),
        ha='center',
        va='bottom',
    )

# Enable the grid once, explicitly: grid() with no arguments toggles the
# grid, so calling it once per annotated feature leaves it off for even k.
axes.grid(True)
plt.show()

## Plots

In [None]:
# Scatter of predicted against true values for each split, with the identity
# line as reference and the split's R^2 in the title.
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

for ax, (name, content), metric_content in zip(
    axes, data_values.items(), metrics_results.values()
):
    # Dashed red identity line: where a perfect prediction would fall.
    ax.plot(content["y_true"], content["y_true"], 'r', linewidth=2, linestyle='dashed')
    ax.plot(content["y_true"], content["y_pred"], '*')
    ax.set_title(name.title() + r" | $R^{2}$ = " + "{:.5}".format(metric_content["r2"]))
    ax.set_xlabel(f"True {target[0]}")
    ax.set_ylabel(f"Predict {target[0]}")
    # Enable the grid once, explicitly: grid() with no arguments toggles the
    # grid, so two bare calls per axis switch it on and straight back off.
    ax.grid(True)

plt.show()


In [None]:
# True and predicted values plotted in row order for each split, with the
# Pearson correlation between them in the title.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

for (name, content), ax in zip(data_values.items(), axes):
    y_true_values = content['y_true'].values
    y_pred_values = content['y_pred'].values
    ax.plot(y_true_values, label="true")
    ax.plot(y_pred_values, label="pred")
    ax.set_title(
        name.title()
        + r" | $\rho$ = "
        + "{:.5}".format(np.corrcoef(y_true_values, y_pred_values)[0, 1])
    )
    # Legend per axes: a single plt.legend() after the loop only labels the
    # last (current) axes.
    ax.legend()

plt.show()