# Imports

In [None]:
# basics
import pandas as pd
import numpy as np
import json

# feature_importance
import shap

# viz
import matplotlib.pyplot as plt

# models
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# metrics
from sklearn.metrics import (
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
    median_absolute_error
)

# utils
import os

# Parameters

In [None]:
# All files used by this notebook live under ../data/03_primary.
path_root = os.path.join("..", "data")
path_primary = os.path.join(path_root, "03_primary")

# CSV table that is read into `data_input`.
file_path_input_data = os.path.join(path_primary, "data_input.csv")

# JSON files that are loaded further down when they exist on disk.
file_path_metrics_features_test = os.path.join(
    path_primary, "features_test_metrics.json"
)
file_path_metrics_features_selected = os.path.join(
    path_primary, "features_selected.json"
)

In [None]:
# How many of the top-ranked features are kept when `features_selected`
# is built from the model's feature importances.
number_of_features = 100

# Read dataset

In [None]:
# Load the input table; the first CSV column is used as the row index.
data_input = pd.read_csv(file_path_input_data, index_col=0)

# SHAP process

In [None]:
# The target is kept as a one-element list so it can be spliced into the
# drop list below; the column name itself is target[0].
target = ["price"]

# Columns removed from the feature matrix: two non-feature columns
# (presumably identifiers -- confirm against the dataset) plus the target.
cols_to_drop = ["cd_setor", "ID", *target]

In [None]:
# Candidate features: everything except the dropped columns and the target.
features_all = data_input.drop(cols_to_drop, axis=1)

# `features_selected` only exists once a later cell of this notebook has run
# (feature-importance ranking or the JSON loader). On a fresh kernel the name
# is undefined, so fall back to every candidate feature.
try:
    X = features_all[features_selected]
except (NameError, KeyError):
    # NameError: no selection has been computed or loaded yet.
    # KeyError: the selection names columns that this dataset does not have.
    X = features_all

y = data_input[target[0]]

random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Seed the forest with the same value as the split so that metrics, SHAP
# values and feature importances are reproducible from run to run.
rf_model = RandomForestRegressor(random_state=random_state)
rf_model.fit(X_train, y_train)

In [None]:
# Compute SHAP values of the fitted forest for the held-out test rows.
explainer = shap.Explainer(rf_model)
shap_values = explainer.shap_values(X_test)
# Draw the summary as a violin plot. SHAP's own color bar is switched off and
# show=False is passed so that a custom color bar can be added before the
# figure is displayed by plt.show() below.
shap.summary_plot(shap_values,
                  X_test,
                  plot_type="violin",
                  color_bar=False, show=False)
# NOTE(review): in SHAP summary plots the color usually encodes the feature
# value rather than the SHAP value -- confirm that this label is intended.
plt.colorbar(label='SHAP Value')

plt.show()

In [None]:
def _regression_metrics(y_true, y_pred, size_key):
    """Return the regression metrics of one data split as a dict.

    Args:
        y_true: Observed target values.
        y_pred: Model predictions, in the same order as ``y_true``.
        size_key: Name of the entry that stores the number of samples
            (``"size_train"`` or ``"size_test"`` in this notebook).

    Returns:
        Dict with the keys ``r2``, ``mape``, ``rmse``, ``mse``, ``mae``,
        ``median_ae``, ``correlation`` and ``size_key``, in that order.
    """
    # MSE feeds both the "mse" and the "rmse" entries; compute it once.
    mse = mean_squared_error(y_true, y_pred)
    return {
        "r2": r2_score(y_true, y_pred),
        "mape": mean_absolute_percentage_error(y_true, y_pred),
        "rmse": np.sqrt(mse),
        "mse": mse,
        "mae": mean_absolute_error(y_true, y_pred),
        "median_ae": median_absolute_error(y_true, y_pred),
        # Off-diagonal element of the 2x2 correlation matrix.
        "correlation": np.corrcoef(y_true, y_pred)[0, 1],
        size_key: len(y_true),
    }


# [observed, predicted] pairs; both lists are reused by later cells.
args_train = [y_train.values, rf_model.predict(X_train)]
args_preds = [y_test.values, rf_model.predict(X_test)]

metrics_train = _regression_metrics(*args_train, size_key="size_train")
metrics_pred = _regression_metrics(*args_preds, size_key="size_test")

# Print both reports, rounded to four decimals.
for split_name, split_metrics in (("train", metrics_train), ("test", metrics_pred)):
    print(f'-------- [ {split_name} ] ----------')
    for metric, result in split_metrics.items():
        print(f"{metric} : {round(result, 4)}")

In [None]:
# One panel per split: observed values on x, predictions on y.
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

panels = zip(axes, (args_train, args_preds), ("Train", "Test"))
for panel, (observed, predicted), title in panels:
    panel.plot(observed, predicted, "*")
    panel.set_title(title)
    panel.set_xlabel("True")
    panel.set_ylabel("Prediction")
    panel.grid()

In [None]:
# One row per test sample, with the columns y_true and y_pred.
data_preds = pd.DataFrame(args_preds, index=["y_true", "y_pred"]).T

# Positional ranks, where 0 is the largest value: order the rows by each
# column in turn and number them from the top.
for value_col, rank_col in (("y_true", "rank_true"), ("y_pred", "rank_pred")):
    data_preds = data_preds.sort_values(value_col, ascending=False)
    data_preds[rank_col] = pd.RangeIndex(len(data_preds))

# Shuffle the rows (no seed is given) and replace the index with 0..n-1.
data_preds = data_preds.sample(frac=1).reset_index(drop=True)

In [None]:
# Correlation matrix of the two rank columns (rank_true, rank_pred).
# The pattern is anchored on purpose: an unanchored 'rank*' reads as "ran"
# followed by any number of "k" and would also select any column whose name
# merely contains "ran".
data_preds.filter(regex=r'^rank_', axis=1).corr()

In [None]:
# Importance reported by the fitted forest for each input feature, as a
# one-column frame ("fe") ordered from most to least important.
features_importance = (
    pd.Series(
        rf_model.feature_importances_,
        index=rf_model.feature_names_in_,
        name="fe",
    )
    .to_frame()
    .sort_values("fe", ascending=False)
)

# Names of the `number_of_features` highest-ranked features.
features_selected = features_importance.index[:number_of_features].tolist()

In [None]:
# Overlay observed and predicted values in the current row order of data_preds.
plt.figure(figsize=(12, 8))
for column, label in (("y_true", "true"), ("y_pred", "pred")):
    plt.plot(data_preds[column].values, label=label)
plt.grid()
plt.legend()

## Test the feature cutoff

In [None]:
# Load the saved feature selection, if that JSON file exists; otherwise
# `features_selected` keeps whatever value it already has.
if os.path.exists(file_path_metrics_features_selected):
    with open(file_path_metrics_features_selected) as selected_file:
        features_selected = json.load(selected_file)

# Load the saved per-cutoff metrics, if that JSON file exists. When the file
# is missing, `metrics_all` is not assigned here.
if os.path.exists(file_path_metrics_features_test):
    with open(file_path_metrics_features_test) as metrics_file:
        metrics_all = json.load(metrics_file)

In [None]:
# Which split of `metrics_all` to chart: metrics_all[key][type_of][metric].
type_of = "test"

# Metrics to chart, one panel each. A former train-only branch appended a
# second "mse", which asked for seven panels on a fixed 3x2 grid and raised
# IndexError; the grid below is sized from this list instead.
chaves_internas = ['r2', 'mape', 'mae', 'median_ae', 'rmse', 'mse']

num_cols = 2
# Ceiling division: enough rows for every metric.
num_rows = (len(chaves_internas) + num_cols - 1) // num_cols

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(
    num_rows,
    num_cols,
    figsize=(18 + num_rows, 9 * num_cols),
    squeeze=False,
)

# Fix the key order once so bars and value labels line up in every panel.
chaves = list(metrics_all)

for i, chave_interna in enumerate(chaves_internas):
    valores = [metrics_all[chave][type_of][chave_interna] for chave in chaves]

    # divmod gives the (row, column) position of the i-th panel.
    ax = axes[divmod(i, num_cols)]

    ax.bar(chaves, valores)
    ax.set_xlabel('Chave')
    ax.set_ylabel(chave_interna.upper())
    ax.set_title(f'{chave_interna.upper()} por Chave')

    # r2 and mape are small ratios, so they get more decimals on the labels.
    casas_decimais = 5 if chave_interna in ('r2', 'mape') else 2
    for j, valor in enumerate(valores):
        ax.text(j, valor, str(round(valor, casas_decimais)), ha='center', va='bottom')

# Hide panels left over when the metric count does not fill the grid.
for ax_vazio in axes.ravel()[len(chaves_internas):]:
    ax_vazio.set_visible(False)

plt.tight_layout()

plt.show()