# Imports

In [None]:
import sys
from pathlib import Path
PROJECT_DIR = Path.cwd().parent
sys.path.append(str(PROJECT_DIR))

# basics
import pandas as pd
import json

# feature_importance
import shap

# viz
import matplotlib.pyplot as plt

# models
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# utils
import os
from ppm.nodes.feature_explainer import (
    shap_explainer,
    tree_explainer
)
from ppm.nodes.metrics_calculator import (
    metrics_calculate,
    show_results
)
from ppm.nodes.plots import (
    scatterplot_yx,
    distribution_plot
)
from ppm.nodes.process_outputs import (
    ranking_output
)

# Parameters

In [None]:
# Relative locations of the data directory and its "03_primary" layer.
path_root = os.path.join("..", "data")
path_primary = os.path.join(path_root, "03_primary")

# CSV file read into `data_input`.
file_path_input_data = os.path.join(path_primary, "data_input.csv")

# JSON files holding the per-cutoff metrics and the selected feature list.
file_path_metrics_features_test, file_path_metrics_features_selected = (
    os.path.join(path_primary, file_name)
    for file_name in ("features_test_metrics.json", "features_selected.json")
)

In [None]:
# Passed to tree_explainer as its third argument; presumably the number of
# top-ranked features to keep — TODO confirm in ppm.nodes.feature_explainer.
number_of_features = 100

# Read dataset

In [None]:
# Load the input CSV, using its first column as the DataFrame index.
data_input = pd.read_csv(file_path_input_data, index_col=0)

# Shap process

In [None]:
# Column to predict, kept as a one-element list.
target = ["price"]

# Columns removed before building the feature matrix: "cd_setor", "ID" and
# the target column itself.
cols_to_drop = ["cd_setor", "ID", *target]

In [None]:
# Build the feature matrix from every column that is not an id or the target.
features_all = data_input.drop(cols_to_drop, axis=1)

# `features_selected` is only defined after a later cell (tree_explainer or the
# JSON loader) has run in this session. On a fresh kernel the lookup raises
# NameError, and a stored feature missing from the data raises KeyError; in
# both cases fall back to all remaining columns. Any other error is a real
# problem and is allowed to propagate instead of being silently swallowed.
try:
    X = features_all[features_selected]
except (NameError, KeyError):
    X = features_all

y = data_input[target[0]]

# One seed for both the split and the forest so reruns give the same model.
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

rf_model = RandomForestRegressor(random_state=random_state)
rf_model.fit(X_train, y_train)

In [None]:
# Run the project helper shap_explainer on the fitted model and the held-out
# features. NOTE(review): what it plots or returns is defined in
# ppm.nodes.feature_explainer and is not visible here.
shap_explainer(rf_model, X_test)

In [None]:
# Each list pairs the observed target values with the model's predictions for
# one split: [observed, predicted].
args_train = [y_train.values, rf_model.predict(X_train)]
args_preds = [y_test.values, rf_model.predict(X_test)]

# Metrics for the training split and for the held-out split.
metrics_train = metrics_calculate(args_train)
metrics_pred = metrics_calculate(args_preds)

In [None]:
# Pass each split's metrics through show_results together with a split label.
# Note that `metrics_train` is rebound to the helper's return value here.
metrics_train = show_results(metrics_train, "train")
metrics_test = show_results(metrics_pred, "test")
# Merge both results with `|`; presumably show_results returns dicts, making
# this a dict union — TODO confirm in ppm.nodes.metrics_calculator.
metrics_results = metrics_train|metrics_test

In [None]:
# Scatter plots built from the train and test [observed, predicted] pairs,
# requested as a 1x2 grid on a 20x8 figure.
fig_scatter_xy = scatterplot_yx(
    args_train,
    args_preds,
    dict(nrows=1, ncols=2, figsize=(20, 8)),
)

In [None]:
# Feed each [observed, predicted] pair to ranking_output. The result is used
# below via .filter(...).corr(), so it is presumably a DataFrame — TODO confirm
# in ppm.nodes.process_outputs.
data_preds = ranking_output(args_preds)
data_train = ranking_output(args_train)

In [None]:
# Correlation matrix of the columns selected by the regex.
# NOTE(review): 'rank*' is a regular expression, not a glob: it means "ran"
# followed by zero or more "k", so it matches any column name containing "ran".
# If only rank columns are intended, 'rank' (or '^rank') would be stricter —
# confirm against the columns produced by ranking_output before changing.
data_preds.filter(regex='rank*', axis=1).corr()

In [None]:
# Call tree_explainer with the fitted model (wrapped in a list), the feature
# names and the feature-count parameter. It returns two values, bound to
# `features_importance` and `features_selected`; the latter is the name the
# feature-matrix cell looks up when it is re-run.
features_importance, features_selected = tree_explainer(
    [rf_model],
    list(X.columns),
    number_of_features
)

In [None]:
# Distribution plot of the test-split ranking output on a 12x8 figure.
dist_plot_xy = distribution_plot(data_preds, dict(figsize=(12, 8)))

## Test cutoff of features

In [None]:
# Reload previously saved results when the files exist. A missing file leaves
# the corresponding variable untouched (or undefined on a fresh kernel).
# The encoding is given explicitly so reading does not depend on the platform's
# default locale encoding.
if os.path.exists(file_path_metrics_features_selected):
    with open(file_path_metrics_features_selected, 'r', encoding='utf-8') as json_file:
        # Replaces any `features_selected` computed earlier in the session.
        features_selected = json.load(json_file)
if os.path.exists(file_path_metrics_features_test):
    with open(file_path_metrics_features_test, 'r', encoding='utf-8') as json_file:
        metrics_all = json.load(json_file)

In [None]:
# Draw one bar chart per metric, with one bar per top-level key of
# `metrics_all`, read as metrics_all[key][type_of][metric].
type_of = "test"
chaves_internas = ['r2', 'mape', 'mae', 'median_ae', 'rmse', 'mse']
# The train view also needs "mse"; only add it when it is missing, because a
# duplicate entry would ask for more panels than the grid provides.
if type_of == "train" and "mse" not in chaves_internas:
    chaves_internas.append("mse")

# Size the grid from the number of metrics (ceiling division) so every metric
# gets a panel regardless of how the list above is edited.
num_cols = 2
num_rows = (len(chaves_internas) + num_cols - 1) // num_cols

# squeeze=False keeps `axes` two-dimensional even for a single row.
# NOTE(review): width scales with num_rows and height with num_cols, which
# looks swapped; kept as-is so the current figure size does not change.
fig, axes = plt.subplots(
    num_rows,
    num_cols,
    figsize=(18+num_rows, 9*num_cols),
    squeeze=False,
)
axes_flat = axes.ravel()  # row-major order: left to right, top to bottom

# Materialise the keys once; they are both the bar labels and the lookup keys.
chaves = list(metrics_all)

for ax, chave_interna in zip(axes_flat, chaves_internas):
    valores = [metrics_all[chave][type_of][chave_interna] for chave in chaves]

    ax.bar(chaves, valores)
    ax.set_xlabel('Chave')
    ax.set_ylabel(chave_interna.upper())
    ax.set_title(f'{chave_interna.upper()} por Chave')

    # r2 and mape are labelled with 5 decimals, every other metric with 2.
    casas = 5 if chave_interna in ('r2', 'mape') else 2
    for j, valor in enumerate(valores):
        ax.text(j, valor, str(round(valor, casas)), ha='center', va='bottom')

# Hide panels left over when the metric count does not fill the grid.
for ax in axes_flat[len(chaves_internas):]:
    ax.set_visible(False)

plt.tight_layout()

plt.show()