## Imports

In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import search path.
# NOTE(review): presumably this is what lets the `ppm` package imported
# further down resolve when the notebook runs from a subfolder — confirm.
PROJECT_DIR = Path.cwd().parent
sys.path.append(str(PROJECT_DIR))

# basics
import pandas as pd
import numpy as np
from tqdm import tqdm
import json

# viz
import matplotlib.pyplot as plt

# models
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# utils
import os
from ppm.nodes.feature_explainer import (
    shap_explainer
)

In [None]:
# Upper bound on how many top-ranked features are kept: the ranked feature
# list is sliced with `[:k_features]` in the "Get best features" section.
k_features = 500

## Parameters

In [None]:
# File locations used by this notebook. All paths are relative, so they
# resolve against the directory the notebook is run from.
path_root = os.path.join("..", "data")
path_primary = os.path.join(path_root, "03_primary")

# Output: JSON file that receives the list of selected feature names.
file_path_metrics_features_selected = os.path.join(
    path_primary, "features_selected.json"
)

# Input: CSV that is loaded into `data_input` in the "Read" section.
# This cell only defines paths; the load itself is not repeated here so the
# file is read exactly once.
file_path_not_outliers = os.path.join(
    path_primary, "data_not_outliers.csv"
)

## Read

In [None]:
# Load the CSV at `file_path_not_outliers`; its first column is used as the
# DataFrame index.
data_input = pd.read_csv(file_path_not_outliers, index_col=0)

## Feature selection

In [None]:
# Settings for the feature-selection run.

# Column to predict, kept as a one-element list.
target = ["price"]

# Columns excluded from the feature matrix, including the target itself.
cols_to_drop = ["cd_setor", "ID", *target]

# Column count of the loaded table minus one.
number_of_features = data_input.shape[1] - 1

# Random seed value.
random_state = 42

In [None]:
# Feature matrix: every column except those listed in `cols_to_drop`.
X = data_input.drop(columns=cols_to_drop)
# `target` is a list, so this selection yields a DataFrame rather than a Series.
y = data_input[target]

In [None]:
# Hold out 20% of the rows for testing. The seed comes from the shared
# `random_state` setting rather than a second hard-coded 42, so changing the
# setting also changes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

## Modelling

In [None]:
# Seed the forest with the shared `random_state`: an unseeded forest is
# fitted differently on every run, so the feature ranking derived from it
# (and the saved feature list) would not be reproducible.
model = RandomForestRegressor(random_state=random_state)

# `y_train` is a one-column DataFrame (selected with the one-element `target`
# list); flatten it to 1-D, the shape scikit-learn expects for a single
# target, which avoids its DataConversionWarning.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Hand the fitted model (wrapped in a list) and the full feature matrix to the
# project helper; it returns a pair that is bound here as (fig_shap, explainer).
fig_shap, explainer = shap_explainer([model], X)

# SHAP values for every row of X, used below to rank the features.
shap_values = explainer.shap_values(X)

## Get best features

In [None]:
# Score each feature by the mean absolute SHAP value along axis 0.
importance_scores = np.mean(np.abs(shap_values), axis=0)

# Pair every column name with its score and order the pairs from the highest
# score to the lowest.
sorted_features = sorted(
    zip(X.columns, importance_scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Keep only the names of the first `k_features` entries.
selected_features = [name for name, _ in sorted_features[:k_features]]

## Save

In [None]:
# Write the chosen feature names to disk as JSON under the
# "selected_features" key.
selected_features_dict = {"selected_features": selected_features}
with open(file_path_metrics_features_selected, "w") as file:
    file.write(json.dumps(selected_features_dict))