In [7]:
# --- Step 1: Import ---
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import sys
import sqlite3


# Assumi che il file config.py sia in src/
sys.path.append(os.path.abspath('..'))
from src import config  # config.RAW_DATA_PATH deve essere definito
from src.config import RAW_DATA_PATH
# --- Step 2: File paths (every file lives under RAW_DATA_PATH) ---
input_file, output_file, feature_list_file = (
    os.path.join(RAW_DATA_PATH, filename)
    for filename in (
        "dati_training.csv",        # training table that is read in
        "training_MS.csv",          # reduced table that is written out
        "feature_selected_rf.txt",  # names of the selected features
    )
)

# --- Step 3: Parameters ---
# Name of the column to predict; change if needed.
TARGET_COL = "Median Salary (USD)"
# A feature is kept only when its mean permutation importance is strictly
# greater than this value.
IMPORTANCE_THRESHOLD = 0.01

# --- Step 4: Load the data and split it into features / target ---
df = pd.read_csv(input_file)
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# --- Step 5: Minimal encoding of the categorical columns ---
# Work on a copy: X itself is left untouched, so it still carries the
# original category labels. Each object-dtype column is replaced by the
# integer codes returned by factorize(); the unique labels are discarded.
X_encoded = X.copy()
for col in X_encoded.select_dtypes(include="object").columns:
    codes, _ = X_encoded[col].factorize()
    X_encoded[col] = codes

# --- Step 6: Fit a random forest and compute permutation importance ---
rf = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_encoded, y)

# NOTE(review): importance is evaluated on the same rows the forest was
# fitted on (no held-out split) -- confirm this is intended.
result = permutation_importance(
    rf,
    X_encoded,
    y,
    n_repeats=10,
    random_state=42,
    n_jobs=1,
)

# Mean importance over the repeats, labelled by feature name.
importances = pd.Series(result.importances_mean, index=X_encoded.columns)

# --- Step 7: Keep the features whose importance exceeds the threshold ---
above_threshold = importances > IMPORTANCE_THRESHOLD
selected_features = importances.loc[above_threshold].index.tolist()
print(f"✅ Feature selezionate ({len(selected_features)}):\n{selected_features}")

# --- Step 8: Build the reduced dataset and save it ---
# Index the original X (not X_encoded) so that categorical columns are
# written with their original labels rather than the integer codes.
X_selected = X[selected_features].copy()
X_selected[TARGET_COL] = y
X_selected.to_csv(output_file, index=False)

# --- Step 9: Save the list of selected features, one name per line ---
# The encoding is set explicitly: without it open() falls back to the
# platform's locale encoding, so a non-ASCII feature name could be written
# differently from the CSV above (to_csv defaults to UTF-8) or fail to
# encode at all.
with open(feature_list_file, "w", encoding="utf-8") as f:
    f.writelines(f"{feat}\n" for feat in selected_features)

print(f"\n📁 Dataset salvato in: {output_file}")
print(f"📝 Lista feature salvata in: {feature_list_file}")


✅ Feature selezionate (12):
['Job Title', 'Industry', 'Job Status', 'AI Impact Level', 'Required Education', 'Experience Required (Years)', 'Job Openings (2024)', 'Projected Openings (2030)', 'Remote Work Ratio (%)', 'Automation Risk (%)', 'Location', 'Gender Diversity (%)']

📁 Dataset salvato in: e:\Download-E\PROGETTO PYTHON PERSONALE\data\training_MS.csv
📝 Lista feature salvata in: e:\Download-E\PROGETTO PYTHON PERSONALE\data\feature_selected_rf.txt
