In [7]:
# --- Step 1: Import ---
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import sys
import sqlite3


# Assumi che il file config.py sia in src/
sys.path.append(os.path.abspath('..'))
from src import config  # config.RAW_DATA_PATH deve essere definito
from src.config import RAW_DATA_PATH
# --- 2. Funzioni di preprocessing ---

input_file = os.path.join(RAW_DATA_PATH, "dati_training.csv")
output_file = os.path.join(RAW_DATA_PATH, "training_MS.csv")
feature_list_file = os.path.join(RAW_DATA_PATH, "feature_selected_rf.txt")

# --- Step 3: Parametri ---
TARGET_COL = "Median Salary (USD)"  # cambia se necessario
IMPORTANCE_THRESHOLD = 0.01         # soglia minima di importanza

# --- Step 4: Carica dati ---
df = pd.read_csv(input_file)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# --- Step 5: Codifica minima per le categoriche (senza modificarle nei salvataggi) ---
X_encoded = X.copy()
for col in X_encoded.select_dtypes(include="object").columns:
    X_encoded[col], _ = X_encoded[col].factorize()

# --- Step 6: Addestra Random Forest e calcola permutation importance ---
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_encoded, y)

result = permutation_importance(rf, X_encoded, y, n_repeats=10, random_state=42, n_jobs=1)

importances = pd.Series(result.importances_mean, index=X_encoded.columns)

# --- Step 7: Seleziona le feature pi√π importanti ---
selected_features = importances[importances > IMPORTANCE_THRESHOLD].index.tolist()
print(f"‚úÖ Feature selezionate ({len(selected_features)}):\n{selected_features}")

# --- Step 8: Crea nuovo DataFrame con le feature selezionate e salva ---
X_selected = X[selected_features].copy()  # usa X originale per mantenere i valori categorici originali
X_selected[TARGET_COL] = y
X_selected.to_csv(output_file, index=False)

# --- Step 9: Salva la lista delle feature selezionate ---
with open(feature_list_file, "w") as f:
    for feat in selected_features:
        f.write(f"{feat}\n")

print(f"\nüìÅ Dataset salvato in: {output_file}")
print(f"üìù Lista feature salvata in: {feature_list_file}")


‚úÖ Feature selezionate (12):
['Job Title', 'Industry', 'Job Status', 'AI Impact Level', 'Required Education', 'Experience Required (Years)', 'Job Openings (2024)', 'Projected Openings (2030)', 'Remote Work Ratio (%)', 'Automation Risk (%)', 'Location', 'Gender Diversity (%)']

üìÅ Dataset salvato in: e:\Download-E\PROGETTO PYTHON PERSONALE\data\training_MS.csv
üìù Lista feature salvata in: e:\Download-E\PROGETTO PYTHON PERSONALE\data\feature_selected_rf.txt


In [5]:
import os
import pandas as pd
import sys
sys.path.append(os.path.abspath('..'))
from src import config  # config.RAW_DATA_PATH deve essere definito
from src.config import RAW_DATA_PATH


def create_validation_ms(target_col: str):
    """
    Crea un file CSV di validation con le sole feature selezionate dalla model selection.

    Parametri
    ----------
    target_col : str
        Il nome della colonna target che deve essere inclusa nel file di output.

    Note
    -----
    - Si aspetta che `feature_selected.txt` contenga un nome di colonna per riga.
    - Legge `dati_validation.csv` da config.RAW_DATA_PATH.
    - Scrive `validation_MS.csv` nella stessa cartella.
    """
    raw_dir = config.RAW_DATA_PATH
    feature_file = os.path.join(raw_dir, "feature_selected_rf.txt")
    validation_file = os.path.join(raw_dir, "dati_validation.csv")
    output_file = os.path.join(raw_dir, "validation_MS.csv")

    # Leggi le feature selezionate (una per riga)
    with open(feature_file, "r") as f:
        selected_features = [line.strip() for line in f if line.strip()]

    # Assicurati che la colonna target sia presente nell'elenco finale
    # (in caso non sia stata selezionata dalla model selection)
    columns_to_keep = selected_features + [target_col] if target_col not in selected_features else selected_features

    # Carica il file di validazione originale
    df_val = pd.read_csv(validation_file)

    # Filtra solo le colonne selezionate
    df_val_ms = df_val[columns_to_keep]

    # Salva il nuovo file CSV
    df_val_ms.to_csv(output_file, index=False)

    return df_val_ms

# Sostituisci con il nome corretto della tua colonna target
TARGET_COL = "Median Salary (USD)"

create_validation_ms(TARGET_COL)


Unnamed: 0,Job Title,Industry,Job Status,AI Impact Level,Required Education,Experience Required (Years),Job Openings (2024),Projected Openings (2030),Remote Work Ratio (%),Automation Risk (%),Location,Gender Diversity (%),Median Salary (USD)
0,Research scientist (life sciences),Finance,Decreasing,Moderate,Master‚Äôs Degree,0,7301,8786,67.12,30.65,Canada,30.01,121278.59
1,TEFL teacher,Entertainment,Increasing,Moderate,High School,9,500,7528,47.11,74.05,USA,58.02,118944.32
2,"Editor, magazine features",Entertainment,Increasing,Low,Master‚Äôs Degree,15,4897,6962,14.18,47.36,India,34.39,95884.27
3,"Journalist, magazine",Entertainment,Increasing,Low,Bachelor‚Äôs Degree,8,3068,2711,77.63,64.27,China,55.36,139808.82
4,"Engineer, land",Transportation,Increasing,Low,High School,15,3728,6498,34.16,79.77,Australia,25.64,51687.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,Barista,Education,Decreasing,Low,PhD,4,7575,6354,56.88,2.74,Australia,34.37,47459.63
5996,Chief Marketing Officer,Transportation,Increasing,Moderate,PhD,1,5637,7898,81.51,54.83,India,30.70,87128.65
5997,Consulting civil engineer,Retail,Decreasing,Moderate,Master‚Äôs Degree,0,9020,2019,72.48,41.68,China,54.94,131719.46
5998,"Engineer, structural",Entertainment,Decreasing,Moderate,PhD,19,8597,1731,47.85,37.84,India,75.44,124679.24
