In [4]:
# --- 1. Import delle librerie e setup ---

import os
import sys
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split

# Assumi che il file config.py sia in src/
sys.path.append(os.path.abspath('..'))
from src import config  # config.RAW_DATA_PATH deve essere definito

# --- 2. Funzioni di preprocessing ---

def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with missing values imputed column by column.

    Floating-point columns (any width, not only ``float64``) are filled with
    the column mean; every other column is filled with its most frequent
    value. A column that holds no observed value at all is left unchanged,
    because there is nothing to derive a fill value from.

    Args:
        df: Input frame. It is not modified.

    Returns:
        A new DataFrame with the same columns as *df*.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        column = df_copy[col]
        if not column.isnull().any():
            continue

        # NumPy integer columns cannot hold NaN, so among plain numeric
        # dtypes only floating-point columns can reach this point.
        if pd.api.types.is_float_dtype(column):
            fill_value = column.mean()
            if pd.isna(fill_value):
                # Entirely missing column: the mean is undefined.
                continue
        else:
            modes = column.mode()
            if modes.empty:
                # Entirely missing column: there is no mode to pick.
                continue
            fill_value = modes.iloc[0]

        df_copy[col] = column.fillna(fill_value)
    return df_copy

def remove_multicollinearity(df: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    """Drop numeric columns that are strongly correlated with an earlier one.

    The absolute correlation matrix of the numeric columns is computed and
    only the entries strictly above the diagonal are kept, so each pair is
    looked at once. A column is dropped when its absolute correlation with
    any column preceding it exceeds *threshold*.

    Args:
        df: Input frame. It is not modified.
        threshold: Absolute-correlation limit above which a column is dropped.

    Returns:
        A new DataFrame without the dropped columns; non-numeric columns are
        always kept.
    """
    result = df.copy()
    abs_corr = result.select_dtypes(include=[np.number]).corr().abs()

    # Boolean mask selecting the strict upper triangle (k=1 skips the diagonal).
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    # Masked-out entries are NaN and never compare greater than the threshold.
    exceeds = (pairwise > threshold).any(axis=0)
    redundant = pairwise.columns[exceeds].tolist()
    return result.drop(columns=redundant)

def remove_low_variance_features(df: pd.DataFrame, threshold: float = 0.01) -> pd.DataFrame:
    """Drop numeric columns whose variance is below *threshold*.

    Args:
        df: Input frame. It is not modified.
        threshold: Columns with a variance strictly below this value are
            removed.

    Returns:
        A new DataFrame without the low-variance columns; non-numeric columns
        are always kept.
    """
    result = df.copy()
    per_column_variance = result.select_dtypes(include=[np.number]).var()
    # An undefined (NaN) variance never compares below the threshold, so such
    # columns are kept.
    near_constant = [
        name for name, value in per_column_variance.items() if value < threshold
    ]
    return result.drop(columns=near_constant)

# --- 3. Funzioni per split e salvataggio ---

def dividi_e_salva_train_validation(df: pd.DataFrame, target_col: str, test_size=0.2, random_state=42):
    """Split *df* into training and validation sets and write both to CSV.

    The features and the target are split together with ``train_test_split``;
    each part is then re-assembled into a single frame with the target as the
    last column and written, without the index, into ``config.RAW_DATA_PATH``
    as ``dati_training.csv`` and ``dati_validation.csv``. The directory is
    created if it does not exist.

    Args:
        df: Full dataset, including the target column.
        target_col: Name of the target column.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.
    """
    features = df.drop(columns=[target_col])
    labels = df[target_col]
    feat_train, feat_val, lab_train, lab_val = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    os.makedirs(config.RAW_DATA_PATH, exist_ok=True)

    written = []
    for filename, feats, labs in (
        ("dati_training.csv", feat_train, lab_train),
        ("dati_validation.csv", feat_val, lab_val),
    ):
        frame = feats.copy()
        frame[target_col] = labs
        destination = os.path.join(config.RAW_DATA_PATH, filename)
        frame.to_csv(destination, index=False)
        written.append(destination)

    train_path, val_path = written
    print(f"Dati salvati:\n- {train_path}\n- {val_path}")

def carica_dati_training_validation(target_col: str):
    """Load the saved training and validation CSVs and split off the target.

    Reads ``dati_training.csv`` and ``dati_validation.csv`` from
    ``config.RAW_DATA_PATH``.

    Args:
        target_col: Name of the target column present in both files.

    Returns:
        The tuple ``(X_train, X_val, y_train, y_val)``.
    """
    # Read both files before touching their columns.
    train_frame, val_frame = (
        pd.read_csv(os.path.join(config.RAW_DATA_PATH, filename))
        for filename in ("dati_training.csv", "dati_validation.csv")
    )

    X_train, y_train = train_frame.drop(columns=[target_col]), train_frame[target_col]
    X_val, y_val = val_frame.drop(columns=[target_col]), val_frame[target_col]
    return X_train, X_val, y_train, y_val

# --- 4. Esecuzione completa: carica, pulisci, split, salva ---

# Replace with the real path to your .csv file if needed
df = pd.read_csv("E:\\Download-E\\PROGETTO PYTHON PERSONALE\\data\\ai_job_trends_dataset.csv")

# Replace with the actual name of the target column
TARGET_COL = "Median Salary (USD)"

# Global preprocessing
# NOTE(review): imputation runs on the full dataset before the split, so
# validation rows influence the fill values — consider fitting the imputation
# on the training split only.
df = handle_missing_values(df)

# Set the target aside before feature filtering. Otherwise the correlation or
# variance filter could drop the target itself (the split below would then
# fail with a KeyError), or drop a feature only because it correlates with
# the target.
target = df[TARGET_COL]
features = df.drop(columns=[TARGET_COL])
features = remove_multicollinearity(features)
features = remove_low_variance_features(features)

# Re-attach the target as the last column.
df = features.copy()
df[TARGET_COL] = target

# Split and save
dividi_e_salva_train_validation(df, target_col=TARGET_COL)


Dati salvati:
- e:\Download-E\PROGETTO PYTHON PERSONALE\data\dati_training.csv
- e:\Download-E\PROGETTO PYTHON PERSONALE\data\dati_validation.csv
