In [1]:
import pandas as pd
import re

# ======================================================
# 1. PATH CONFIGURATION
# ======================================================

# Directory holding every CSV of the pipeline. It is joined to the file names
# below by plain string concatenation, so the trailing slash is required.
BASE_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/"

# split name -> (input file name, output file name), both relative to BASE_PATH.
FILES = {
    split: (f"{split}_encoded.csv", f"{split}.csv")
    for split in ("train", "val", "test")
}

# ======================================================
# 2. TEXT COLUMNS
# ======================================================

# Free-text columns that are cleaned, mined for keywords and finally dropped.
TEXT_COLS = [
    "HISTORY",
    "CUR_ILL",
    "OTHER_MEDS",
    "ALLERGIES",
]

# ======================================================
# 3. TEXT CLEANING
# ======================================================

# Compiled once at import time: clean_text is applied to every cell of every
# text column, so the patterns are kept out of the per-cell call.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(s):
    """Normalise one free-text cell into lowercase, space-separated tokens.

    Args:
        s: The raw cell value. Missing values (``None``, ``NaN``, ``pd.NA``)
            are accepted, and so are non-string scalars such as numbers,
            which are converted with ``str()`` before cleaning.

    Returns:
        ``""`` for a missing value; otherwise the lowercased text with every
        character outside ``a-z``, ``0-9`` and whitespace replaced by a
        space, whitespace runs collapsed to a single space, and leading and
        trailing whitespace removed.
    """
    if pd.isna(s):
        return ""
    # str() guards against non-string cells (e.g. a numeric value in a mixed
    # column), which have no .lower() method.
    lowered = str(s).lower()
    spaced = _NON_ALNUM_RE.sub(" ", lowered)
    return _WHITESPACE_RE.sub(" ", spaced).strip()

# ======================================================
# 4. KEYWORD DICTIONARIES (FROZEN)
# ======================================================

# Keywords searched as substrings of the cleaned HISTORY text; each one
# becomes a 0/1 column named "history_<keyword>".
HISTORY_KEYS = [
    "diabetes", "hypertension", "asthma", "copd",
    "cancer",
    "cardiac", "heart", "coronary",
    "renal", "kidney",
    "liver", "hepatic",
    "autoimmune", "arthritis", "lupus",
    "neurologic", "stroke", "seizure",
    "obesity", "overweight",
]

# Keywords searched as substrings of the cleaned CUR_ILL text; each one
# becomes a 0/1 column named "curill_<keyword>".
CURILL_KEYS = [
    "covid", "infection", "fever", "flu",
    "respiratory", "pneumonia", "bronchitis",
    "viral", "bacterial",
    "sinus",
]

# Keywords searched as substrings of the cleaned OTHER_MEDS text; each one
# becomes a 0/1 column named "meds_<keyword>".
MEDS_KEYS = [
    "anticoagulant", "antiplatelet",
    "steroid", "immunosuppressant",
    "chemotherapy", "antibiotic",
    "antidepressant", "antipsychotic",
    "antihypertensive", "statin",
    "insulin",
]

# Keywords searched as substrings of the cleaned ALLERGIES text; each one
# becomes a 0/1 column named "allergy_<keyword>".
ALLERGY_KEYS = [
    "drug", "medication", "penicillin",
    "latex",
    "food", "shellfish", "egg", "peanut",
    "vaccine",
]

# ======================================================
# 5. FEATURE ENGINEERING FUNCTION
# ======================================================

def step1_feature_engineering(input_path, output_path):
    """Turn the free-text columns of one CSV into binary indicator columns.

    The CSV at ``input_path`` is loaded and every column in ``TEXT_COLS`` is
    normalised with ``clean_text``. Two families of 0/1 columns are then
    appended, in this order:

    * ``<prefix>_<keyword>`` for each keyword of ``HISTORY_KEYS``,
      ``CURILL_KEYS``, ``MEDS_KEYS`` and ``ALLERGY_KEYS``: 1 when the keyword
      occurs as a substring of the cleaned text of the matching column;
    * ``has_history``, ``has_cur_ill``, ``has_other_meds``, ``has_allergies``:
      1 when the cleaned text of the matching column is non-empty.

    The text columns are dropped afterwards and the result is written to
    ``output_path`` without the index. Progress is reported with ``print``.

    Args:
        input_path: Path of the CSV to read; it must contain every column
            listed in ``TEXT_COLS``.
        output_path: Path of the CSV to write.

    Returns:
        The transformed ``DataFrame`` (the same data that was written).
    """
    df = pd.read_csv(input_path)
    print(f"\nüì• Caricato {input_path} ‚Üí shape {df.shape}")

    # Normalise the raw text so keyword lookups see lowercase, punctuation-free
    # tokens and missing values become empty strings.
    for col in TEXT_COLS:
        df[col] = df[col].apply(clean_text)

    # (source text column, keywords to look for, prefix of the new columns)
    keyword_specs = (
        ('HISTORY', HISTORY_KEYS, 'history'),
        ('CUR_ILL', CURILL_KEYS, 'curill'),
        ('OTHER_MEDS', MEDS_KEYS, 'meds'),
        ('ALLERGIES', ALLERGY_KEYS, 'allergy'),
    )
    for col, keys, prefix in keyword_specs:
        for k in keys:
            # regex=False: keywords are literal substrings. The default would
            # interpret them as regular expressions, which silently changes
            # the match if a keyword ever contains a metacharacter.
            df[f"{prefix}_{k}"] = (
                df[col].str.contains(k, regex=False, na=False).astype(int)
            )

    # Presence flags: 1 when the cleaned text is non-empty.
    presence_specs = (
        ('has_history', 'HISTORY'),
        ('has_cur_ill', 'CUR_ILL'),
        ('has_other_meds', 'OTHER_MEDS'),
        ('has_allergies', 'ALLERGIES'),
    )
    for flag, col in presence_specs:
        df[flag] = (df[col] != "").astype(int)

    # The raw text is no longer needed once the indicators exist.
    df = df.drop(columns=TEXT_COLS)

    df.to_csv(output_path, index=False)
    print(f"üíæ Salvato {output_path} ‚Üí shape {df.shape}")

    return df

# ======================================================
# 6. APPLY TO TRAIN / VAL / TEST
# ======================================================

# Run the feature-engineering step on every split and keep the resulting
# DataFrames in memory, keyed by split name.
datasets = {}

for split_name, file_names in FILES.items():
    source_name, target_name = file_names
    datasets[split_name] = step1_feature_engineering(
        input_path=BASE_PATH + source_name,
        output_path=BASE_PATH + target_name,
    )

# ======================================================
# 7. FINAL FEATURE CONSISTENCY CHECK
# ======================================================

# Compare the column names of the three splits as sets, so column order is
# ignored; only the presence of each column matters.
cols_train, cols_val, cols_test = (
    set(datasets[name].columns) for name in ("train", "val", "test")
)

print("\nüîç Controllo coerenza colonne:")
print("Train vs Val :", cols_train == cols_val)
print("Train vs Test:", cols_train == cols_test)



üì• Caricato /Users/marcodonatiello/PycharmProjects/JupyterProject/data/processed/encoded/train_encoded.csv ‚Üí shape (536370, 18)
üíæ Salvato /Users/marcodonatiello/PycharmProjects/JupyterProject/data/train_step1_features.csv ‚Üí shape (536370, 68)

üì• Caricato /Users/marcodonatiello/PycharmProjects/JupyterProject/data/processed/encoded/val_encoded.csv ‚Üí shape (134093, 18)
üíæ Salvato /Users/marcodonatiello/PycharmProjects/JupyterProject/data/val_step1_features.csv ‚Üí shape (134093, 68)

üì• Caricato /Users/marcodonatiello/PycharmProjects/JupyterProject/data/processed/encoded/test_encoded.csv ‚Üí shape (167616, 18)
üíæ Salvato /Users/marcodonatiello/PycharmProjects/JupyterProject/data/test_step1_features.csv ‚Üí shape (167616, 68)

üîç Controllo coerenza colonne:
Train vs Val : True
Train vs Test: True
