In [5]:
import pandas as pd
import re
from tqdm import tqdm
import os

# ======================================================
# 1️⃣ PATH CORRETTI
# ======================================================

# Root directories: BASE_IN holds the input splits, BASE_OUT receives the
# processed ("step5") splits.
BASE_IN = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/npl"
BASE_OUT = "/Users/marcodonatiello/PycharmProjects/JupyterProject/dataset"

# Input CSV for each split, in train / val / test order.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    f"{BASE_IN}/{split}.csv" for split in ("train", "val", "test")
)

# Output CSV for each split, in the same order.
OUT_TRAIN, OUT_VAL, OUT_TEST = (
    f"{BASE_OUT}/{split}_step5.csv" for split in ("train", "val", "test")
)

# ======================================================
# 2️⃣ LOAD DATASET
# ======================================================

# Fail fast, before any CSV is parsed, if one of the input splits is missing.
# An explicit raise is used instead of `assert` because assert statements are
# removed when Python runs with -O, which would silently skip this check.
for p in [TRAIN_PATH, VAL_PATH, TEST_PATH]:
    if not os.path.exists(p):
        raise FileNotFoundError(f"‚ùå File non trovato: {p}")

# Load the three splits into memory.
train = pd.read_csv(TRAIN_PATH)
val = pd.read_csv(VAL_PATH)
test = pd.read_csv(TEST_PATH)

# Report (rows, columns) of each split as a quick sanity check.
print("Train:", train.shape)
print("Val:", val.shape)
print("Test:", test.shape)

# ======================================================
# 3️⃣ DROP VAERS_ID (CONSENTITO SU TUTTI)
# ======================================================

# Remove the VAERS_ID column from every split, in place.
# errors="ignore" makes the drop a no-op for a split that lacks the column.
for df in (train, val, test):
    df.drop(columns=["VAERS_ID"], inplace=True, errors="ignore")

# ======================================================
# 4️⃣ NLP / NEL — SOLO TRAIN E VAL
# ======================================================

# Compiled once at import time; clean_symptom_text is called once per row.
_DISALLOWED_CHARS = re.compile(r"[^a-z0-9\s;]")
_WHITESPACE_RUNS = re.compile(r"\s+")


def clean_symptom_text(text):
    """Normalise a raw symptom string for keyword matching.

    The value is lower-cased, every character other than ``a-z``, ``0-9``,
    whitespace and ``;`` is replaced by a space, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw cell value. Missing values (``None``, ``NaN``, or
            anything else ``pd.isna`` reports as missing) are accepted, as
            are non-string scalars.

    Returns:
        The cleaned string, or ``""`` when *text* is missing.
    """
    if pd.isna(text):
        return ""
    # A non-string cell (e.g. a number) has no .lower(); coerce it to text
    # instead of raising AttributeError.
    lowered = str(text).lower()
    stripped = _DISALLOWED_CHARS.sub(" ", lowered)
    return _WHITESPACE_RUNS.sub(" ", stripped).strip()

# Keyword lists per symptom group. Each keyword is matched literally as a
# whole word/phrase against the cleaned (lower-cased) symptom text.
# NOTE(review): because matching is literal, spelling variants not listed here
# (e.g. "dyspnea" vs "dyspnoea") are not counted — confirm the lists cover the
# spellings actually present in the data.
SYMPTOM_MAP = {
    "cardiac": [
        "chest pain", "myocarditis", "pericarditis",
        "tachycardia", "palpitations", "cardiac arrest", "arrhythmia",
    ],
    "respiratory": [
        "dyspnoea", "shortness of breath", "respiratory distress",
        "cough", "hypoxia", "pneumonia",
    ],
    "neurologic": [
        "headache", "seizure", "syncope", "loss of consciousness",
        "dizziness", "stroke", "paresthesia",
    ],
    "fever": [
        "fever", "pyrexia", "chills",
    ],
}

# Whole-word patterns built once from SYMPTOM_MAP, so extract_symptoms does
# not rebuild a pattern string for every keyword on every row. This snapshot
# is taken at definition time; later edits to SYMPTOM_MAP are not reflected.
_SYMPTOM_PATTERNS = {
    group: [re.compile(rf"\b{re.escape(kw)}\b") for kw in kws]
    for group, kws in SYMPTOM_MAP.items()
}


def extract_symptoms(text):
    """Build keyword-based symptom features for one cleaned text.

    For every group in ``SYMPTOM_MAP`` two entries are produced:
    ``symp_<group>`` (1 if at least one keyword of the group occurs, else 0)
    and ``num_symp_<group>`` (how many distinct keywords of the group occur;
    repeated occurrences of the same keyword count once). A final
    ``num_symp_total`` entry is the sum of all ``num_symp_<group>`` values.

    Args:
        text: Symptom text to search; keywords are lower-case, so the text
            is expected to be lower-cased already.

    Returns:
        A dict of feature name to int, with groups in ``SYMPTOM_MAP`` order
        and ``num_symp_total`` last.
    """
    feats = {}
    total = 0
    for group, patterns in _SYMPTOM_PATTERNS.items():
        count = sum(1 for pattern in patterns if pattern.search(text))
        feats[f"symp_{group}"] = int(count > 0)
        feats[f"num_symp_{group}"] = count
        total += count

    feats["num_symp_total"] = total
    return feats

def apply_nel(df):
    """Append keyword-based symptom feature columns to *df*.

    The ``LISTA_SINTOMI`` column is normalised with ``clean_symptom_text``,
    each cleaned string is passed to ``extract_symptoms`` (with a tqdm
    progress bar), and the resulting per-row dicts become new columns.

    Args:
        df: DataFrame containing a ``LISTA_SINTOMI`` column.

    Returns:
        A new DataFrame made of *df* with its index reset to a fresh
        positional index, followed by the feature columns.
    """
    normalised = df["LISTA_SINTOMI"].apply(clean_symptom_text)

    rows = [extract_symptoms(entry) for entry in tqdm(normalised, desc="NLP/NEL")]

    symptom_features = pd.DataFrame(rows)
    # The index is reset so both frames align row-by-row on position.
    base = df.reset_index(drop=True)
    return pd.concat([base, symptom_features], axis=1)

# Derive the keyword-based symptom features for the train and validation
# splits; each name is rebound to the frame returned by apply_nel.
# NOTE(review): `test` is not passed through apply_nel here, so it does not
# gain the symp_* / num_symp_* columns — confirm a later step handles test.
train = apply_nel(train)
val   = apply_nel(val)

print("‚úÖ NLP/NEL applicato a train e val")

# ======================================================
# 5️⃣ FEATURE CROSSING — SOLO TRAIN E VAL
# ======================================================
# 1) AGE_YRS × NUMERO_SINTOMI
# 2) has_history × NUMERO_SINTOMI
# 3) AGE_YRS × history_cardiac

def add_feature_crossing(df):
    """Add three pairwise-product interaction columns to *df* in place.

    Columns created, in this order:
        fc_age_x_num_symptoms     = AGE_YRS     * NUMERO_SINTOMI
        fc_history_x_num_symptoms = has_history * NUMERO_SINTOMI
        fc_age_x_history_cardiac  = AGE_YRS     * history_cardiac

    Args:
        df: DataFrame containing the ``AGE_YRS``, ``NUMERO_SINTOMI``,
            ``has_history`` and ``history_cardiac`` columns.

    Returns:
        The same DataFrame object that was passed in, now carrying the new
        columns.
    """
    crossings = (
        ("fc_age_x_num_symptoms", "AGE_YRS", "NUMERO_SINTOMI"),
        ("fc_history_x_num_symptoms", "has_history", "NUMERO_SINTOMI"),
        ("fc_age_x_history_cardiac", "AGE_YRS", "history_cardiac"),
    )
    for target, left, right in crossings:
        df[target] = df[left] * df[right]
    return df

# Add the interaction (product) columns to the train and validation splits.
# NOTE(review): `test` is not passed through add_feature_crossing here, so it
# does not gain the fc_* columns — confirm a later step handles test.
train = add_feature_crossing(train)
val   = add_feature_crossing(val)

print("‚úÖ Feature crossing aggiunte (train + val)")

# ======================================================
# 6️⃣ DROP LISTA_SINTOMI TESTUALE (TUTTI)
# ======================================================

# Remove the free-text LISTA_SINTOMI column from every split, in place.
# errors="ignore" makes the drop a no-op for a split that lacks the column.
for df in (train, val, test):
    df.drop(columns=["LISTA_SINTOMI"], inplace=True, errors="ignore")

print("‚úÖ LISTA_SINTOMI testuale rimossa da tutti")

# ======================================================
# 7️⃣ SALVATAGGIO
# ======================================================

# Write each split to its output CSV, without the DataFrame index.
# Underscore-prefixed loop names avoid clobbering other notebook globals.
for _frame, _dest in ((train, OUT_TRAIN), (val, OUT_VAL), (test, OUT_TEST)):
    _frame.to_csv(_dest, index=False)

# Report completion and the location of every written file.
print("\nüöÄ PIPELINE COMPLETATA CON SUCCESSO")
for _label, _dest in (
    ("Train ‚Üí", OUT_TRAIN),
    ("Val   ‚Üí", OUT_VAL),
    ("Test  ‚Üí", OUT_TEST),
):
    print(_label, _dest)


Train: (536370, 68)
Val: (134093, 68)
Test: (167616, 68)


NLP/NEL: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 536370/536370 [00:09<00:00, 59107.30it/s]
NLP/NEL: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 134093/134093 [00:02<00:00, 59364.71it/s]


‚úÖ NLP/NEL applicato a train e val
‚úÖ Feature crossing aggiunte (train + val)
‚úÖ LISTA_SINTOMI testuale rimossa da tutti

üöÄ PIPELINE COMPLETATA CON SUCCESSO
Train ‚Üí /Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step5.csv
Val   ‚Üí /Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step5.csv
Test  ‚Üí /Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/test_step5.csv
