In [1]:
import pandas as pd

In [2]:
df_original = pd.read_csv('data/cohorte_endo_unprocessed.csv')

In [3]:
import pandas as pd

def process_endometriosis_data(df):
    # Step 1: Translate column names
    column_translation = {
        "centre": "center",
        "sample_code": "sample_code",
        "sample_code_ua": "sample_code_ua",
        "collected_at": "collected_at",
        "cervical_cytology": "cervical_cytology",
        "uterine_aspirate": "uterine_aspirate",
        "date_processed": "date_processed",
        "date_collected": "date_collected",
        "age": "age",
        "phase": "phase",
        "actual_hormone_treatment": "actual_hormone_treatment",
        "symptoms": "symptoms",
        "other_diseases": "other_diseases",
        "study_group": "study_group",
        "subtype": "subtype"
    }
    df.rename(columns=column_translation, inplace=True)

    # Step 2: Translate key categorical values
    value_translations = {
        "phase": {
            "Menstrual": "menstrual",
            "Proliferativa": "proliferative",
            "Ovulatoria": "ovulatory",
            "Secretora": "secretory",
            None: "unknown"
        },
        "actual_hormone_treatment": {
            "1": "yes",
            "1?": "maybe_yes",
            "0": "no",
            "0?": "maybe_no",
            "NS": "unknown"
        },
        "study_group": {
            "Endometriosis": "endometriosis",
            "Control": "control",
            "Endometriosis?": "suspected_endometriosis",
            "Control?": "suspected_control",
            "Endometriosis*": "endometriosis_post_surgery",
            "Control*": "control_post_surgery",
            "Endometriosis-adenomiosis": "endometriosis_adenomyosis",
            "Endometriosis-adenomiosis*": "endometriosis_adenomyosis_post_surgery"
        },
        "collected_at": {
            "Consulta": "consultation",
            "Quirofano": "operating_room",
            "Quirófano": "operating_room"
        },
        "symptoms": {
            'Asintomatica': 'asymptomatic',
            'dolor': 'pain',
            'dolor/infertilidad': 'pain/infertility',
            'NS': 'unknown',
            'infertilidad': 'infertility',
            'NS/infertilidad': 'unknown/infertility'
        }
    }

    for col, translations in value_translations.items():
        df[col] = df[col].map(translations).fillna(df[col])

    # Step 3: Process hormone_treatment column
    def process_hormone_treatment(df, column_name):
        df["hormone_treatment"] = df[column_name].apply(lambda x: 1 if x in ["yes", "maybe_yes"] else (0 if x in ["no", "maybe_no"] else None))
        df["hormone_treatment_maybe"] = df[column_name].apply(lambda x: 1 if "maybe" in str(x).lower() else (0 if x in ["yes", "no"] else None))
        df.drop(columns=[column_name], inplace=True)
        return df

    df = process_hormone_treatment(df, "actual_hormone_treatment")
    
    # Step 4: Process study_group column
    def process_study_group(df, column_name):
        df["endometriosis"] = df[column_name].apply(lambda x: 1 if "endometriosis" in str(x).lower() else (0 if "control" in str(x).lower() else None))
        df["post_surgery"] = df[column_name].apply(lambda x: 1 if "post_surgery" in str(x).lower() else 0)
        df["suspected"] = df[column_name].apply(lambda x: 1 if "suspected" in str(x).lower() else 0)
        df["adenomyosis"] = df[column_name].apply(lambda x: 1 if "adenomyosis" in str(x).lower() else 0)
        df.drop(columns=[column_name], inplace=True)
        return df

    df = process_study_group(df, "study_group")
    
    # Step 5: Convert symptoms into individual binary columns
    def process_symptoms(df, column_name):
        # Define the unique symptom categories
        symptom_categories = ["asymptomatic", "pain", "infertility", "unknown"]
        
        # Ensure all symptoms are treated as strings for consistency
        df[column_name] = df[column_name].fillna("unknown").astype(str)
        
        # Create binary columns for each symptom
        for symptom in symptom_categories:
            df[f"symptom_{symptom}"] = df[column_name].apply(
                lambda x: 1 if symptom in x.split("/") else 0
            )
        
        # Drop the original symptoms column
        df.drop(columns=[column_name], inplace=True)
        return df

    df = process_symptoms(df, "symptoms")

    # Step 6: Convert subtype into descriptive binary columns
    subtype_translation = {
        "1": "normal",
        "2": "ovarian",
        "3": "deep",
        "4": "extragenital",
        "5": "focal_adenomyosis",
        "6": "diffuse_adenomyosis",
        "7": "myoma",
        "8": "adnexal_pathology",
        "9": "polyp",
        "10": "hydro_hematosalpinx",
        "unknown": "unknown",
        "NS": "unknown"
    }

    # Ensure all subtypes are treated as strings for consistency
    df["subtype"] = df["subtype"].fillna("unknown").astype(str)
    for subtype_code, subtype_name in subtype_translation.items():
        # Create a binary column for each subtype
        df[f"subtype_{subtype_name}"] = df["subtype"].apply(
            lambda x: 1 if subtype_code in x.split("/") else 0
        )

    df.drop(columns=["subtype"], inplace=True)

    # Step 7: Format column names
    df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("/", "_")

    return df


In [4]:
df_processed = process_endometriosis_data(df_original)

In [5]:
df_processed.columns

Index(['center', 'sample_code', 'sample_code_ua', 'collected_at',
       'cervical_cytology', 'uterine_aspirate', 'date_processed',
       'date_collected', 'age', 'phase', 'other_diseases', 'hormone_treatment',
       'hormone_treatment_maybe', 'endometriosis', 'post_surgery', 'suspected',
       'adenomyosis', 'symptom_asymptomatic', 'symptom_pain',
       'symptom_infertility', 'symptom_unknown', 'subtype_normal',
       'subtype_ovarian', 'subtype_deep', 'subtype_extragenital',
       'subtype_focal_adenomyosis', 'subtype_diffuse_adenomyosis',
       'subtype_myoma', 'subtype_adnexal_pathology', 'subtype_polyp',
       'subtype_hydro_hematosalpinx', 'subtype_unknown'],
      dtype='object')

In [6]:
df_processed

Unnamed: 0,center,sample_code,sample_code_ua,collected_at,cervical_cytology,uterine_aspirate,date_processed,date_collected,age,phase,...,subtype_ovarian,subtype_deep,subtype_extragenital,subtype_focal_adenomyosis,subtype_diffuse_adenomyosis,subtype_myoma,subtype_adnexal_pathology,subtype_polyp,subtype_hydro_hematosalpinx,subtype_unknown
0,Vall Hebron,E01-CS-01-001,,consultation,1,0,1/19/2024,1/19/2024,45.0,secretory,...,0,0,0,0,0,0,0,0,0,1
1,Vall Hebron,E01-CS-01-012,,consultation,1,0,2/2/2024,2/2/2024,28.0,secretory,...,1,0,0,0,0,0,0,0,0,0
2,Vall Hebron,E01-CS-01-013,,consultation,1,0,2/2/2024,2/2/2024,32.0,secretory,...,0,0,0,1,0,0,0,0,0,0
3,Vall Hebron,E01-CS-01-014,,consultation,1,0,2/2/2024,2/2/2024,28.0,secretory,...,0,0,0,0,1,0,0,0,0,0
4,Vall Hebron,E01-CS-01-015,,consultation,1,0,2/5/2024,2/5/2024,45.0,proliferative,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,Vall Hebron,,283,operating_room,0,1,1/20/2022,1/20/2022,34.0,,...,0,0,0,0,0,0,0,0,0,1
491,Vall Hebron,,284,operating_room,0,1,1/21/2022,1/21/2022,37.0,,...,0,0,0,0,0,0,0,0,0,1
492,Vall Hebron,,285,operating_room,0,1,2/8/2022,2/8/2022,43.0,,...,0,0,0,0,0,0,0,0,0,1
493,Vall Hebron,,294,operating_room,0,1,4/28/2022,4/28/2022,31.0,,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Save df_processed to a new CSV file in the data folder
df_processed.to_csv('data/cohorte_endo_processed.csv', index=False)