In [5]:
import pandas as pd

def load_and_rename_data(csv_path, code_to_value):
    """
    Load a CSV file and rename its columns using a code-to-value mapping.

    Parameters
    ----------
    csv_path : str, path-like or file-like
        Source handed directly to ``pd.read_csv``.
    code_to_value : dict or callable
        Mapping from current column labels (codes) to their replacement
        labels. Columns whose label is not in the mapping keep their name.

    Returns
    -------
    pandas.DataFrame
        The loaded data with the renamed columns.
    """
    df = pd.read_csv(csv_path)
    # Rename with the mapping that was actually passed in. The previous body
    # referenced `code_to_name`, which is not defined in this function, so
    # every call failed with NameError.
    return df.rename(columns=code_to_value)


In [6]:
def merge_dataframes(df1, df2, on_columns, suffixes=('', '_df3')):
    """
    Left-merge two DataFrames on the specified columns.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame; every one of its rows is kept (``how='left'``).
    df2 : pandas.DataFrame
        Right frame whose matching rows are joined onto ``df1``.
    on_columns : str or list of str
        Column name(s) present in both frames to join on.
    suffixes : tuple of (str, str), optional
        Suffixes appended to overlapping non-key column names from the left
        and right frame respectively. Defaults to ``('', '_df3')``, the pair
        used by the pipelines in this notebook.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    # Forward the caller's suffixes. The previous body ignored the argument and
    # passed the bare string '_df3', which is not a (left, right) pair.
    merged_df = pd.merge(df1, df2, on=on_columns, how='left', suffixes=suffixes)
    return merged_df

In [7]:
def discard_columns_with_missing_values(df, threshold=50):
    """
    Remove, in place, every column whose share of missing values exceeds
    ``threshold`` percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place.
    threshold : float, optional
        Percentage of missing values above which a column is removed.
        A column exactly at the threshold is kept. Defaults to 50.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after pruning.
    """
    # Per-column percentage of null entries.
    null_share = df.isna().mean().mul(100)

    # Labels of the columns that are strictly above the cut-off.
    too_sparse = null_share.index[(null_share > threshold).to_numpy()]

    df.drop(columns=too_sparse, inplace=True)
    return df

In [8]:
def perform_dynamic_correlation(df, source_columns):
    """
    Annotate where each source column disagrees with its '_df3' counterpart.

    For every name in ``source_columns`` for which both ``<name>`` and
    ``<name>_df3`` are columns of ``df``, a ``<name>_status`` column is added:

    * rows where the '_df3' value is present and differs from the source
      value hold the string ``"<source> (<_df3>)"``;
    * every other row holds the source value unchanged.

    When no row of a pair differs, the redundant ``<name>_df3`` column is
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame containing source columns and their '_df3' counterparts.
        It is modified in place.
    source_columns : iterable of str
        Candidate source column names. Names without a complete
        (source, '_df3') pair in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in source_columns:
        counterpart = f'{column}_df3'

        # Both halves of the pair are required. Checking only the _df3 column
        # raised KeyError when the source column had already been removed
        # (e.g. by the missing-value filter) while its _df3 copy survived.
        if column not in df.columns or counterpart not in df.columns:
            continue

        source_values = df[column]
        counterpart_values = df[counterpart]

        # A row needs annotating only when the _df3 value is present AND
        # differs. The notna() guard matters because NaN != NaN is True:
        # without it, columns that are identical apart from shared missing
        # values were reported as different and their _df3 copy was never
        # dropped.
        mask = counterpart_values.notna() & (source_values != counterpart_values)

        if mask.any():
            # Walk the two columns side by side instead of building a full row
            # Series per record with DataFrame.apply(axis=1); values are
            # formatted from their own column, not from an upcast row.
            annotated = [
                f"{left} ({right})" if differs else left
                for left, right, differs in zip(source_values, counterpart_values, mask)
            ]
            df[f'{column}_status'] = pd.Series(annotated, index=df.index, dtype=object)
        else:
            # Nothing differs: the status is just the source column and the
            # _df3 copy carries no extra information.
            df[f'{column}_status'] = source_values
            df.drop(columns=[counterpart], inplace=True)
    return df

In [13]:
def stroke_risk():
    """
    Build the stroke-risk comparison table, save it and print it.

    Reads the code dictionary and the two patient tables from
    './synthetic-data/stroke-risk/', renames coded columns to their names,
    left-merges the converted table onto the selected table on
    ('ptnum', 'label'), removes columns with more than 50% missing values,
    passes the result through perform_dynamic_correlation, writes it to
    'stroke_data.csv' and prints it.
    """
    # Inputs: code dictionary, selected table, converted table.
    codes = pd.read_csv('./synthetic-data/stroke-risk/synthea-stroke-dataset-codes.csv', low_memory=False)
    selected = pd.read_csv('./synthetic-data/stroke-risk/synthea-pt30k-stroke-ml-table-sel.csv', low_memory=False)
    converted = pd.read_csv('./synthetic-data/stroke-risk/synthea-pt30k-stroke-ml-table-sel-convert.csv', low_memory=False)

    # code -> name lookup used to relabel the columns of both tables.
    column_names = dict(zip(codes['code'], codes['name']))
    selected = selected.rename(columns=column_names)
    converted = converted.rename(columns=column_names)

    # Overlapping columns from the converted table get the '_df3' suffix.
    combined = selected.merge(
        converted,
        on=['ptnum', 'label'],
        how='left',
        suffixes=('', '_df3'),
    ).copy()

    combined = discard_columns_with_missing_values(combined, threshold=50)
    combined = perform_dynamic_correlation(combined, source_columns=selected.columns)

    combined.to_csv('stroke_data.csv', index=False)

    # Printed for visual verification.
    print(combined)

In [14]:
# Run the stroke-risk pipeline: it writes 'stroke_data.csv' and prints the resulting DataFrame.
stroke_risk()

        ptnum  label  scc      race marital       ethnic gender  \
0       p3686      1  162     white       m  nonhispanic      m   
1       p4204      1  155     asian       m  nonhispanic      m   
2      p14352      1  147     white       m  nonhispanic      f   
3      p28589      1  147     white       m  nonhispanic      m   
4      p28075      1  145     asian       m  nonhispanic      m   
...       ...    ...  ...       ...     ...          ...    ...   
16050   p9840      0   70     asian       m  nonhispanic      f   
16051   p9866      0   70     white       m  nonhispanic      f   
16052   p9892      0   70     white       m  nonhispanic      f   
16053   p9912      0   70     white       m  nonhispanic      m   
16054   p9965      0   70  hawaiian       m  nonhispanic      m   

               state   age Full-time employment (finding)  ...  \
0      massachusetts  95.0                           True  ...   
1      massachusetts  71.0                           True  ... 

In [17]:
def lung_cancer_risk():
    """
    Build the lung-cancer-risk comparison table, save it and print it.

    Reads the code dictionary and the two patient tables from
    './synthetic-data/lung-cancer-risk/', renames coded columns to their
    names, left-merges the converted table onto the selected table on
    ('ptnum', 'label'), removes columns with more than 50% missing values,
    passes the result through perform_dynamic_correlation, writes it to
    'lung_cancer_data.csv' and prints it.
    """
    # Inputs: code dictionary, selected table, converted table.
    codes = pd.read_csv('./synthetic-data/lung-cancer-risk/synthea-lc-dataset-codes.csv', low_memory=False)
    selected = pd.read_csv('./synthetic-data/lung-cancer-risk/synthea-pt30k4-lc-data-sel.csv', low_memory=False)
    converted = pd.read_csv('./synthetic-data/lung-cancer-risk/synthea-pt30k4-lc-data-sel-convert.csv', low_memory=False)

    # code -> name lookup used to relabel the columns of both tables.
    column_names = dict(zip(codes['code'], codes['name']))
    selected = selected.rename(columns=column_names)
    converted = converted.rename(columns=column_names)

    # Overlapping columns from the converted table get the '_df3' suffix.
    combined = selected.merge(
        converted,
        on=['ptnum', 'label'],
        how='left',
        suffixes=('', '_df3'),
    ).copy()

    combined = discard_columns_with_missing_values(combined, threshold=50)
    combined = perform_dynamic_correlation(combined, source_columns=selected.columns)

    combined.to_csv('lung_cancer_data.csv', index=False)

    # Printed for visual verification.
    print(combined)

In [18]:
# Run the lung-cancer-risk pipeline: it writes 'lung_cancer_data.csv' and prints the resulting DataFrame.
lung_cancer_risk()

       ptnum  label  scc   race marital       ethnic gender          state  \
0      p3818      1  165  white       m  nonhispanic      m  massachusetts   
1     p23282      1  146  white       m  nonhispanic      m  massachusetts   
2     p23869      1  144  white       m  nonhispanic      m  massachusetts   
3      p9666      1  135  white       m  nonhispanic      m  massachusetts   
4     p24930      1  133  white       m  nonhispanic      m  massachusetts   
...      ...    ...  ...    ...     ...          ...    ...            ...   
4379   p9627      0  100  white       s  nonhispanic      m  massachusetts   
4380   p9759      0  100  white       m  nonhispanic      f  massachusetts   
4381   p9800      0  100  white       m  nonhispanic      f  massachusetts   
4382   p9901      0  100  white       m     hispanic      f  massachusetts   
4383   p9912      0  100  white       m  nonhispanic      f  massachusetts   

       age Influenza  seasonal  injectable  preservative free  