## 1. Extract the birth date of the patients from patient_dimension.csv

In [None]:
import pandas as pd
import pickle

# Load the patient dimension table; the BIRTH_DATE column is read from it further down.
PATIENT_DIMENSION_CSV = "/data/datasets/Tianchen/data_from_old_server/2021/ADRD_data_from_Xi/i2b2/patient_dimension.csv"
df_patient_dimension = pd.read_csv(PATIENT_DIMENSION_CSV)

In [None]:
# Preview the first rows. `.head` must be called: printing the attribute itself
# shows the bound method wrapped around the repr of the whole frame.
print(df_patient_dimension.head())

## 2. Calculate the age of the patients at each encounter using the START_DATE column of DIAGNOSES.csv

In [None]:
# Load the diagnosis observations; START_DATE, CONCEPT_CD and the patient/encounter
# IDs are read from this table in the cells below.
DIAGNOSES_CSV = "/data/datasets/Tianchen/data_from_old_server/2021/ADRD_data_from_Xi/process_observation/process_observation/DIAGNOSES.csv"
DIAGNOSES = pd.read_csv(DIAGNOSES_CSV)

In [None]:
# Preview the first rows. `.head` must be called: printing the attribute itself
# shows the bound method wrapped around the repr of the whole frame.
print(DIAGNOSES.head())

In [None]:

# Attach each patient's BIRTH_DATE to every diagnosis row by joining on 'deid_pat_ID'.
# This merge has to run: it is the only place `merged_df` is created, so leaving it
# commented out makes the cell fail with NameError on a fresh kernel.
# The default inner join drops diagnosis rows whose patient has no row in
# df_patient_dimension.
merged_df = pd.merge(
    DIAGNOSES,
    df_patient_dimension[['deid_pat_ID', 'BIRTH_DATE']],
    on='deid_pat_ID',
)

# Parse both date columns so they can be subtracted.
merged_df['START_DATE'] = pd.to_datetime(merged_df['START_DATE'])
merged_df['BIRTH_DATE'] = pd.to_datetime(merged_df['BIRTH_DATE'])

# Age in whole years, approximated as elapsed days // 365 (leap days are ignored,
# so the value can tick over a few days before the actual birthday).
merged_df['age_at_encounter'] = (merged_df['START_DATE'] - merged_df['BIRTH_DATE']).dt.days // 365


In [None]:
# Preview the first rows. `.head` must be called: printing the attribute itself
# shows the bound method wrapped around the repr of the whole frame.
print(merged_df.head())

In [None]:
# Columns carried forward: patient ID, encounter ID, diagnosis code and age.
selected_columns = ['deid_pat_ID', 'deid_enc_ID', "CONCEPT_CD", 'age_at_encounter']

# Narrow the merged frame down to those columns.
merge_df_selected = merged_df.loc[:, selected_columns]

# Order rows by patient first, then by age at the encounter within each patient.
sort_keys = ['deid_pat_ID', 'age_at_encounter']
merge_df_selected_sorted = merge_df_selected.sort_values(by=sort_keys)

In [None]:
# Inspect the row index of the sorted frame (the sort above does not reset it).
print(merge_df_selected_sorted.index)

In [None]:

# Collapse to one row per (patient, encounter) pair; the first row of each pair
# in the sorted order is the one that is kept.
encounter_key = ['deid_pat_ID', 'deid_enc_ID']
merge_df_age = merge_df_selected_sorted.drop_duplicates(subset=encounter_key, keep='first')
print(merge_df_age.index)

In [None]:
# Build the age vector of each patient.
# Group the de-duplicated encounter rows by 'deid_pat_ID', collect every
# 'age_at_encounter' of the patient into a list, and expose it as 'age_vector'.
patient_age_df = (
    merge_df_age
    .groupby('deid_pat_ID')['age_at_encounter']
    .agg(list)
    .reset_index()
    .rename(columns={'age_at_encounter': 'age_vector'})
)

# Preview the result. `.head` must be called: printing the attribute itself shows
# the bound method wrapped around the repr of the whole frame.
print(patient_age_df.head())

## 3. Aggregate the diagnosis codes that belong to the same encounter (identified by deid_enc_ID) in DIAGNOSES.csv

In [None]:
# Build one token sequence per patient from the sorted diagnosis rows:
#   [CLS, <codes of encounter 1>, SEP, <codes of encounter 2>, SEP, ...]
# 'CLS' opens the sequence exactly once and 'SEP' closes every encounter, so 'SEP'
# separates encounters rather than codes inside one encounter. Seeding each
# encounter with ['CLS', code, 'SEP'] and appending afterwards would put later
# codes of the same encounter behind its 'SEP'.
# TODO: drop non-ICD codes before aggregating; how to recognise an ICD code in
# CONCEPT_CD is not defined in this notebook.

# deid_pat_ID -> {deid_enc_ID -> [codes]}. Dicts keep insertion order, so patients
# and encounters stay in the row order of merge_df_selected_sorted.
pid_diagnosis_dict = {}
for pid, encounter_id, code in zip(
    merge_df_selected_sorted['deid_pat_ID'],
    merge_df_selected_sorted['deid_enc_ID'],
    merge_df_selected_sorted['CONCEPT_CD'],
):
    pid_diagnosis_dict.setdefault(pid, {}).setdefault(encounter_id, []).append(code)

# One output row per patient: concatenate the encounter vectors, closing each with 'SEP'.
new_rows = []
for pid, encounters in pid_diagnosis_dict.items():
    patient_tokens = ['CLS']
    for codes in encounters.values():
        patient_tokens.extend(codes)
        patient_tokens.append('SEP')
    new_rows.append({'pid': pid, 'code': patient_tokens})

# Naming the columns explicitly keeps 'pid' and 'code' present even when there are no rows.
patient_code_df = pd.DataFrame(new_rows, columns=['pid', 'code'])

# Show the index of the new frame as a quick size check.
print(patient_code_df.index)

In [None]:
# Rename the aggregation output so its key column matches the ID column used by
# the other frames ('deid_pat_ID') and the code column has a descriptive name.
column_mapping = dict(pid='deid_pat_ID', code='diagnosis_code')
patient_code_df = patient_code_df.rename(columns=column_mapping)


In [None]:
# Join each patient's age_vector to that patient's diagnosis-code rows.
# patient_age_df has exactly one row per deid_pat_ID, so no row of patient_code_df
# is duplicated. merge_df_age must not be used here: it has one row per encounter,
# and joining it on deid_pat_ID alone pairs every encounter row of a patient with
# every matching row of patient_code_df.
final_merged_df = pd.merge(patient_age_df, patient_code_df, on='deid_pat_ID')


In [None]:
# Persist the merged result as CSV. `index=False` is not passed, so the row index
# is written as the first column.
# NOTE(review): list-valued columns are stored as text in a CSV — confirm the
# downstream reader parses them back into lists.
final_merged_df.to_csv("/data/datasets/leyang/merged_age_diagnosis.csv")

In [None]:
# Preview the first 10 rows of the merged result.
print(final_merged_df.head(10))


You will need to remove codes that are not ICD codes.
You will need to combine the diagnosis codes from the same encounter, identified by deid_enc_ID.
Each row will contain all the encounters of a single patient.
'SEP' is used to separate diagnosis codes from different encounters, not codes within the same encounter.

In [None]:
# Check how many encounters appear two or more times. For patient p and encounter j, the number of diagnoses is m_j_p.

In [None]:
# Each encounter_id may have one or more diagnosis codes. For each encounter_id, denote its diagnosis codes {d_1, d_2, ..., d_{m_j_p}} and create a vector v_{j_p} = {d_1, d_2, ..., d_{m_j_p}}.
# Each pid has multiple encounter_ids. For each pid, aggregate all the diagnosis code vectors v_{j_p} of its encounter_ids and separate them with 'SEP'. You will also need to place 'CLS' at the beginning of the aggregated vector and a 'SEP' at its end.
# For example, if a pid has two encounter_ids, with 2 diagnoses for the first encounter_id {d_1 = 3, d_2 = 5} and 3 diagnoses for the second encounter_id {d_1 = 1, d_2 = 4, d_3 = 5}, then the aggregated vector for this pid is {CLS, 3, 5, SEP, 1, 4, 5, SEP}.
# Create a new DataFrame with one row per pid, holding the pid and its aggregated code vector, as in the example below.

In [None]:
   pid                      code
0    1  [CLS, 1, 2, SEP, 3, SEP]
1    2     [CLS, 4, SEP, 5, SEP]

In [7]:
import pandas as pd

# Toy input: patient 1 has encounter 101 (two codes) and encounter 102 (one code);
# patient 2 has encounters 201 and 202 with one code each.
data = {
    'pid': [1, 1, 1, 2, 2],
    'encounter_id': [101, 101, 102, 201, 202],
    'diagnosis_code': ['1', '2', '3', '4', '5'],
}
df = pd.DataFrame(data)

# pid -> {encounter_id -> [diagnosis codes]}, filled in row order.
pid_diagnosis_dict = {}
for patient, visit, dx in zip(df['pid'], df['encounter_id'], df['diagnosis_code']):
    visits_of_patient = pid_diagnosis_dict.setdefault(patient, {})
    visits_of_patient.setdefault(visit, []).append(dx)

# One row per patient: 'CLS', then every encounter's codes followed by a closing 'SEP'.
new_rows = [
    {
        'pid': patient,
        'code': ['CLS'] + [
            token
            for visit_codes in visits_of_patient.values()
            for token in visit_codes + ['SEP']
        ],
    }
    for patient, visits_of_patient in pid_diagnosis_dict.items()
]

# Assemble the per-patient frame and show it.
patient_code_df = pd.DataFrame(new_rows)
print(patient_code_df)


   pid                      code
0    1  [CLS, 1, 2, SEP, 3, SEP]
1    2     [CLS, 4, SEP, 5, SEP]
