In [91]:
# Import libraries
import pandas as pd

# Use functions from the tools module
from tools import recursive_cum_avg, calculate_bmi

# Subset data for relevant participants and intake data

In [102]:
# Blood and anthropometric follow-up subsets


def _follow_up_cutoffs(outcomes, measure_col, has_follow_up):
    """Select participants with repeated measurements of ``measure_col``.

    Parameters
    ----------
    outcomes : pd.DataFrame
        Outcomes table containing 'participant_id', 'ecid_cid_date' and
        ``measure_col``. It is not modified.
    measure_col : str
        Column that must be non-missing for a row to count as a measurement.
    has_follow_up : callable
        Receives the per-participant count of non-missing rows (a Series) and
        returns a boolean mask marking the participants to keep.

    Returns
    -------
    (list, dict)
        The selected participant ids, and a mapping from each selected id to
        the largest 'ecid_cid_date' among its rows with a non-missing
        ``measure_col``.
    """
    measured = outcomes.dropna(subset=[measure_col])
    rows_per_participant = measured['participant_id'].value_counts()
    selected_ids = rows_per_participant[has_follow_up(rows_per_participant)].index.tolist()

    # One grouped max replaces re-filtering the whole frame once per participant.
    last_measurement_date = measured.groupby('participant_id')['ecid_cid_date'].max()
    return selected_ids, last_measurement_date.loc[selected_ids].to_dict()


# Read the outcomes workbook once; both subsets are derived from the same rows.
outcomes_raw = pd.read_excel('../data/raw_data/outcomes_reformatted.xlsx')
all_participants = len(outcomes_raw['participant_id'].unique())

# Blood subset: participants with exactly two rows that have an LDL value.
# NOTE(review): this uses `== 2` while the anthro subset below uses `> 1`, so a
# participant with three or more LDL rows is excluded here - confirm that this
# asymmetry is intended.
participants_with_blood_follow_up, participants_with_blood_follow_up_dict = _follow_up_cutoffs(
    outcomes_raw, 'ecid_ldl', lambda n_rows: n_rows == 2)

print(all_participants, len(participants_with_blood_follow_up_dict), all_participants - len(participants_with_blood_follow_up_dict))

# Anthro subset: participants with more than one row that has a waist value.
participants_with_anthro_follow_up, participants_with_anthro_follow_up_dict = _follow_up_cutoffs(
    outcomes_raw, 'ecid_waist', lambda n_rows: n_rows > 1)

print(all_participants, len(participants_with_anthro_follow_up_dict), all_participants - len(participants_with_anthro_follow_up_dict))

1628 1138 490
1628 1410 218


# Module 1: Preprocess the Dietary Data

In [88]:
# Module 1: Preprocess the dietary data
# `blood` selects which follow-up cohort is processed: True uses the blood
# follow-up dictionary, False the anthropometric one. The later cells read the
# same flag.
blood = False

# Read in the nutritional intake data
df = pd.read_excel('../data/raw_data/Aggregated_by_IR_id_dailyintake.xlsx')

# Pick the cohort once instead of duplicating the filtering code per branch.
# The dictionary maps participant id -> last follow-up date (built in the
# subset cell above).
follow_up_cutoffs = (participants_with_blood_follow_up_dict if blood
                     else participants_with_anthro_follow_up_dict)

# Keep only participants in the selected cohort ...
df = df[df['Participant_ID'].isin(follow_up_cutoffs.keys())]

# ... and only intake rows dated on or before that participant's cut-off date.
# The cut-off dates are kept in a separate Series, so no temporary column has to
# be written onto (and dropped from) a filtered slice of the frame.
cutoff_dates = df['Participant_ID'].map(follow_up_cutoffs)
df = df[df['intake_response_date'] <= cutoff_dates]

# Identifier columns carried through the first aggregation unchanged
id_cols = {'Participant_ID': 'first', 'CID': 'first'}

# Columns to aggregate: everything that is not an identifier, skipping the first
# two remaining columns by position.
# NOTE(review): presumably those two are 'intake_response_id' and
# 'intake_response_date' - this depends on the column order of the workbook,
# so confirm against the file.
summed_cols = [col for col in df.columns if col not in id_cols][2:]

# Step 1: Sum up by 'intake_response_id'
# (not really necessary - there are only unique values)
aggregation_dict = {**id_cols, **dict.fromkeys(summed_cols, 'sum')}
aggregated_df = df.groupby(['intake_response_id']).agg(aggregation_dict).reset_index()

# Step 2: Average per participant and CID
aggregation_dict = dict.fromkeys(summed_cols, 'mean')
aggregated_df = aggregated_df.groupby(['Participant_ID','CID']).agg(aggregation_dict).reset_index()

# Step 3: Cumulative average per participant across CIDs.
# recursive_cum_avg comes from the tools module. It receives each column's
# values in the row order left by step 2, whose groupby sorts by
# Participant_ID and then CID.
# BEWARE: at times a single variable was not measured and entered as zero! For those we calculate the cum avg everywhere else!
aggregation_dict = dict.fromkeys(summed_cols, recursive_cum_avg)
aggregated_df = aggregated_df.groupby(['Participant_ID']).agg(aggregation_dict).reset_index()

# New variables: carbohydrate intake minus each sugar component.
sugar_components = [
    'intake_total_sugars',
    'intake_nmes',
    'intake_intrinsic_sugars',
    'intake_fructose',
    'intake_glucose',
]
for sugar_col in sugar_components:
    aggregated_df[f'intake_carbohydrate-{sugar_col}'] = aggregated_df['intake_carbohydrate'] - aggregated_df[sugar_col]

# Convert to megajoules (1 MJ = 1000 kJ)
aggregated_df['intake_energy_mj'] = aggregated_df['intake_energy_kj'] / 1000

# Module 2: Preprocess the Outcomes

In [89]:
# Read in the outcomes data
df2 = pd.read_excel('../data/raw_data/outcomes_reformatted.xlsx')

# Keep only participants in the cohort selected by `blood`. The explicit copy
# makes df2 an independent frame, so the column assignments below do not write
# onto a filtered slice (SettingWithCopyWarning).
cohort_cutoffs = (participants_with_blood_follow_up_dict if blood
                  else participants_with_anthro_follow_up_dict)
df2 = df2[df2['participant_id'].isin(cohort_cutoffs.keys())].copy()

# Merge each '<name>.1' column into its '<name>' counterpart.
# The '.1' value is used where present; the original value is kept only where
# '.1' is missing, so '.1' takes precedence.
# NOTE(review): the previous comment described the opposite direction (filling
# gaps in the original from '.1') - confirm which precedence is intended.
duplicate_cols = [col for col in df2.columns if col.endswith('.1')]
for col in duplicate_cols:
    original_col = col[:-2]
    df2[original_col] = df2[col].fillna(df2[original_col])

# List of needed columns for preprocessing of outcomes
needed_cols = [
    'participant_id',
    'ecid_cid_date',
    'CID',
    'centre_id',
    'study_arm_id',
    'has_withdrawn',
    'ecid_diabetes_1',
    'ecid_weight_recorded',
    'ecid_height_recorded',
    'ecid_ldl',
    'ecid_hdl',
    'ecid_trig',
    'ecid_hba1c_mmol_mol',
    'ecid_hba1c_percent',
    'cid_moissl_fat_mass',
    'cid_moissl_fat_mass_percent',
    'elig_age',
    'elig_gender',
    'ecid_waist']

# Select only the needed columns for the DataFrame
df2 = df2[needed_cols]

# Per-participant aggregation: 'first' or 'max' for the participant-level
# columns, recursive_cum_avg (from the tools module) for repeated measurements.
# NOTE(review): rows are aggregated in the order they appear in the workbook;
# if recursive_cum_avg is order-dependent, df2 should be sorted by visit
# (CID / ecid_cid_date) first - confirm against tools.recursive_cum_avg.
aggregation_dict = {
    'centre_id':'first',
    'study_arm_id':'first',
    'has_withdrawn':'first',
    'ecid_diabetes_1':'max',
    'ecid_weight_recorded':recursive_cum_avg,
    'ecid_height_recorded':'max',
    'ecid_ldl':recursive_cum_avg,
    'ecid_hdl':recursive_cum_avg,
    'ecid_trig':recursive_cum_avg,
    'ecid_hba1c_mmol_mol':recursive_cum_avg,
    'ecid_hba1c_percent':recursive_cum_avg,
    'cid_moissl_fat_mass':recursive_cum_avg,
    'cid_moissl_fat_mass_percent':recursive_cum_avg,
    'elig_age':'max',
    'elig_gender':'max',
    'ecid_waist':recursive_cum_avg
}

# Perform aggregation by group on 'participant_id' and reset the index
aggregated_df2 = df2.groupby(['participant_id']).agg(aggregation_dict).reset_index()

# Calculate BMI from the aggregated weight and height (calculate_bmi is from tools)
aggregated_df2['bmi'] = aggregated_df2.apply(lambda row: calculate_bmi(row['ecid_weight_recorded'], row['ecid_height_recorded']), axis=1)

# Zeros are treated as missing and replaced with NaN.
# NOTE(review): this applies to every column, including centre_id,
# study_arm_id, has_withdrawn, ecid_diabetes_1 and elig_gender. If 0 is a valid
# code in any of them (e.g. "not withdrawn" / "no diabetes"), those values
# become NaN as well - confirm, and restrict to the measurement columns if so.
aggregated_df2 = aggregated_df2.replace(0, float('nan'))

# Rename by explicit old -> new mapping rather than by column position, so a
# change in the aggregation order can never silently mislabel a column.
# Columns not listed here (centre_id, study_arm_id, has_withdrawn, bmi) keep
# their names.
column_renames = {
    'participant_id': 'Participant_ID',
    'ecid_diabetes_1': 'diabetes',
    'ecid_weight_recorded': 'weight',
    'ecid_height_recorded': 'height',
    'ecid_ldl': 'ldl',
    'ecid_hdl': 'hdl',
    'ecid_trig': 'trig',
    'ecid_hba1c_mmol_mol': 'hba1c',
    'ecid_hba1c_percent': 'hba1c_percent',
    'cid_moissl_fat_mass': 'bodyfat',
    'cid_moissl_fat_mass_percent': 'bodyfat_percent',
    'elig_age': 'age',
    'elig_gender': 'gender',
    'ecid_waist': 'waistcirumference',
}
aggregated_df2 = aggregated_df2.rename(columns=column_renames)


# Module 3: Combine the Two Dataframes

In [90]:
# Join the dietary and outcome aggregates on the participant id (pandas'
# default inner join: only participants present in both frames are kept).
merged_df = aggregated_df.merge(aggregated_df2, on='Participant_ID')

# The output file depends on which cohort was processed.
output_path = (
    '../data/processed_files/aggregrated_data_blood.xlsx'
    if blood
    else '../data/processed_files/aggregrated_data_anthro.xlsx'
)
merged_df.to_excel(output_path, index=False)