In [1]:
import pandas as pd

def calculate_tertile_averages(file_path, columns_to_process, participant_column, output_folder):
    """
    Split rows into tertiles of each requested column and save per-tertile means to Excel.

    For every column in ``columns_to_process`` the rows (with missing values in
    that column removed) are assigned to tertile 1, 2 or 3 using the 0, 1/3, 2/3
    and 1 quantiles as bin edges. The mean of every numeric column is then
    computed within each tertile and written to
    ``{output_folder}/Tertiles_Averages_{column}.xlsx``.

    Parameters:
    - file_path (str): The path to the Excel file containing the data.
    - columns_to_process (list): Columns for which to calculate tertiles and averages.
    - participant_column (str): The column containing participant identifiers.
      Rows with a missing identifier are discarded; the column itself is never averaged.
    - output_folder (str): The folder path to save the output Excel files.

    Returns:
    None

    Raises:
    ValueError: If the quantile edges of a column are not distinct (for example
        too few distinct or non-missing values), so three tertiles cannot be formed.
    """
    # Excel rejects worksheet names longer than this many characters.
    max_sheet_name_length = 31

    # Load data from the specified Excel file
    data = pd.read_excel(file_path)

    # Remove leading/trailing whitespace from column names
    data.columns = data.columns.str.strip()

    # Drop rows without a participant identifier
    data = data.dropna(subset=[participant_column])

    for column_name in columns_to_process:
        tertile_column = column_name + '_tertile'

        # Work on an explicit copy so adding the tertile column never touches
        # (or warns about touching) the shared `data` frame.
        data_cleaned = data.dropna(subset=[column_name]).copy()

        # Bin edges: minimum, the two tertile cut points, maximum
        tertiles = data_cleaned[column_name].quantile([0, 1/3, 2/3, 1])
        if not tertiles.is_unique:
            raise ValueError(
                f"Cannot form three tertiles for column '{column_name}': "
                "quantile edges are not distinct (too few distinct or non-missing values)"
            )

        # include_lowest=True keeps the rows equal to the column minimum in
        # tertile 1; without it the left-open first bin excludes them and they
        # would be labelled NaN and dropped from the averages.
        data_cleaned[tertile_column] = pd.cut(
            data_cleaned[column_name],
            bins=tertiles.to_numpy(),
            labels=[1, 2, 3],
            include_lowest=True,
        )

        # Only numeric (and boolean) columns have a meaningful mean; text or
        # date columns would make the aggregation fail. The participant and
        # tertile columns are excluded as before.
        numeric_columns = data_cleaned.select_dtypes(include=['number', 'bool']).columns
        columns_to_average = [
            col for col in numeric_columns
            if col not in (participant_column, tertile_column)
        ]

        # One grouped mean over all columns; observed=False keeps a row for
        # each of the three tertiles regardless of the pandas default.
        averages_df = (
            data_cleaned
            .groupby(tertile_column, observed=False)[columns_to_average]
            .mean()
        )

        # Save the averages to a new Excel file in the specified output folder
        output_file_path = f"{output_folder}/Tertiles_Averages_{column_name}.xlsx"
        sheet_name = f'{column_name}_Averages'[:max_sheet_name_length]
        averages_df.to_excel(output_file_path, sheet_name=sheet_name)
        print(f'Averages by tertile for {column_name} saved to {output_file_path}')


In [2]:
import pandas as pd

# Read the workbook that holds the final averages
file_path = '/Users/user/Downloads/DellaCorte/Final_Averages.xlsx'
df_squared = pd.read_excel(file_path)

# Columns that are scanned for entries equal to 1
columns_to_check = [
    'ecid_diabetes_1',
    'ecid_breastfeeding',
    'ecid_diagnosed_condition',
]

# For each listed column, show every row whose value in that column equals 1
for column in columns_to_check:
    ones_rows = df_squared.loc[df_squared[column].eq(1)]
    print(f"Rows with '1' in column '{column}':")
    print(ones_rows)








Rows with '1' in column 'ecid_diabetes_1':
Empty DataFrame
Columns: [participant_id, Cumulative Average for intake_protein, Cumulative Average for intake_fat, Cumulative Average for intake_carbohydrate, Cumulative Average for intake_starch, Cumulative Average for intake_energy_kilocalories, Cumulative Average for intake_sucrose, Cumulative Average for intake_fibre_englyst, Cumulative Average for intake_fibre_southgate, Cumulative Average for intake_total_sugars, Cumulative Average for intake_nmes, Cumulative Average for intake_intrinsic_sugars, Cumulative Average for intake_satd_fa, Cumulative Average for intake_cholesterol, Cumulative Average for intake_fructose, Cumulative Average for intake_glucose, Cumulative Average for intake_alcohol, centre_id, study_arm_id, age_at_study, has_withdrawn, ecid_baseline_is_eligible, ecid_bmi_25, ecid_pregnant, ecid_breastfeeding, ecid_eating_disorder, ecid_diabetes_1, ecid_diagnosed_condition, ecid_diabetes_1.1, ecid_eligible, ecid_reason_nonelig, 