In [1]:
import pandas as pd
import numpy as np
import pickle


# Create RPE reference standards.

Create summaries of the dataset and save them to an Excel file (for easy copy/paste into the manuscript).

In [2]:
# Read the cleaned dataset that every summary below is built from.
cleaned_path = '../data/cleaned_dataframe.pickle'
with open(cleaned_path, 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [4]:
# Create a summary table with mean ± SD across ages, sexes, and test modes.
#
# Also reports "perc_rpe_18_above" and "perc_rpe_17_above": the percentage of
# each group whose peak RPE met the 'classic' max test requirement of RPE >= 18
# (or the RPE >= 17 threshold, respectively).

# Variables of interest (in typical order of presentation for publication).
var_int = ['sampleSize', 'ageattest', 'heightSI', 'weightSI', 'BMI',
           'vo2_ml_kg_min', 'FRIEND_perc',
           'max_rer', 'peak_rpe',
           'perc_rpe_18_above', 'perc_rpe_17_above']

# Including a summary of ALL ages too.
age_int = ['20s', '30s', '40s', '50s', '60s', '70s', '80s', 'All']
sex_int = ['Male', 'Female']
mode_int = ['TM', 'CY']

# Decimal places used for mean ± SD. Variables not listed use DEFAULT_DECIMALS.
# ('vo2_l_min' and 'max_load_watts' are kept so they format correctly if they
# are ever added to var_int.)
DECIMALS_BY_VAR = {'max_rer': 2, 'vo2_l_min': 2, 'max_load_watts': 0}
DEFAULT_DECIMALS = 1

# Peak-RPE cutoff behind each "percentage at or above" row.
RPE_CUTOFF_BY_VAR = {'perc_rpe_18_above': 18, 'perc_rpe_17_above': 17}


def format_mean_sd(values, decimals):
    """Return 'mean ± SD' for a Series, both shown with `decimals` places."""
    mean_txt = f'{round(values.mean(), decimals):.{decimals}f}'
    sd_txt = f'{round(values.std(), decimals):.{decimals}f}'
    return f'{mean_txt} ± {sd_txt}'


def format_pct_at_or_above(group, cutoff):
    """Return the percentage (0 decimals) of rows in `group` with peak_rpe >= cutoff.

    The denominator is every row in the group. An empty group returns 'nan'
    (matching the 'nan ± nan' the mean ± SD rows show for an empty group)
    instead of raising ZeroDivisionError.
    """
    n_tests = len(group)
    if n_tests == 0:
        return 'nan'
    met_criteria = len(group[group['peak_rpe'] >= cutoff])
    return f'{(met_criteria / n_tests * 100):.0f}'


def summarize_group(group):
    """Return one formatted cell per entry of var_int for the rows in `group`."""
    cells = []
    for var in var_int:
        if var == 'sampleSize':
            cells.append(f'n = {len(group):,}')
        elif var in RPE_CUTOFF_BY_VAR:
            cells.append(format_pct_at_or_above(group, RPE_CUTOFF_BY_VAR[var]))
        else:
            decimals = DECIMALS_BY_VAR.get(var, DEFAULT_DECIMALS)
            cells.append(format_mean_sd(group[var], decimals))
    return cells


# One column per mode/sex/age combination, collected first so the frame is
# built in a single step rather than grown cell by cell.
summary_columns = {}
for modes in mode_int:
    in_mode = df['Mode'] == modes
    for sexes in sex_int:
        in_mode_sex = in_mode & (df['Gender'] == sexes)
        for ages in age_int:
            if ages == 'All':
                df_temp = df[in_mode_sex]
            else:
                df_temp = df[in_mode_sex & (df['age_group'] == ages)]
            summary_columns[f'{modes}_{sexes}_{ages}'] = summarize_group(df_temp)

dfSum = pd.DataFrame(summary_columns, index=var_int)
dfSum.index.name = 'Variables'

In [5]:
# Create a summary table with percentiles of RPE across ages, sexes, and test modes.

# Percentiles of interest, listed from highest to lowest.
# (For deciles use: sorted(range(10, 100, 10), reverse=True).)
perc_int = sorted([25, 50, 75], reverse=True)

# Grouping variables.
age_int = ['20s', '30s', '40s', '50s', '60s', '70s', '80s']
sex_int = ['Male', 'Female']
mode_int = ['TM', 'CY']


def format_rpe_percentiles(rpe_values, percentiles):
    """Return each requested percentile of `rpe_values` as a 0-decimal string.

    Missing values are dropped first, so one NaN does not turn every
    percentile of the group into 'nan' (pandas' mean/std skip NaN the same
    way). A group with no observed values yields 'nan' for every percentile.

    The raw percentile is formatted directly. Rounding to one decimal first
    and then formatting to zero decimals would double-round (e.g. 17.46 ->
    17.5 -> '18').
    """
    observed = rpe_values.dropna()
    if observed.empty:
        return ['nan'] * len(percentiles)
    return [f'{np.percentile(observed, p):.0f}' for p in percentiles]


# One column per mode/sex/age combination; the frame is built in one step.
perc_columns = {}
for modes in mode_int:
    for sexes in sex_int:
        for ages in age_int:
            new_col_name = f'{modes}_{sexes}_{ages}'
            dfTemp = df[(df['Mode'] == modes) & (df['Gender'] == sexes)
                        & (df['age_group'] == ages)]
            perc_columns[new_col_name] = format_rpe_percentiles(dfTemp['peak_rpe'],
                                                                perc_int)

dfPerc = pd.DataFrame(perc_columns, index=perc_int)
dfPerc.index.name = 'Percentiles'

In [6]:
# Build the headline cohort counts quoted in the manuscript text.
# They are stored in a one-column dataframe so they can go into the same
# Excel workbook as the summary tables.

n_total = len(df)

# Boolean row selectors reused by the lines below.
is_male = df['Gender'] == 'Male'
is_female = df['Gender'] == 'Female'
is_tm = df['Mode'] == 'TM'
is_cy = df['Mode'] == 'CY'
has_ethnicity = df['ethnicgroup'].notna()
is_white = df['ethnicgroup'] == 'White, not of Hispanic origin'


def n_rows(mask):
    """Number of rows selected by a boolean mask."""
    return int(mask.sum())


def pct(numerator, denominator):
    """Whole-number percentage of numerator over denominator, as text."""
    return f"{(numerator / denominator * 100):.0f}"


counts_text = [
    f"Total tests: {n_total:,}",
    f"Total tests in Males: {n_rows(is_male):,} ({pct(n_rows(is_male), n_total)}%)",
    f"Total tests in Females: {n_rows(is_female):,} ({pct(n_rows(is_female), n_total)}%)",
    "\n",
    f"Total TM tests: {n_rows(is_tm):,} ({pct(n_rows(is_tm), n_total)}%)",
    f"TM tests in MALES: {n_rows(is_tm & is_male):,}",
    f"TM tests in FEMALES: {n_rows(is_tm & is_female):,}",
    "\n",
    f"Total CY tests: {n_rows(is_cy):,} ({pct(n_rows(is_cy), n_total)}%)",
    f"CY tests in MALES: {n_rows(is_cy & is_male):,}",
    f"CY tests in FEMALES: {n_rows(is_cy & is_female):,}",
    "\n",
    f"Tests with ethnicity listed: {n_rows(has_ethnicity):,} ({pct(n_rows(has_ethnicity), n_total)}%)",
    # Share of White participants among tests that list an ethnicity.
    f"% White ethnicity: {pct(n_rows(is_white), n_rows(has_ethnicity))}%",
    "\n",
    f"Earliest test date: {min(df['testdate'])}",
    f"Latest test date: {max(df['testdate'])}",
    f"Number of sites: {len(df['Facility'].unique())}",
    f"List of sites: {df['Facility'].unique()}",
]

df_cohort_counts = pd.DataFrame(counts_text)


## Save the output from the tables above.

In [7]:
# Write the three summary tables to one Excel workbook, one sheet per table.
# The `with` block saves and closes the workbook on exit, even if a sheet
# fails to write. ExcelWriter.save() was deprecated in pandas 1.5 and removed
# in 2.0, so it is not called.
with pd.ExcelWriter('../FRIEND_RPE_5_16_22_.xlsx', engine='xlsxwriter') as writer:
    dfSum.to_excel(writer, sheet_name='Averages')
    dfPerc.to_excel(writer, sheet_name='Percentiles')
    df_cohort_counts.to_excel(writer, sheet_name="Cohort_Counts")