In [1]:
import pandas as pd
import numpy as np
import pickle

from ref_calcs import friend_percentile


# Create dataset of interest for RPE analysis.

Load the entire FRIEND dataset and then filter/clean for an analysis on RPE. Inclusion criteria include:
- RPE assessed with the Borg scale (6-20 scale).
- Treadmill or cycling test.
- 20-90 years old.
- RER >= 1.0 (for max effort).
- Apparently healthy (no CVD, COPD, or beta blocker meds).
- Only USA data.


In [2]:
# Subset of FRIEND columns needed for the RPE analysis; all other columns are
# skipped when the workbook is read.
col_interest = [
    "ID",
    "Facility",
    "ageattest",
    "testdate",
    "Gender",
    "Country",
    "weight",
    "height",
    "BMI",
    "ANYCVD",
    "COPD",
    "BetaMed",
    "Mode",
    "vo2_ml_kg_min",
    "vo2_l_min",
    "max_hr",
    "max_rer",
    "max_load_watts",
    "peak_rpe",
    "ethnicgroup",
]

df_full = pd.read_excel(
    '../data/FRIEND_dataset_with_City_3_14_22.xlsx',
    usecols=col_interest,
)


### If needing to use MET-test data then need to use only what's previously been used in publications.

In this case, MET-test didn't report RPE to FRIEND so they're not going to be included in the analysis anyway.

In [3]:
# NOTE: intentionally disabled. MET-test did not report RPE to FRIEND, so the
# cleaned MET-test file is not loaded for this analysis; kept for reference.
# # Import the "cleaned" MET-test dataset that was used in previous pubs.
# # Kaminsky et al. 2017, MCP
# df_met = pd.read_excel('../data/CLEANED MET-test FRIEND Cycling data_4_25_2021.xlsx',
#                        usecols=col_interest)


In [4]:
# NOTE: intentionally disabled (MET-test tests are not part of the RPE analysis).
# If this is ever re-enabled, combine the frames with pd.concat:
# DataFrame.append was removed in pandas 2.0.
# # Filter out the "unwanted" MET-test from FRIEND dataset.
# df_full = df_full.query("Facility != 'MET-test'")

# # Add in the "wanted" MET-test data.
# df = pd.concat([df_full, df_met])

In [5]:
# Start the cleaning pipeline from the full FRIEND frame (no MET-test substitution).
# This binds a second name to the same DataFrame object; it does not copy it.
df = df_full

### Dropping tests/ Cleaning up dataset.

In [6]:
# Keep only tests whose recorded gender is "Female" or "Male".
df = df[df["Gender"].isin(["Female", "Male"])]

In [7]:
# Keep only tests performed in the USA.
df = df[df["Country"] == "USA"]

In [8]:
# Keep only tests whose Mode is "TM" or "CY".
df = df[df["Mode"].isin(["TM", "CY"])]

In [9]:
# Keep ages from 20 (inclusive) up to, but not including, 90.
df = df[df["ageattest"].ge(20) & df["ageattest"].lt(90)]

In [10]:
# Keep only tests with a max RER of at least 1.0.
df = df[df["max_rer"].ge(1.0)]

In [11]:
# Height and weight outliers become null.
# Heights outside 50-90 and weights below 30 are replaced with NaN
# (presumably inches and pounds -- TODO confirm against the FRIEND data dictionary).
#
# `assign` builds a new frame rather than writing columns onto the filtered
# slice left by the earlier filtering cells, so pandas does not raise a
# SettingWithCopyWarning here.
df = df.assign(
    height=df["height"].mask((df["height"] < 50) | (df["height"] > 90)),
    weight=df["weight"].mask(df["weight"] < 30),
)

In [12]:
# Drop tests with a missing or non-positive height, weight, age, or VO2.

required_cols = ['height', 'weight', 'ageattest', 'vo2_ml_kg_min']

# Make sure every required column is float before comparing.
for col in required_cols:
    df[col] = df[col].astype(float)

df = df.dropna(how="any", subset=required_cols)

# A row survives only if all four values are strictly positive.
df = df[(df[required_cols] > 0).all(axis=1)]

In [13]:
# Exclude tests flagged with CVD (ANYCVD >= 1).
# Rows where ANYCVD is missing are retained: NaN >= 1 evaluates to False,
# so the negated mask keeps them.
filt = df['ANYCVD'] >= 1
df = df.loc[~filt]

In [14]:
# Exclude tests flagged with COPD (COPD >= 1). A missing COPD value, which
# is common at many sites, does not exclude a test.
df['COPD'] = df['COPD'].astype(float)
filt = df['COPD'] >= 1
df = df.loc[~filt]

In [15]:
# Exclude tests from people taking beta-blocker medication (BetaMed >= 1).
# Tests with a missing BetaMed value are kept.
df['BetaMed'] = df['BetaMed'].astype(float)
filt = df['BetaMed'] >= 1
df = df.loc[~filt]

In [16]:
# RPE is the variable of interest, so discard every test without a peak RPE.

df = df.dropna(subset=["peak_rpe"])

In [17]:
# Parse the test dates into pandas datetimes.

parsed_dates = pd.to_datetime(df["testdate"])
df["testdate"] = parsed_dates

In [18]:
# If max HR is > 250 or < 30, make the value missing.
# Non-numeric entries are coerced to NaN first. `between(30, 250)` is inclusive
# on both ends, so exactly 30 and exactly 250 are kept; NaN fails the test and
# stays NaN.
#
# The column is rebuilt with `where` instead of writing through `.values`:
# the array returned by `.values` is read-only under pandas Copy-on-Write, and
# assigning None into an integer-typed array raises a TypeError.
max_hr = pd.to_numeric(df['max_hr'], errors='coerce')
df = df.assign(max_hr=max_hr.where(max_hr.between(30, 250)))

### Determine which sites used Borg scale for RPE (then drop others).

In [19]:
# Print a per-facility summary of peak RPE to spot sites that did not use the
# Borg 6-20 scale.
#
# groupby(sort=False) visits facilities in order of first appearance (the same
# order as `unique()`), does not depend on building a query string from the
# facility name (which breaks on names containing a quote), and skips rows with
# a missing Facility instead of failing on an empty subset.
#
# NOTE: the "above 11" label reports RPE > 10, i.e. 11 and higher for
# whole-number ratings. The printed text is left unchanged so it matches
# previously recorded output.
for site, site_df in df.groupby("Facility", sort=False):
    rpe = site_df["peak_rpe"]
    n_tests = len(site_df)
    n_below = (rpe < 11).sum()
    n_above = (rpe > 10).sum()

    print(site)
    print(f"min RPE: {rpe.min()}; max RPE: {rpe.max()}")

    print(f"Number of tests below 11: {n_below} ({n_below/n_tests:.0%})")
    print(f"Number of tests above 11: {n_above} ({n_above/n_tests:.0%})")
    print("\n")

JHU
min RPE: 9.0; max RPE: 20.0
Number of tests below 11: 1 (0%)
Number of tests above 11: 329 (100%)


KUMC
min RPE: 13.0; max RPE: 20.0
Number of tests below 11: 0 (0%)
Number of tests above 11: 218 (100%)


MCH
min RPE: 9.0; max RPE: 20.0
Number of tests below 11: 1 (0%)
Number of tests above 11: 259 (100%)


Pennington
min RPE: 2.0; max RPE: 20.0
Number of tests below 11: 5 (1%)
Number of tests above 11: 336 (99%)


SAMMC
min RPE: 13.0; max RPE: 118.0
Number of tests below 11: 0 (0%)
Number of tests above 11: 100 (100%)


SCSU
min RPE: 6.0; max RPE: 20.0
Number of tests below 11: 2 (2%)
Number of tests above 11: 84 (98%)


SFSU
min RPE: 14.0; max RPE: 20.0
Number of tests below 11: 0 (0%)
Number of tests above 11: 36 (100%)


Hartford Hospital
min RPE: 9.0; max RPE: 20.0
Number of tests below 11: 1 (0%)
Number of tests above 11: 443 (100%)


University of Massachusetts
min RPE: 11.0; max RPE: 20.0
Number of tests below 11: 0 (0%)
Number of tests above 11: 327 (100%)


GHCPS
min RPE

In [20]:
# Exclude the facilities where 100% of tests had an RPE below 11; they most
# likely were NOT rating effort on the Borg scale.

non_borg_sites = ["SFU", "TUKHS"]
df = df[~df["Facility"].isin(non_borg_sites)]

In [21]:
# Keep only tests with a peak RPE of 10 or more; lower values are treated as
# a sign that the individual did not understand the scale.

df = df[df["peak_rpe"] >= 10]

In [22]:
# Correct a data-entry error: a peak RPE recorded as 118 at SAMMC is replaced
# with 18. All other values pass through unchanged.

is_entry_error = (df["Facility"] == "SAMMC") & (df["peak_rpe"] == 118.0)
df["peak_rpe"] = np.where(is_entry_error, 18.0, df["peak_rpe"])

In [23]:
# Keep only whole-number ratings from 10 through 20.
# The scale's instructions require a whole number, so fractional or
# out-of-range values are dropped.

valid_ratings = list(range(10, 21))
df = df[df["peak_rpe"].isin(valid_ratings)]

In [24]:
# Convert RPE to integer (for cleaner graphs).
# A vectorised cast replaces the element-wise `apply(lambda x: int(x))`; it
# yields the same int64 values and gives a fixed dtype even on an empty frame.
# `assign` returns a new frame instead of writing onto a filtered slice.

df = df.assign(peak_rpe=df["peak_rpe"].astype("int64"))

### Add some variables.

In [25]:
# Add the FRIEND fitness percentile for every test.

def _friend_percentile_for_row(row):
    """Call friend_percentile with one row's VO2, age, gender, and test mode."""
    return friend_percentile(
        row["vo2_ml_kg_min"], row["ageattest"], row["Gender"], row["Mode"]
    )


df["FRIEND_perc"] = df.apply(_friend_percentile_for_row, axis=1)

In [26]:
# Label each test with its age decade (e.g. 47 -> "40s").
# Floor division replaces taking the first character of str(age), which only
# works for two-digit ages (5 would become "50s", 104 would become "10s").
# Requires ageattest to be non-missing, which the earlier dropna guarantees.
df = df.assign(
    age_group=(df["ageattest"] // 10 * 10).astype(int).astype(str) + "s"
)

In [27]:
# Derive SI-unit versions of weight and height
# (presumably pounds -> kg and inches -> cm; TODO confirm the source units).

df['weightSI'] = df['weight'].div(2.205)
df['heightSI'] = df['height'].mul(2.54)

### Save as a pickle file for analysis.

In [28]:
# Serialize the cleaned DataFrame so the analysis step can load it.
output_path = '../data/cleaned_dataframe.pickle'
with open(output_path, 'wb') as pickle_file:
    pickle.dump(df, pickle_file)