In [79]:
import pandas as pd
import numpy as np

In [80]:
dir = "../data/csv-data/Cholesterol - Low - Density Lipoprotein (LDL) & Triglycerides:2015-2016.csv"
df = pd.read_csv(dir)
df.head()
original_len = len(df)


In [81]:
df.isnull().sum()

SEQN        0
WTSAF2YR    0
LBXTR       3
LBDTRSI     3
LBDLDL      3
LBDLDLSI    3
dtype: int64

In [82]:
df.dropna(inplace=True)

In [83]:
stats = df.describe().loc[['mean', 'std', 'max', 'min', '50%']]
stats

Unnamed: 0,SEQN,WTSAF2YR,LBXTR,LBDTRSI,LBDLDL,LBDLDLSI
mean,83797.574468,88550.665645,102.914894,1.161957,104.510638,2.702638
std,39.568183,81576.511351,76.672604,0.865634,40.152899,1.038404
max,83854.0,345203.133478,360.0,4.064,188.0,4.862
min,83733.0,21950.505521,15.0,0.169,37.0,0.957
50%,83809.0,54722.34333,78.0,0.881,107.0,2.767


In [84]:
units = {
    'SEQN': '',  # No unit
    'WTSAF2YR': '',  # No unit provided
    'LBXTR': 'mg/dL',
    'LBDTRSI': 'mmol/L',
    'LBDLDL': 'mg/dL',
    'LBDLDLSI': 'mmol/L',
    'LBDLDLM': 'mg/dL',
    'LBDLDMSI': 'mmol/L',
    'LBDLDLN': 'mg/dL',
    'LBDLDNSI': 'mmol/L'
}

full_names = {
    'SEQN': 'Respondent sequence number (SEQN)',
    'WTSAF2YR': 'Fasting Subsample 2 Year MEC Weight (WTSAF2YR)',
    'LBXTR': 'Triglyceride (LBXTR) (mg/dL)',
    'LBDTRSI': 'Triglyceride (LBDTRSI) (mmol/L)',
    'LBDLDL': 'LDL-Cholesterol, Friedewald (LBDLDL) (mg/dL)',
    'LBDLDLSI': 'LDL-Cholesterol, Friedewald (LBDLDLSI) (mmol/L)',
    'LBDLDLM': 'LDL-Cholesterol, Martin-Hopkins (LBDLDLM) (mg/dL)',
    'LBDLDMSI': 'LDL-Cholesterol, Martin-Hopkins (LBDLDMSI) (mmol/L)',
    'LBDLDLN': 'LDL-Cholesterol, NIH equation 2 (LBDLDLN) (mg/dL)',
    'LBDLDNSI': 'LDL-Cholesterol, NIH equation 2 (LBDLDNSI) (mmol/L)'
}

def create_sentence(row):
    sentence = f"The patient with Respondent sequence number (SEQN) {int(row['SEQN'])} has the following lab results: "
    for col in row.index:
        if col != 'SEQN':
            sentence += f"a value of {row[col]:.2f} {units[col]} for {full_names[col]}, "
    return sentence.strip(', ')

In [85]:
df['Text'] = df.apply(create_sentence, axis=1)
df.head()

Unnamed: 0,SEQN,WTSAF2YR,LBXTR,LBDTRSI,LBDLDL,LBDLDLSI,Text
0,83733.0,54722.34333,147.0,1.66,173.0,4.474,The patient with Respondent sequence number (S...
1,83734.0,25471.093699,269.0,3.037,145.0,3.75,The patient with Respondent sequence number (S...
2,83736.0,38179.51087,47.0,0.531,142.0,3.672,The patient with Respondent sequence number (S...
3,83737.0,25800.845631,46.0,0.519,103.0,2.664,The patient with Respondent sequence number (S...
4,83741.0,108751.289086,68.0,0.768,102.0,2.638,The patient with Respondent sequence number (S...


In [86]:
# Create metadata string with distribution stats
metadata = f"Overall distribution statistics: There are a total of {original_len} of participants with {len(df)} of participants with valid data in this dataset, "
for col in df.columns:
    if col != 'SEQN' and col in full_names:
        metadata += (f"{full_names[col]} has a mean of {stats[col]['mean']:.2f} {units[col]}, "
                     f"standard deviation of {stats[col]['std']:.2f} {units[col]}, "
                     f"maximum value of {stats[col]['max']:.2f} {units[col]}, "
                     f"minimum value of {stats[col]['min']:.2f} {units[col]}, "
                     f"and median value of {stats[col]['50%']:.2f} {units[col]}; ")

# Add the SAME metadata to each row
df['Metadata'] = metadata.strip('; ')
df['Metadata'][5]

'Overall distribution statistics: There are a total of 50 of participants with 47 of participants with valid data in this dataset, Fasting Subsample 2 Year MEC Weight (WTSAF2YR) has a mean of 88550.67 , standard deviation of 81576.51 , maximum value of 345203.13 , minimum value of 21950.51 , and median value of 54722.34 ; Triglyceride (LBXTR) (mg/dL) has a mean of 102.91 mg/dL, standard deviation of 76.67 mg/dL, maximum value of 360.00 mg/dL, minimum value of 15.00 mg/dL, and median value of 78.00 mg/dL; Triglyceride (LBDTRSI) (mmol/L) has a mean of 1.16 mmol/L, standard deviation of 0.87 mmol/L, maximum value of 4.06 mmol/L, minimum value of 0.17 mmol/L, and median value of 0.88 mmol/L; LDL-Cholesterol, Friedewald (LBDLDL) (mg/dL) has a mean of 104.51 mg/dL, standard deviation of 40.15 mg/dL, maximum value of 188.00 mg/dL, minimum value of 37.00 mg/dL, and median value of 107.00 mg/dL; LDL-Cholesterol, Friedewald (LBDLDLSI) (mmol/L) has a mean of 2.70 mmol/L, standard deviation of 1.0

In [87]:
# Define the healthy level metrics for each feature
HEALTHY_METRICS = {
    'WTSAF2YR': {
        'unit': 'kg',
        'sentence': "Weight recommendations are generally based on BMI (Body Mass Index), which considers both weight and height."
    },
    'LBXTR': {
        'unit': 'mg/dL',
        'normal': (None, 150),
        'borderline_high': (150, 199),
        'high': (200, 499),
        'very_high': (500, None),
        'sentence': "Normal: Less than 150 mg/dL; Borderline high: 150-199 mg/dL; High: 200-499 mg/dL; Very high: 500 mg/dL and above."
    },
    'LBDTRSI': {
        'unit': 'mmol/L',
        'normal': (None, 1.7),
        'borderline_high': (1.7, 2.2),
        'high': (2.3, 5.6),
        'very_high': (5.7, None),
        'sentence': "Normal: Less than 1.7 mmol/L; Borderline high: 1.7-2.2 mmol/L; High: 2.3-5.6 mmol/L; Very high: 5.7 mmol/L and above."
    },
    'LBDLDL': {
        'unit': 'mg/dL',
        'optimal': (None, 100),
        'near_optimal': (100, 129),
        'borderline_high': (130, 159),
        'high': (160, 189),
        'very_high': (190, None),
        'sentence': "Optimal: Less than 100 mg/dL; Near optimal/above optimal: 100-129 mg/dL; Borderline high: 130-159 mg/dL; High: 160-189 mg/dL; Very high: 190 mg/dL and above."
    },
    'LBDLDLSI': {
        'unit': 'mmol/L',
        'optimal': (None, 2.6),
        'near_optimal': (2.6, 3.3),
        'borderline_high': (3.4, 4.1),
        'high': (4.2, 4.9),
        'very_high': (5.0, None),
        'sentence': "Optimal: Less than 2.6 mmol/L; Near optimal/above optimal: 2.6-3.3 mmol/L; Borderline high: 3.4-4.1 mmol/L; High: 4.2-4.9 mmol/L; Very high: 5.0 mmol/L and above."
    }
}

# Function to categorize lab results based on healthy metrics
def categorize_value(value, metric):
    for category, range_values in metric.items():
        if category in ['unit', 'sentence']:
            continue
        lower, upper = range_values
        if (lower is None or value > lower) and (upper is None or value <= upper):
            return category.replace('_', ' ')
    return 'unknown'

# Add the "Metric" and "Category" columns
def add_metric_and_category(df):
    def generate_metric(row):
        metrics = []
        for key in row.index:
            if key in HEALTHY_METRICS:
                metrics.append(f"{HEALTHY_METRICS[key]['sentence']}")
        return ' '.join(metrics)
    
    def generate_category(row):
        categories = []
        for key in row.index:
            if key in HEALTHY_METRICS and 'unit' in HEALTHY_METRICS[key]:
                categories.append(f"{full_names[key]} is {categorize_value(row[key], HEALTHY_METRICS[key])};")
        return ' '.join(categories)
    
    df['Metric'] = df.apply(generate_metric, axis=1)
    df['Category'] = df.apply(generate_category, axis=1)
    return df

In [88]:
data = add_metric_and_category(df)

print(data.iloc[0]['Category'])

Fasting Subsample 2 Year MEC Weight (WTSAF2YR) is unknown; Triglyceride (LBXTR) (mg/dL) is normal; Triglyceride (LBDTRSI) (mmol/L) is normal; LDL-Cholesterol, Friedewald (LBDLDL) (mg/dL) is high; LDL-Cholesterol, Friedewald (LBDLDLSI) (mmol/L) is high;


In [89]:
final_df = df[['Text', 'Metadata', 'Metric', 'Category']]
output_dir = '../data/output/' + dir[dir.rfind('/')+1:]

final_df.to_csv(output_dir, index=False)

final_df.head()

Unnamed: 0,Text,Metadata,Metric,Category
0,The patient with Respondent sequence number (S...,Overall distribution statistics: There are a t...,Weight recommendations are generally based on ...,Fasting Subsample 2 Year MEC Weight (WTSAF2YR)...
1,The patient with Respondent sequence number (S...,Overall distribution statistics: There are a t...,Weight recommendations are generally based on ...,Fasting Subsample 2 Year MEC Weight (WTSAF2YR)...
2,The patient with Respondent sequence number (S...,Overall distribution statistics: There are a t...,Weight recommendations are generally based on ...,Fasting Subsample 2 Year MEC Weight (WTSAF2YR)...
3,The patient with Respondent sequence number (S...,Overall distribution statistics: There are a t...,Weight recommendations are generally based on ...,Fasting Subsample 2 Year MEC Weight (WTSAF2YR)...
4,The patient with Respondent sequence number (S...,Overall distribution statistics: There are a t...,Weight recommendations are generally based on ...,Fasting Subsample 2 Year MEC Weight (WTSAF2YR)...
