# Final Project

#### Team Members: Joshua Holman, Josue Cota, Jenny Phan


## Data Exploration

Goal: Approximate a formula for the column `FFTotal`

In [189]:
import pandas as pd
import numpy as np
from lets_plot import *
LetsPlot.setup_html()

# The data dictionary (project.csv) contains descriptions of each data value.
hsd_orig = pd.read_csv("Health Sciences Data File New (project).csv")

# Keep only the columns that have no missing values at all; hsd_orig is left untouched.
hsd = hsd_orig.dropna(axis=1)
hsd

Unnamed: 0,Idnum,Date,Sex,Age,Ht,Wt,RF 2,RF 3,RF 4,RF 5,...,DBP,HR rest,Stages,PL 1,HR 1,RPE 1,PL 2,HR 2,RPE 2,FFTotal
0,M0001,2002-2,M,19,65.0,195.4,5,1,1,0,...,86,72,3,50,122,12,100,148,13.0,25
1,M0002,2002-2,M,19,68.5,201.4,5,2,2,0,...,72,72,3,50,107,8,125,131,12.0,32
2,M0003,2002-2,M,21,69.8,133.8,5,2,6,0,...,68,72,3,50,119,10,100,144,17.0,39
3,M0004,2002-2,M,18,66.0,148.0,5,1,6,0,...,78,72,3,50,108,7,125,157,14.0,37
4,M0005,2002-2,M,23,67.0,148.7,5,2,6,2,...,70,72,3,50,118,6,100,128,7.0,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6271,F3612,2013-4,F,18,68.0,193.8,1,3,1,0,...,76,72,3,30,143,9,50,158,9.0,21
6272,F3613,2013-4,F,19,64.0,106.0,1,2,1,0,...,60,72,3,30,144,12,50,150,12.0,37
6273,F3614,2013-4,F,23,67.0,155.8,1,3,1,0,...,60,72,4,50,85,6,100,99,8.0,51
6274,F3615,2013-4,F,18,68.0,152.6,1,3,1,0,...,60,72,4,50,91,6,100,111,10.0,54


In [190]:
import matplotlib.pyplot as plt

columns = ["Wt", "Ht", "HR rest", "SBP", "DBP"]


def _frequency_histogram(name):
    """Return a histogram of `hsd[name]` titled and labelled with the column name."""
    bars = geom_histogram(aes(x=hsd[name]), color="white")
    annotations = labs(title=f"{name}, frequency distribution", x=name, y="Frequency")
    return ggplot(hsd) + bars + annotations


# One histogram per entry of `columns`, stored in the same order so the
# following cells can display them by position.
plots = []
for column in columns:
    plots.append(_frequency_histogram(column))

# Weight
plots[0]


Weight: Relatively normal distribution and the data seems to match the expected curve.

In [191]:
# Height
plots[1]

Height: Somewhat normally distributed, random dips in certain heights but otherwise fine

In [192]:
# HR Rest
plots[2]

HR REST: Expected outcome, not great in histogram form

In [193]:
# SBP
plots[3]

SBP (Systolic): Mostly normally distributed. Expected to have highs at normal Systolic Pressures

In [194]:
# DBP
plots[4]

DBP (Diastolic): Normally Distributed with normal high counts at average Diastolic rates, few outliers. 

Overall, these 5 data variables have good data. Reasons for Diastolic and Systolic having several high points might be due to averages between healthier vs less healthy people or weight differences perhaps.

Looking for strong correlations between FFTotal and other variables

In [195]:
# Work on a copy: a plain assignment would alias hsd_orig, and dropping columns
# below would then silently change the raw frame for every other cell.
hsd2 = hsd_orig.copy()

# Per-column count and percentage of missing values.
missing_vals = hsd2.isnull().sum()
missing_percent = missing_vals / len(hsd2) * 100
missing_df = pd.concat([missing_vals, missing_percent], axis=1)
missing_df.columns = ["Missing Values", "Percentage"]

# Columns with no data at all are removed from the working frame.
# Counts are compared instead of testing the float percentage for == 100.
is_fully_missing = (missing_vals > 0) & (missing_vals == len(hsd2))
hsd2 = hsd2.drop(columns=missing_vals.index[is_fully_missing])

# The report keeps only columns that are partially missing. It is built by
# boolean selection rather than by dropping rows while iterating over them.
is_partially_missing = (missing_vals > 0) & ~is_fully_missing
missing_df = missing_df[is_partially_missing]

missing_df.sort_values("Percentage", ascending=False)

Unnamed: 0,Missing Values,Percentage
Waist,5528,88.081581
BIA_percent_Fat,4680,74.56979
SF 2,1596,25.43021
SF 1,1596,25.43021
SF 3,1596,25.43021
RPE 3,393,6.26195
PL 3,380,6.054812
HR 3,380,6.054812


In [196]:
# Remove Waist and the three SF columns from the working frame; the result is
# the frame used by the plotting helpers and the regression below.
hsd_clean = hsd2.drop(["Waist", "SF 2", "SF 1", "SF 3"], axis=1)
print("Available columns:")
# Print the remaining column names on a single comma-separated line.
for column in hsd_clean.columns:
    print(column, end=", ")

Available columns:
Idnum, Date, Sex, Age, Ht, Wt, RF 2, RF 3, RF 4, RF 5, BIA_percent_Fat, FF, RGM, LGM, VC, TA, PB, SBP, DBP, HR rest, Stages, PL 1, HR 1, RPE 1, PL 2, HR 2, RPE 2, PL 3, HR 3, RPE 3, FFTotal, 

In [197]:
def scatter_plot(column, col_label: str | None = None):
    """Scatter one column of `hsd_clean` against FFTotal, coloured by Sex.

    A linear-model smoothing line with its standard-error band is drawn over
    the points.

    Args:
        column: Name of the `hsd_clean` column to place on the x axis.
        col_label: Text used for the x-axis label and the plot title.
            Defaults to `column` itself.

    Returns:
        The lets_plot plot specification.
    """
    # The default used to be the *global* name `column`, i.e. whatever value a
    # previous for-loop happened to leave behind. Resolve it per call instead.
    if col_label is None:
        col_label = column

    # hsd_clean is looked up at call time, so columns added by later cells are visible.
    p = (ggplot(hsd_clean)
            + geom_point(aes(x=hsd_clean[column], y=hsd_clean.FFTotal, color='Sex'))
            + labs(title=f"{col_label} vs FFTotal", x=col_label, y="FFTotal")
            + geom_smooth(aes(x=hsd_clean[column], y=hsd_clean.FFTotal),
                        method='lm',
                        se=True)
        )
    return p

def bar_chart(column, col_label: str | None = None, legend: dict | None = None):
    """Bar chart of one column of `hsd_clean` against FFTotal, coloured by Sex.

    Args:
        column: Name of the `hsd_clean` column to place on the x axis.
        col_label: Text used for the x-axis label and the plot title.
            Defaults to `column` itself.
        legend: Optional mapping from x values to display labels; when given
            (and non-empty) it is applied through a discrete x scale.

    Returns:
        The lets_plot plot specification.
    """
    # The default used to be the *global* name `column`, i.e. whatever value a
    # previous for-loop happened to leave behind. Resolve it per call instead.
    if col_label is None:
        col_label = column

    # NOTE(review): geom_bar is called without an explicit `stat`; confirm
    # whether the bar heights represent FFTotal values or row counts.
    p = (ggplot(hsd_clean)
            + geom_bar(aes(x=hsd_clean[column], y=hsd_clean.FFTotal, color='Sex'))
            + labs(title=f"{col_label} vs FFTotal",
                   x=col_label, y="FFTotal")
        )
    if legend:
        p = p + scale_x_discrete(labels=legend)

    return p

In [198]:
scatter_plot("Age", "Age")

Age: Weak correlation for Age vs FFTotal. Age range from 18-20 is small so low variability may influence strength of model

In [228]:
scatter_plot("VC", "Vital Capacity")

VC: Vital capacity shows medium correlation, would be better if we normalized amongst gender

In [199]:
# plots[1]
scatter_plot("Wt", "Weight")

Weight: Medium correlation at best between weight and FFTotal. Transforming it into a BMI score would be better.

In [200]:
legend = {
    1: "No deaths",
    2: "1 rel >60y death",
    3: "1 rel <60y death",
    4: "2 rel >60y death",
    6: "2 rel <60y death",
}
bar_chart("RF 4", "History of Family Deaths to CVD", legend)

Very strong correlation here.

In [201]:
scatter_plot("HR rest", "Resting Heart Rate")

HR rest: Weak correlation between HR rest and FFTotal, but applicable to transformed variables

In [202]:
# BIA fat percentage vs FFTotal, coloured by Sex. This plot reads from hsd2
# rather than hsd_clean, and geom_smooth is given no `method`, so the library
# default smoother is used here.
(ggplot(hsd2)
            + geom_point(aes(x="BIA_percent_Fat", y="FFTotal", color='Sex'))
            + labs(title=f"BIA Fat Percentage vs FFTotal", x="BIA Fat Percentage", y="FFTotal")
            + geom_smooth(aes(x="BIA_percent_Fat", y="FFTotal")
        ))

BIA % Fat: High Correlation between BIA and FFTotal

In [203]:
scatter_plot("DBP", "Diastolic Blood Pressure")

DBP : Weak correlation between DBP and FF Total

In [204]:

scatter_plot("SBP", "Systolic Blood Pressure")

SBP: Weak correlation between SBP and FF Total

In [None]:
bar_chart("RF 3", "Reported Stress Level")

Stress: not a very good measure 

In [206]:

legend = {
    0: "Don't smoke",
    2: "10 cigs a day",
    3: "20 cigs a day",
    4: "30 cigs a day",
    6: "40 cigs a day",
}
bar_chart("RF 5", "Reported Smoking History", legend)

Unsurprisingly a very strong correlation here.

In [207]:
bar_chart("Stages", "Number of Stages completed on cycle ergometer test")

In [208]:
scatter_plot("Stages", "Number of Stages completed on cycle ergometer test")

Stages: Weak correlation between Stages and FFTotal

In [210]:
# Keep only rows whose 'HR 1' value is below 200. This reassigns hsd_clean, so
# every later cell (plots and regression) works on the filtered rows.
hsd_clean = hsd_clean[hsd_clean['HR 1'] < 200]
scatter_plot("HR 1", "Heart Rate at end of Stage 1")

HR 1: Moderate correlation

In [211]:
# Keep only rows whose 'HR 2' value is below 200. This reassigns hsd_clean, so
# every later cell (plots and regression) works on the filtered rows.
hsd_clean = hsd_clean[hsd_clean['HR 2'] < 200]
scatter_plot("HR 2", "Heart Rate at end of Stage 2")

HR 2: Stronger Correlation than HR 1. HR 2 (stage 2) captures heart rate after the subject has adapted to exercise but before significant fatigue sets in. It is typically more stable and reflective of cardiovascular efficiency and fitness.


In [226]:
# Keep only rows whose 'HR 3' value is below 200. This reassigns hsd_clean, so
# every later cell (plots and regression) works on the filtered rows.
# NOTE(review): a comparison with NaN is False, so rows with a missing 'HR 3'
# (about 6% of rows per the missing-value table) are removed here as well —
# confirm that this is intended.
# NOTE(review): this cell's saved execution count (226) is higher than the
# regression cell's (223), so the saved regression output may predate this
# filter — re-run the notebook top to bottom to confirm the reported metrics.
hsd_clean = hsd_clean[hsd_clean['HR 3'] < 200]
scatter_plot("HR 3", "Heart Rate at end of Stage 3")

HR 3: Moderate correlation

### Plotting of Transformed Variables

#### Adding a BMI Column

In [213]:
# BMI = 703 * Wt / Ht**2 when Wt is in pounds and Ht is in inches; the plain
# Wt / Ht**2 form is only a BMI when the inputs are kilograms and metres.
# NOTE(review): the sample rows displayed at load time (Ht around 64-70,
# Wt around 106-201) look like inches and pounds — confirm against the data
# dictionary. The factor only rescales the column, so a linear fit's
# predictions and R^2 are unaffected; only the BMI coefficient changes.
BMI_IMPERIAL_FACTOR = 703

# Take an explicit copy so that adding columns does not write into a slice of
# the frame produced by the earlier boolean filters.
hsd_clean = hsd_clean.copy()
# Column arithmetic instead of a row-by-row apply: same values, one pass.
hsd_clean['BMI'] = BMI_IMPERIAL_FACTOR * hsd_clean['Wt'] / hsd_clean['Ht'] ** 2
scatter_plot('BMI', 'BMI')

#### Strength-to-Weight Ratio
Shows functional strength independent of body size (higher is better).

In [214]:
# This averages left and right hand strength and divides the mean by body weight.
# Column arithmetic replaces the row-by-row apply; the computed values are the same.
hsd_clean['grip_str_ratio'] = (hsd_clean['RGM'] + hsd_clean['LGM']) / (2 * hsd_clean['Wt'])
scatter_plot('grip_str_ratio', 'Grip Strength Ratio (Higher is better)')

#### Power-to-Weight Ratio
Shows athletic performance normalized for size (higher is better).

In [215]:
hsd_clean['pwr_wt_ratio_1'] = hsd_clean['PL 1'] / hsd_clean['Wt']
scatter_plot('pwr_wt_ratio_1', 'Power to Weight Ratio, stage 1')

In [216]:
hsd_clean['pwr_wt_ratio_2'] = hsd_clean['PL 2'] / hsd_clean['Wt']
scatter_plot('pwr_wt_ratio_2', 'Power to Weight Ratio, stage 2')

In [217]:
hsd_clean['pwr_wt_ratio_3'] = hsd_clean['PL 3'] / hsd_clean['Wt']
scatter_plot('pwr_wt_ratio_3', 'Power to Weight Ratio, stage 3')

#### Power Progression Ratios
Shows how well someone maintains power output as intensity increases (higher is better).  
High ratios indicate good anaerobic capacity, low ratios indicate poor endurance.

In [218]:
hsd_clean['pwr_progression_1_2'] = hsd_clean['PL 2'] / hsd_clean['PL 1']
scatter_plot('pwr_progression_1_2', 'Power Progression, stage 1 -> 2')

In [219]:
hsd_clean['pwr_progression_2_3'] = hsd_clean['PL 3'] / hsd_clean['PL 2']
scatter_plot('pwr_progression_2_3', 'Power Progression, stage 2 -> 3')

#### RPE to HR Ratio
Indicates how well someone perceives their exertion level (lower is better).  
Low ratios could suggest better fitness, high ratios could suggest poor conditioning or overexertion.


In [220]:
# Ratio of the 'RPE 1' column to the 'HR 1' column.
# Column arithmetic replaces the row-by-row apply; the computed values are the same.
hsd_clean['RPE_HR_1'] = hsd_clean['RPE 1'] / hsd_clean['HR 1']
scatter_plot('RPE_HR_1', 'RPE to HR ratio, stage 1')

#### Heart Rate Reserve (HRR)
Larger HRR indicates better fitness, can identify abnormal heart rate responses to exercise.

In [221]:
hsd_clean['HRR_1'] = hsd_clean['HR 1'] - hsd_clean['HR rest']
scatter_plot('HRR_1', 'Heart Rate Reserve, stage 1 (higher is better)')

In [227]:
hsd_clean['HRR_3'] = hsd_clean['HR 3'] - hsd_clean['HR rest']
scatter_plot('HRR_3', 'Heart Rate Reserve, stage 3 (higher is better)')

HRR 3: stage 3 approximates maximum heart rate better than stage 1 because stage 1 is the warm-up phase

#### Vital Capacity Normalized 
After z-score normalization within each sex, male and female VC values are on the same scale, enabling fair comparisons in our analysis. A bigger vital capacity indicates better oxygenation, which suggests better overall cardiovascular health.

In [230]:
hsd_clean['VC_normalized'] = hsd_clean.groupby('Sex')['VC'].transform(lambda x: (x - x.mean()) / x.std())
scatter_plot('VC_normalized', 'Vital Capacity Normalized')

## Train and Test Splits

In [222]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from typing import List

def do_regression(selected_features: List[str], test_size: float = 0.5, random_state: int = 42069):
    """Fit a linear regression of FFTotal on the chosen columns and print hold-out metrics.

    Rows of `hsd_clean` with a missing value in any selected feature or in
    FFTotal are left out before splitting. The remaining rows are split into
    train and test sets, a LinearRegression is fitted on the train part, and
    R^2, MSE, MAE and the test-set predictions are printed.

    Args:
        selected_features: Names of the `hsd_clean` columns used as predictors.
            A single column name given as a string is accepted as well.
        test_size: Fraction of rows held out for evaluation (default 0.5).
        random_state: Seed for the train/test split (default 42069).

    Returns:
        None. The results are printed.

    Raises:
        ValueError: If no feature is given.
        KeyError: If 'FFTotal' is listed as a feature, or a name is not a
            column of `hsd_clean`.
    """
    feature_names = [selected_features] if isinstance(selected_features, str) else list(selected_features)
    if not feature_names:
        raise ValueError("selected_features must name at least one column")
    if 'FFTotal' in feature_names:
        # The target must never be one of its own predictors.
        raise KeyError("'FFTotal' is the target and cannot be used as a feature")

    # LinearRegression does not accept NaN. Incomplete rows are dropped for
    # features and target together so that X and y stay aligned.
    modelling_rows = hsd_clean[feature_names + ['FFTotal']]
    complete_rows = modelling_rows.dropna()
    n_dropped = len(modelling_rows) - len(complete_rows)
    if n_dropped:
        print(f"Dropped {n_dropped} rows with missing values")

    X = complete_rows.drop(columns='FFTotal')
    y = complete_rows['FFTotal']

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"R^2 score: {score}")
    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"Predicted FFT: {y_pred}")

## Running Linear Regression

In [223]:
do_regression(['BMI', 'pwr_wt_ratio_2'])

R^2 score: 0.2857537340370555
MSE: 60.02954683976482
MAE: 6.260361458117238
Predicted FFT: [39.49910574 42.30893795 38.6086261  ... 37.80455004 40.14651026
 38.50969028]
