# Exploratory Analysis

## Team Members: Joshua Holman, Josue Cota, Jenny Phan


Goal: Approximate a formula for the column `FFTotal`.

In [158]:
import pandas as pd
import numpy as np
from lets_plot import *
LetsPlot.setup_html()

# "Data dictionary project.csv" contains descriptions of each data value.
hsd_orig = pd.read_csv("Health Sciences Data File New (project).csv")

# Keep only the columns that have no missing values at all.
hsd = hsd_orig.dropna(axis="columns", how="any")
hsd

Unnamed: 0,Idnum,Date,Sex,Age,Ht,Wt,RF 2,RF 3,RF 4,RF 5,...,DBP,HR rest,Stages,PL 1,HR 1,RPE 1,PL 2,HR 2,RPE 2,FFTotal
0,M0001,2002-2,M,19,65.0,195.4,5,1,1,0,...,86,72,3,50,122,12,100,148,13.0,25
1,M0002,2002-2,M,19,68.5,201.4,5,2,2,0,...,72,72,3,50,107,8,125,131,12.0,32
2,M0003,2002-2,M,21,69.8,133.8,5,2,6,0,...,68,72,3,50,119,10,100,144,17.0,39
3,M0004,2002-2,M,18,66.0,148.0,5,1,6,0,...,78,72,3,50,108,7,125,157,14.0,37
4,M0005,2002-2,M,23,67.0,148.7,5,2,6,2,...,70,72,3,50,118,6,100,128,7.0,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6271,F3612,2013-4,F,18,68.0,193.8,1,3,1,0,...,76,72,3,30,143,9,50,158,9.0,21
6272,F3613,2013-4,F,19,64.0,106.0,1,2,1,0,...,60,72,3,30,144,12,50,150,12.0,37
6273,F3614,2013-4,F,23,67.0,155.8,1,3,1,0,...,60,72,4,50,85,6,100,99,8.0,51
6274,F3615,2013-4,F,18,68.0,152.6,1,3,1,0,...,60,72,4,50,91,6,100,111,10.0,54


In [159]:
import matplotlib.pyplot as plt

# Numeric columns whose frequency distributions are inspected below.
columns = ["Wt", "Ht", "HR rest", "SBP", "DBP"]

# One histogram per entry of `columns`, stored in the same order.
plots = []
for column in columns:
    p = (
        ggplot(hsd)
        # Map x by column name: the frame is already supplied to ggplot(),
        # so there is no need to pass the whole Series a second time.
        + geom_histogram(aes(x=column), color="white")
        + labs(title=f"{column}, frequency distribution", x=column, y="Frequency")
    )
    plots.append(p)

# Weight
plots[0]


Relatively normal distribution and the data seems to match the expected curve.

In [160]:
# Height ("Ht") histogram, looked up by column name instead of position.
plots[columns.index("Ht")]

Height: Somewhat normally distributed, random dips in certain heights but otherwise fine

In [161]:
# Resting heart rate ("HR rest") histogram, looked up by column name.
plots[columns.index("HR rest")]

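HR rest: Expected outcome, but a histogram is not a good fit for this variable: every row shown in the table above has the same value (72), so the counts pile up in very few bars.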

In [162]:
# Systolic blood pressure ("SBP") histogram, looked up by column name.
plots[columns.index("SBP")]

SBP (Systolic): Mostly normally distributed. Expected to have highs at normal Systolic Pressures

In [163]:
# Diastolic blood pressure ("DBP") histogram, looked up by column name.
plots[columns.index("DBP")]

DBP (Diastolic): Normally Distributed with normal high counts at average Diastolic rates, few outliers. 

Overall, these five variables look well behaved. The several peaks in the systolic and diastolic histograms might reflect differences between healthier and less healthy people, or perhaps differences in weight.

Looking for strong correlations between `FFTotal` and the other variables.

In [164]:
# Build a new frame instead of aliasing `hsd_orig`, so the raw data is never
# modified in place. Columns that contain no values at all are dropped here.
hsd2 = hsd_orig.dropna(axis=1, how="all")

# Per-column missing-value counts and percentages, computed on the raw frame.
missing_vals = hsd_orig.isnull().sum()
missing_percent = missing_vals / len(hsd_orig) * 100
missing_df = pd.concat([missing_vals, missing_percent], axis=1)
missing_df.columns = ["Missing Values", "Percentage"]

# Report only partially-missing columns: complete columns carry no information
# here, and entirely empty ones were already removed from `hsd2` above.
# A boolean mask replaces dropping rows from the frame while iterating over it.
partially_missing = (missing_df["Missing Values"] > 0) & (
    missing_df["Missing Values"] < len(hsd_orig)
)
missing_df = missing_df[partially_missing]

missing_df.sort_values("Percentage", ascending=False)

Unnamed: 0,Missing Values,Percentage
Waist,5528,88.081581
BIA_percent_Fat,4680,74.56979
SF 2,1596,25.43021
SF 1,1596,25.43021
SF 3,1596,25.43021
RPE 3,393,6.26195
PL 3,380,6.054812
HR 3,380,6.054812


In [176]:
# Remove the five columns with the highest missing percentages in the table above.
hsd_clean = hsd2.drop(columns=["Waist", "BIA_percent_Fat", "SF 2", "SF 1", "SF 3"])

# List the remaining column names on a single comma-separated line.
print("Available columns:")
for column in hsd_clean.columns:
    print(f"{column}, ", end="")

Available columns:
Idnum, Date, Sex, Age, Ht, Wt, RF 2, RF 3, RF 4, RF 5, FF, RGM, LGM, VC, TA, PB, SBP, DBP, HR rest, Stages, PL 1, HR 1, RPE 1, PL 2, HR 2, RPE 2, PL 3, HR 3, RPE 3, FFTotal, 

In [166]:
def scatter_plot(column, col_label: str | None = None):
    """Scatter plot of one `hsd_clean` column against FFTotal with a linear fit.

    Args:
        column: Name of the `hsd_clean` column to put on the x-axis.
        col_label: Human-readable name used in the title and the x-axis label.
            Defaults to `column` itself.

    Returns:
        The plot object: points coloured by `Sex`, plus one `lm` smoothing
        layer (with `se=True`) drawn over the same x/y mapping.
    """
    # Resolve the default at call time. A `= column` default in the signature
    # is evaluated once, when the cell runs, against whatever the global
    # `column` happens to hold (or raises NameError on a fresh kernel).
    if col_label is None:
        col_label = column

    return (
        ggplot(hsd_clean)
        + geom_point(aes(x=column, y="FFTotal", color="Sex"))
        + labs(title=f"{col_label} vs FFTotal", x=col_label, y="FFTotal")
        # No colour mapping on this layer, so it is not split by Sex.
        + geom_smooth(aes(x=column, y="FFTotal"), method="lm", se=True)
    )

def bar_chart(column, col_label: str | None = None, legend: dict | None = None):
    """Bar chart of one `hsd_clean` column against FFTotal.

    Args:
        column: Name of the `hsd_clean` column to put on the x-axis.
        col_label: Human-readable name used in the title and the x-axis label.
            Defaults to `column` itself.
        legend: Optional mapping passed as `labels` to `scale_x_discrete`
            to relabel the x-axis values. Skipped when empty or None.

    Returns:
        The plot object.
    """
    # Resolve the default at call time. A `= column` default in the signature
    # is evaluated once, when the cell runs, against whatever the global
    # `column` happens to hold (or raises NameError on a fresh kernel).
    if col_label is None:
        col_label = column

    # NOTE(review): no `stat` is passed to geom_bar — confirm the bars really
    # show FFTotal (the y mapping) rather than row counts per category.
    p = (
        ggplot(hsd_clean)
        + geom_bar(aes(x=column, y="FFTotal", color="Sex"))
        + labs(title=f"{col_label} vs FFTotal", x=col_label, y="FFTotal")
    )
    if legend:
        p = p + scale_x_discrete(labels=legend)

    return p

In [167]:
# Weight vs FFTotal
scatter_plot(column="Wt", col_label="Weight")

Weak correlation at best between weight and FFTotal.

In [168]:
# Axis labels for the coded values of "RF 4".
legend = {
    1: "No deaths",
    2: "1 rel >60y death",
    3: "1 rel <60y death",
    4: "2 rel >60y death",
    6: "2 rel <60y death",
}
bar_chart(column="RF 4", col_label="History of Family Deaths to CVD", legend=legend)

Very strong correlation here.

In [169]:
# Diastolic blood pressure vs FFTotal
scatter_plot(column="DBP", col_label="Diastolic Blood Pressure")

In [178]:

# Systolic blood pressure vs FFTotal
scatter_plot(column="SBP", col_label="Systolic Blood Pressure")

In [170]:

# Reported stress level ("RF 3") vs FFTotal
bar_chart(column="RF 3", col_label="Reported Stress Level")

In [171]:

# Axis labels for the coded values of "RF 5".
legend = {
    0: "Don't smoke",
    2: "10 cigs a day",
    3: "20 cigs a day",
    4: "30 cigs a day",
    6: "40 cigs a day",
}
bar_chart(column="RF 5", col_label="Reported Smoking History", legend=legend)

Unsurprisingly a very strong correlation here.

In [177]:
# Stages completed vs FFTotal
bar_chart(column="Stages", col_label="Number of Stages completed on cycle ergometer test")

In [180]:
# Resting heart rate vs FFTotal
scatter_plot(column="HR rest", col_label="Resting Heart Rate")

In [179]:
# Heart rate at the end of stage 1 vs FFTotal
scatter_plot(column="HR 1", col_label="Heart Rate at end of Stage 1")