In [1]:
import numpy as np
from scipy import stats

hypothesis and analysis ideas
- 

- does age increase the severity of coronary artery disease more in males or females 
- are males more prone to a higher severity of coronary artery disease than women
- does thalessmia increase the risk of coronary artery disease
    - **does severity of artery disease differ across different blood type defects**
- **higher blood pressure correlates with a higher severity of coronary artery disease : linear regression**
- **females are more prone to higher cholesterol levels than males**

In [2]:
%%capture
%run parse_data.ipynb

In [3]:
results = stats.linregress(df['blood pressure'], df['severity of artery blockage'])
p        = results.pvalue
print(results)
print()
print(p)

LinregressResult(slope=np.float64(-0.0051779603223851235), intercept=np.float64(0.005136572691748498), rvalue=np.float64(-0.09795376076948144), pvalue=np.float64(0.09198077585788669), stderr=np.float64(0.0030629014325935493), intercept_stderr=np.float64(0.40700486011184456))

0.09198077585788669


In [4]:
male = df[df['sex'] == 1]['severity of artery blockage']
female = df[df['sex'] == 0]['severity of artery blockage']

t,p    = stats.ttest_ind(female, male)

print(t)
print(p)

1.5855736873918052
0.11390737208649557


In [5]:
male = df[df['sex'] == 1]['blood pressure']
female = df[df['sex'] == 0]['blood pressure']

t,p    = stats.ttest_ind(female, male)

print(t)
print(p)

1.1419459952202295
0.25440251177008955


In [6]:
male = df[df['sex'] == 1]['restecg']
female = df[df['sex'] == 0]['restecg']

t,p    = stats.ttest_ind(female, male)

print(t)
print(p)

-0.5825319011038801
0.5606534344274072


In [7]:
male = df[df['sex'] == 1]['cholesterol']
female = df[df['sex'] == 0]['cholesterol']

t,p    = stats.ttest_ind(female, male)

print(t)
print(p)

3.4710741116059793
0.000595853825038037


In [8]:
results = stats.linregress(df['cholesterol'], df['severity of artery blockage'])
p        = results.pvalue
print(results)
print()
print(p)

LinregressResult(slope=np.float64(-0.0020937099820790726), intercept=np.float64(-0.15888816022399022), rvalue=np.float64(-0.11594459060824375), pvalue=np.float64(0.04588334071612732), stderr=np.float64(0.0010442778895104496), intercept_stderr=np.float64(0.26392923039325694))

0.04588334071612732


In [9]:
g1 = df[df['age group'] == '<30']['cholesterol']
g2 = df[df['age group'] == '30-39']['cholesterol']
g3 = df[df['age group'] == '40-49']['cholesterol']
g4 = df[df['age group'] == '50-59']['cholesterol']
g5 = df[df['age group'] == '60-69']['cholesterol']
g6 = df[df['age group'] == '70+']['cholesterol']


results = stats.f_oneway(g1, g2, g3, g4, g5, g6)

print(results)

F_onewayResult(statistic=np.float64(2.9929446125464896), pvalue=np.float64(0.011882162066573916))


In [10]:
thal3 = df[df['blood defect type'] == 3]['severity of artery blockage']
thal6 = df[df['blood defect type'] == 6]['severity of artery blockage']
thal7 = df[df['blood defect type'] == 7]['severity of artery blockage']

F, p = stats.f_oneway(thal3, thal6, thal7)

print(f"F: {F:.10f}")
print(f"p: {p:.10f}")

F: 10.6920133658
p: 0.0000329323


In [11]:
print("Mean severity by defect type:")
print("No thalessemia:", thal3.mean())
print("Fixed thalessemia:", thal6.mean())
print("Reversable thalessemia:", thal7.mean())

Mean severity by defect type:
No thalessemia: -0.4573170731707317
Fixed thalessemia: -1.0
Reversable thalessemia: -0.9391304347826087


In [12]:
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

data = [thal3, thal6, thal7]

def thal_severityofdisease_ANOVA(df, figure_label="Figure 1."):
    
    plt.figure(figsize=(8, 6))
    box = plt.boxplot(data, tick_labels=['No Blood Type Defect', 'Fixed Thalessemia', 'Reversible Thalessemia'], patch_artist=True, 
                      medianprops={'color': 'red', 'linewidth': 2}, 
                      boxprops={'facecolor': 'grey', 'linewidth': 2},
                      whiskerprops={'linewidth': 2, 'color': 'black'}, 
                      flierprops={'marker': 'o', 'color': 'black', 'markersize': 8})
    
    # Annotate the plot with F-statistic and p-value
    plt.text(2, 0.5, f"F-statistic: {F:.10f}\np-value: {p:.10f}", 
             ha='center', fontsize=12, color='red', weight='normal')
    
    # Add titles and labels
    plt.title('Severity of Artery Blockage Across Blood Defect Types (ANOVA)', fontsize=14)
    plt.xlabel('Blood Defect Type', fontsize=12)
    plt.ylabel('Severity of Artery Blockage', fontsize=12)

    fig.suptitle(figure_label, x=0.01, y=0.99, ha='left', fontsize=12, fontweight='bold')
    fig.suptitle("Figure 1.", fontsize=14, fontweight='bold')


    
    # Show plot
    return fig, ax, F, p
    plt.show()

In [13]:
plt.show()

---

In [14]:
g1 = df[df['age group'] == '<30']['severity of artery blockage']
g2 = df[df['age group'] == '30-39']['severity of artery blockage']
g3 = df[df['age group'] == '40-49']['severity of artery blockage']
g4 = df[df['age group'] == '50-59']['severity of artery blockage']
g5 = df[df['age group'] == '60-69']['severity of artery blockage']
g6 = df[df['age group'] == '70+']['severity of artery blockage']


results = stats.f_oneway(g1, g2, g3, g4, g5, g6)

print(results)

F_onewayResult(statistic=np.float64(8.068219221880225), pvalue=np.float64(3.823589214733821e-07))


In [15]:
t,p    = stats.ttest_ind(thal6, thal7)

print(t)
print(p)

-0.23348256694633607
0.8157511590580403


In [16]:
t,p    = stats.ttest_ind(thal3, thal7)

print(t)
print(p)

4.412322916233586
1.4660910454857844e-05


In [17]:
t,p    = stats.ttest_ind(thal3, thal6)

print(t)
print(p)

2.625847761251332
0.009387474770560102


In [18]:
print("Mean severity by age group:")
print(g1.mean())
print(g2.mean())
print(g3.mean())
print(g4.mean())
print(g5.mean())
print(g6.mean())

Mean severity by age group:
0.0
0.0
-0.32
-0.7142857142857143
-1.095890410958904
-1.1666666666666667


In [19]:
g4 = df[df['age group'] == '50-59']['severity of artery blockage']
g5 = df[df['age group'] == '60-69']['severity of artery blockage']
g6 = df[df['age group'] == '70+']['severity of artery blockage']


results = stats.f_oneway(g4, g5, g6)

print(results)

F_onewayResult(statistic=np.float64(3.9449691703158978), pvalue=np.float64(0.020860979566614677))


In [20]:
fixed_df = df[df['blood defect type'] == 6]

male = fixed_df[fixed_df['sex'] == 1]['severity of artery blockage']
female = fixed_df[fixed_df['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(female, male) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -0.9457324874869208
p-value: 0.3583495986949766


In [21]:
reversible_df = df[df['blood defect type'] == 7]

male = reversible_df[reversible_df['sex'] == 1]['severity of artery blockage']
female = reversible_df[reversible_df['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(female, male) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -1.338487374936076
p-value: 0.18342446043317692


In [22]:
g1 = df[df['age group'] == '<30']['cholesterol']
g2 = df[df['age group'] == '30-39']['cholesterol']
g3 = df[df['age group'] == '40-49']['cholesterol']
g4 = df[df['age group'] == '50-59']['cholesterol']
g5 = df[df['age group'] == '60-69']['cholesterol']
g6 = df[df['age group'] == '70+']['cholesterol']


results = stats.f_oneway(g1, g2, g3, g4, g5, g6)

print(results)

F_onewayResult(statistic=np.float64(2.9929446125464896), pvalue=np.float64(0.011882162066573916))


In [23]:
LVHdf = df[df['restecg'] == 2]

maleLVH = LVHdf[LVHdf['sex'] == 1]['severity of artery blockage']
femaleLVH = LVHdf[LVHdf['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(femaleLVH, maleLVH) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.0797304861749237
p-value: 0.28206735408872785


In [24]:
No = df[df['restecg'] == 0]['severity of artery blockage']
STT = df[df['restecg'] == 1]['severity of artery blockage']
LVH = df[df['restecg'] == 2]['severity of artery blockage']


results = stats.f_oneway(No, STT, LVH)

print(results)

F_onewayResult(statistic=np.float64(2.7349794431364605), pvalue=np.float64(0.0665469129539562))


In [25]:
t_stat, p_value = stats.ttest_ind(LVH, STT) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 0.40295126286110633
p-value: 0.6875655589111894


In [26]:
STTdf = df[df['restecg'] == 1]

maleSTT = STTdf[STTdf['sex'] == 1]['severity of artery blockage']
femaleSTT = STTdf[STTdf['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(femaleSTT, maleSTT) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 3.9999999999999996
p-value: 0.057190958417936644


In [27]:
healthy = df[df['exercise induced angina'] == 0]['severity of artery blockage']
unhealthy = df[df['exercise induced angina'] == 1]['severity of artery blockage']

t,p    = stats.ttest_ind(healthy, unhealthy)

print(t)
print(p)

2.5744126987248164
0.010528762279197018


In [28]:
unhealthydf = df[df['exercise induced angina'] == 1]

maleang = unhealthydf[unhealthydf['sex'] == 1]['severity of artery blockage']
femaleang = unhealthydf[unhealthydf['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(femaleang, maleang) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.616962318700205
p-value: 0.10920235268050714


In [29]:
highsugardf = df[df['Fasting Blood Sugar'] == 1]

malehigh = highsugardf[highsugardf['sex'] == 1]['severity of artery blockage']
femalehigh = highsugardf[highsugardf['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(malehigh, femalehigh) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 0.5372125264379756
p-value: 0.5940245746423083


In [30]:
highcholdf = df[df['cholesterol'] > 200]

malehighc = highcholdf[highcholdf['sex'] == 1]['severity of artery blockage']
femalehighc = highcholdf[highcholdf['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(malehighc, femalehighc) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -1.6020005628117937
p-value: 0.11043875043855154


In [31]:
highbp_df = df[df['blood pressure'] > 120]

male_highbp = highbp_df[highbp_df['sex'] == 1]['severity of artery blockage']
female_highbp = highbp_df[highbp_df['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(male_highbp, female_highbp) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -1.1015341152205573
p-value: 0.27200149917122535


In [32]:
olddf = df[df['age'] > 49]

oldSTT = olddf[olddf['restecg'] == 1]['severity of artery blockage']
oldhealthyheart = olddf[olddf['restecg'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(oldSTT, oldhealthyheart) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -0.5320732086421202
p-value: 0.595891887663775


In [33]:
olddf = df[df['age'] > 49]

oldLVH = olddf[olddf['restecg'] == 2]['severity of artery blockage']
oldhealthyheart = olddf[olddf['restecg'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(oldLVH, oldhealthyheart) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -1.2244220933561591
p-value: 0.22219143514940867


In [34]:
unhealthydf = df[df['exercise induced angina'] == 1]

LVHang = unhealthydf[unhealthydf['restecg'] == 2]['severity of artery blockage']
normalang = unhealthydf[unhealthydf['restecg'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(normalang, LVHang) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.5554888535115332
p-value: 0.1232263205965278


In [35]:
unhealthydf = df[df['exercise induced angina'] == 1]

STTang = unhealthydf[unhealthydf['restecg'] == 1]['severity of artery blockage']
normalang = unhealthydf[unhealthydf['restecg'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(normalang, STTang) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -0.33366041940293684
p-value: 0.7402960001803351


In [36]:
femaledf = df[df['sex'] == 0]

f_highchol = femaledf[femaledf['cholesterol'] > 199 ]['severity of artery blockage']
f_lowchol = femaledf[femaledf['cholesterol'] < 200 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(f_lowchol, f_highchol) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 0.5655261206377757
p-value: 0.5730633863345923


In [37]:
maledf = df[df['sex'] == 1]

m_highchol = maledf[maledf['cholesterol'] > 199 ]['severity of artery blockage']
m_lowchol = maledf[maledf['cholesterol'] < 200 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(m_lowchol, m_highchol) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.1816664509683255
p-value: 0.23874824589473662


In [38]:
thal6df = df[df['blood defect type'] == 6]

thal_highchol = thal6df[thal6df['cholesterol'] > 199 ]['severity of artery blockage']
thal_lowchol = thal6df[thal6df['cholesterol'] < 200 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(thal_highchol, thal_lowchol) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -0.968561174725514
p-value: 0.3471783674622052


In [39]:
thal7df = df[df['blood defect type'] == 7]

thal_highchol = thal7df[thal7df['cholesterol'] > 199 ]['severity of artery blockage']
thal_lowchol = thal7df[thal7df['cholesterol'] < 200 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(thal_highchol, thal_lowchol) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -0.7293566056446501
p-value: 0.4672926621284894


In [40]:
thal7df = df[df['blood defect type'] == 7]

thal_STT = thal7df[thal7df['restecg'] == 2 ]['severity of artery blockage']
thal_normal = thal7df[thal7df['restecg'] == 0 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(thal_normal, thal_STT) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.0058057800808042
p-value: 0.3166770896385629


In [41]:
thal7df = df[df['blood defect type'] == 7]

thal_LVH = thal7df[thal7df['restecg'] == 1 ]['severity of artery blockage']
thal_normal = thal7df[thal7df['restecg'] == 0 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(thal_normal, thal_LVH) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 0.16981563896145282
p-value: 0.8657671155247368


In [42]:
thal6df = df[df['blood defect type'] == 6]

thal_STT = thal6df[thal6df['restecg'] == 2 ]['severity of artery blockage']
thal_normal = thal6df[thal6df['restecg'] == 0 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(thal_normal, thal_STT) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.0866384696417501
p-value: 0.2943506574471821


In [43]:
femaledf = df[df['sex'] == 0]

f_LVH = femaledf[femaledf['restecg'] == 1 ]['severity of artery blockage']
f_normal = femaledf[femaledf['restecg'] == 0 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(f_LVH, f_normal) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 0.2821303861038682
p-value: 0.7790080815688967


In [44]:
maledf = df[df['sex'] == 1]

m_LVH = maledf[maledf['restecg'] == 1 ]['severity of artery blockage']
m_normal = maledf[maledf['restecg'] == 0 ]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(m_LVH, m_normal) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -2.7492337961267848
p-value: 0.007125394625241976


In [45]:
import matplotlib.pyplot as plt
from scipy import stats

def male_severityofdisease_ttest(df, figure_label='Figure 2.'):
    data = [m_normal, m_LVH]          
    tick_labels = ['Normal ECG', 'LVH'] 

    fig, ax = plt.subplots(figsize=(6,5))
    ax.boxplot(data, tick_labels=tick_labels, patch_artist=True,
               medianprops={'color':'red', 'linewidth':2},
               boxprops={'facecolor':'skyblue', 'linewidth':2},
               whiskerprops={'color':'green', 'linewidth':2},
               flierprops={'marker':'o', 'color':'black', 'markersize':6})

    max_val = max(m_LVH.max(), m_normal.max())
    ax.text(1.5, max_val*1.05, f"t = {t_stat:.2f}\np = {p_value:.4e}",
            ha='center', fontsize=12,
            bbox=dict(facecolor='white', alpha=0.5))

    ax.set_ylabel('Severity of Artery Blockage')
    ax.set_title('Severity of Artery Blockage in Males by Resting ECG')
    fig.suptitle(figure_label, x=0.01, y=0.99, ha='left', fontsize=12, fontweight='bold')

    plt.tight_layout(rect=[0,0,1,0.95])
    return fig
    

In [46]:
len(m_LVH)

1

In [47]:
unhealthydf = df[df['exercise induced angina'] == 1]

maleang = unhealthydf[unhealthydf['sex'] == 1]['severity of artery blockage']
femaleang = unhealthydf[unhealthydf['sex'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(femaleang, maleang) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.616962318700205
p-value: 0.10920235268050714


In [48]:
youngdf = df[df['age'] < 50]
print(stats.linregress(youngdf['cholesterol'], youngdf['severity of artery blockage']))

LinregressResult(slope=np.float64(-0.0013638092385135313), intercept=np.float64(0.0477408857895113), rvalue=np.float64(-0.08346978367625389), pvalue=np.float64(0.44756198636002853), stderr=np.float64(0.0017871750368554779), intercept_stderr=np.float64(0.4243032245811356))


In [49]:
def young_cholesterol_regression(df, figure_label="Figure 2."):

    x = youngdf['cholesterol']
    y = youngdf['severity of artery blockage']

    slope, intercept, r, p, stderr = stats.linregress(x, y)
    fig, ax = plt.subplots(figsize=(7, 5))

    ax.scatter(x, y, color='black', edgecolor='black', label='Patients < 50')

    x_vals = np.linspace(x.min(), x.max(), 200)
    y_vals = intercept + slope * x_vals
    ax.plot(x_vals, y_vals, color='red', linewidth=2, label='Regression Line')

    ax.set_xlabel("Cholesterol (mm/dL)")
    ax.set_ylabel("Severity of Artery Blockage")
    ax.set_title("Cholesterol vs Artery Blockage Severity in Patients Under 50")

    ax.text(0.05, 0.4,
            f"slope = {slope:.3f}\nintercept = {intercept:.3f}\np-value = {p:.3f}\nr = {r:.3f}",
            transform=ax.transAxes,
            fontsize=11,
            va='top',
            bbox=dict(facecolor='white', alpha=0.6))

    fig.suptitle(figure_label, x=0.01, y=0.99,
                 fontweight='bold', ha='left', fontsize=12)

    plt.tight_layout(rect=[0, 0, 1, 0.95])

    results = {
        "slope": slope,
        "intercept": intercept,
        "r": r,
        "p": p,
        "stderr": stderr
    }

    return fig, results

In [50]:
youngdf = df[df['age'] > 50]
print(stats.linregress(youngdf['cholesterol'], youngdf['severity of artery blockage']))

LinregressResult(slope=np.float64(-0.0010608816142721048), intercept=np.float64(-0.5941370052827575), rvalue=np.float64(-0.0594128321117772), pvalue=np.float64(0.39743641653989736), stderr=np.float64(0.0012510388259640993), intercept_stderr=np.float64(0.32478273989891615))


In [51]:
highsugardf = df[df['Fasting Blood Sugar'] == 1]

highbp = highsugardf[highsugardf['blood pressure'] > 120]['severity of artery blockage']
lowbp = highsugardf[highsugardf['blood pressure'] < 121]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(highbp, lowbp) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -0.05618937858001879
p-value: 0.9554638809979266


In [52]:
youngdf = df[df['age'] < 50]

youngLVHdf = youngdf[youngdf['restecg'] == 2]['severity of artery blockage']
youngnormaldf = youngdf[youngdf['restecg'] == 0]['severity of artery blockage']
youngSTTdf = youngdf[youngdf['restecg'] == 1]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(youngnormaldf, youngLVHdf) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.2742407744929154
p-value: 0.20613434139287423


In [53]:
len(youngLVHdf)

33

In [54]:
len(youngnormaldf)

52

In [55]:
len(youngdf)

85

In [56]:
len(youngSTTdf)

0

In [57]:
youngdf = df[df['age'] < 50]

youngang = youngdf[youngdf['exercise induced angina'] == 1]['severity of artery blockage']
youngnoang = youngdf[youngdf['exercise induced angina'] == 0]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(youngnoang, youngang) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: 1.043554602939863
p-value: 0.29972144901009645


In [58]:
youngdf = df[df['age'] < 50]

younghighbp = youngdf[youngdf['blood pressure'] > 120]['severity of artery blockage']
younglowbp = youngdf[youngdf['blood pressure'] < 121]['severity of artery blockage']

t_stat, p_value = stats.ttest_ind(younghighbp, younglowbp) 
print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -0.11022033096708399
p-value: 0.9125007113351642


In [59]:
slope, intercept, r, p, stderr = stats.linregress(youngdf['cholesterol'], youngdf['severity of artery blockage'])

print("Cholesterol p =", p)

slope, intercept, r, p, stderr = stats.linregress(
    youngdf['blood pressure'], youngdf['severity of artery blockage'])

print("Blood Pressure p =", p)

maleyoung = youngdf[youngdf['sex']==1]['severity of artery blockage']
femaleyoung = youngdf[youngdf['sex']==0]['severity of artery blockage']

t, p = stats.ttest_ind(maleyoung, femaleyoung, equal_var=False)

print("Sex p =", p)
print(t)

Cholesterol p = 0.44756198636002853
Blood Pressure p = 0.45069448740973583
Sex p = 0.03130906452342524
-2.1906307834759757


In [60]:
print(maleyoung.mean())
print(femaleyoung.mean())

len(maleyoung)


-0.3442622950819672
-0.08333333333333333


61

In [61]:
import matplotlib.pyplot as plt
from scipy import stats

def young_sex_ttest_plot(df, figure_label="Figure X."):
    """
    Creates a boxplot comparing severity of artery blockage between
    young males (<50) and young females (<50).

    Returns:
        fig - matplotlib Figure object
        t, p - t-statistic and p-value
    """

    # Subset young patients
    young = df[df['age'] < 50]

    # Split by sex
    maleyoung = young[young['sex'] == 1]['severity of artery blockage'].dropna()
    femaleyoung = young[young['sex'] == 0]['severity of artery blockage'].dropna()

    # T-test
    t, p = stats.ttest_ind(maleyoung, femaleyoung, equal_var=False)

    # Create figure
    fig, ax = plt.subplots(figsize=(6, 5))

    # Boxplot data
    data = [maleyoung, femaleyoung]
    labels = ['Male (<50)', 'Female (<50)']

    ax.boxplot(
        data,
        tick_labels=labels,
        patch_artist=True,
        medianprops={'color': 'red', 'linewidth': 2},
        boxprops={'facecolor': 'skyblue', 'linewidth': 1.5},
        whiskerprops={'linewidth': 1.5},
        capprops={'linewidth': 1.5}
    )

    # Labels
    ax.set_ylabel("Severity of Artery Blockage")
    ax.set_title("CAD Severity by Sex (Patients Under 50)")

    # Annotation above plot
    y_max = max(maleyoung.max(), femaleyoung.max())
    ax.text(
        0.5, y_max + 0.1 * y_max,
        f"t = {t:.3f},  p = {p:.5f}",
        ha='center',
        fontsize=11
    )

    # Figure label
    fig.suptitle(
        figure_label,
        x=0.01,
        y=0.995,
        ha='left',
        fontweight='bold'
    )

    plt.tight_layout(rect=[0, 0, 1, 0.95])

    return fig, t, p


In [62]:
len(femaleyoung)

24

In [63]:
print("Male <50 count:", len(maleyoung))
print("Female <50 count:", len(femaleyoung))

Male <50 count: 61
Female <50 count: 24


In [64]:
print(maleyoung.describe())
print(femaleyoung.describe())

count    61.000000
mean     -0.344262
std       0.814151
min      -3.000000
25%       0.000000
50%       0.000000
75%       0.000000
max      -0.000000
Name: severity of artery blockage, dtype: float64
count    24.000000
mean     -0.083333
std       0.282330
min      -1.000000
25%      -0.000000
50%      -0.000000
75%       0.000000
max      -0.000000
Name: severity of artery blockage, dtype: float64


In [81]:
import pandas as pd

def young_sex_ttest_boxed_table(df, figure_label="Table 1."):

    t_stat, p_value = stats.ttest_ind(maleyoung, femaleyoung, equal_var=False)

    table_df = pd.DataFrame({
        "Test": ["Sex difference (<50)"],
        "t-statistic": [round(t_stat, 5)],
        "p-value": [round(p_value, 5)],
        "Male <50 mean": [round(maleyoung.mean(), 5)],
        "Female <50 mean": [round(femaleyoung.mean(), 5)]
    })

    fig, ax = plt.subplots(figsize=(9, 2))
    ax.axis('off') 

    tbl = ax.table(
        cellText=table_df.values,
        colLabels=table_df.columns,
        cellLoc='center',
        loc='center',
        edges='closed' 
    )

    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1, 1.5)  

    fig.suptitle(figure_label, x=0.01, y=0.99, ha='left', fontweight='bold')

    plt.tight_layout()
    plt.show()

    return fig
