In [1]:
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency

# Read the updated heart-attack dataset into a DataFrame
file_path = '../Data/heart_attack_dataset_updated.csv'
df = pd.read_csv(file_path)

# Walk the columns in order: show each header, then the distinct values
# found in that column
print("Headers and their unique values:")
for header in df.columns:
    print(f"\n{header}:")
    distinct_values = df[header].unique()
    print(distinct_values)

# Blank separator after the listing
print("\n")

Headers and their unique values:

Gender:
['Male' 'Female']

Age:
[70 55 42 84 86 66 33 73 63 88 69 78 89 71 30 77 76 74 45 34 61 52 49 81
 39 32 46 67 41 35 80 85 57 79 62 48 36 64 60 50 72 65 40 51 82 75 31 43
 37 38 54 44 59 58 53 83 56 87 47 68]

Blood Pressure (mmHg):
[181 103  95 106 187 125 182 115 174 154 133 165 153 110 107 112  91 101
 141 124 109 143 197 149 104 159 193 135 190 129 126 134 172 179 111 192
 180 166 119 139 116 191 120 158 138 198 162 142 169 178 196 164 161 168
 113 185 148 171 176 183 147  97 175 105 145  98 128 195 146 167 163 144
 156 122 152 136 151 150 114 127 186 184 137  96 188 100 173 199 132 160
 194  99  94 170 140 130 123 117 189 157 131 121 118 102  93 108  90  92
 155]

Cholesterol (mg/dL):
[262 253 295 270 296 271 288 286 254 150 236 171 215 182 242 179 227 259
 273 212 222 285 266 209 157 191 268 161 274 248 205 280 255 188 246 297
 181 249 258 235 201 204 198 200 186 217 176 233 216 210 172 165 190 183
 156 229 294 195 220 243 265 283 225 234 

In [2]:
from scipy.stats import pearsonr, chi2_contingency, f_oneway, ttest_ind

# Pairwise association screen over the columns of `df`.
#
# Every pair of listed columns is tested once:
#   * numerical  vs numerical   -> Pearson correlation
#   * categorical vs categorical -> Chi-square test of independence
#   * numerical  vs categorical -> t-test (2 groups) or one-way ANOVA (3+ groups)
# Each result is written as one text line into `investigate` or
# `no_investigation`. Pairs whose test cannot be computed (no usable rows,
# fewer than two groups, undefined statistic) go to `skipped` instead of being
# reported as "no investigation needed".
#
# NOTE(review): 6 + 55 + 44 = 105 tests are run against a fixed 0.05 threshold
# with no multiple-comparison correction, so some flagged pairs are expected to
# be false positives. Treat the "investigate" list as leads, not findings.

# List of numerical and categorical columns
numerical_columns = ['Age', 'Blood Pressure (mmHg)', 'Cholesterol (mg/dL)', 'Risk Score']
categorical_columns = ['Gender', 'Has Diabetes', 'Smoking Status', 'Chest Pain Type', 'Treatment',
                       'Age Group', 'Cholesterol Level Category', 'Blood Pressure Category',
                       'Heart Health Status', 'Lifestyle Modification Necessity', 'Treatment Effectiveness Category']

# Thresholds
correlation_threshold = 0.3  # for Pearson correlation (compared against |r|)
p_value_threshold = 0.05  # for Chi-square, ANOVA, and t-tests

# Result lists (one formatted line per tested pair)
investigate = []
no_investigation = []
skipped = []  # pairs for which no valid test result could be produced

# 1. Numerical vs. Numerical (Correlation)
for i, col1 in enumerate(numerical_columns):
    for col2 in numerical_columns[i+1:]:
        # Use only rows where both values are present so missing data cannot
        # turn the correlation into NaN.
        pair = df[[col1, col2]].dropna()
        if len(pair) < 2:
            skipped.append(f"Correlation between {col1} and {col2}: fewer than two complete rows")
            continue
        corr, p_value = pearsonr(pair[col1], pair[col2])
        if pd.isna(corr):
            # e.g. one of the columns is constant; r is undefined.
            skipped.append(f"Correlation between {col1} and {col2}: correlation is undefined")
            continue
        message = f"Correlation between {col1} and {col2}: corr = {corr:.2f}, p-value = {p_value:.4f}"
        # Flagging is based on effect size (|r|) only; the p-value is reported
        # but not used for the decision.
        if abs(corr) > correlation_threshold:
            investigate.append(message)
        else:
            no_investigation.append(message)

# 2. Categorical vs. Categorical (Chi-Square Test)
for i, col1 in enumerate(categorical_columns):
    for col2 in categorical_columns[i+1:]:
        contingency_table = pd.crosstab(df[col1], df[col2])
        if contingency_table.size == 0:
            # No row has both values present; chi2_contingency would raise.
            skipped.append(f"Chi-Square test between {col1} and {col2}: empty contingency table")
            continue
        chi2, p_value, _, _ = chi2_contingency(contingency_table)
        message = f"Chi-Square test between {col1} and {col2}: p-value = {p_value:.4f}"
        if p_value < p_value_threshold:
            investigate.append(message)
        else:
            no_investigation.append(message)

# 3. Numerical vs. Categorical (ANOVA/T-test)
for num_col in numerical_columns:
    for cat_col in categorical_columns:
        # Drop rows missing either value first. Series.unique() reports NaN as
        # a category, and `== NaN` never matches, which would otherwise create
        # an empty group and miscount the number of categories.
        pair = df[[num_col, cat_col]].dropna()
        unique_categories = pair[cat_col].unique()  # order of first appearance
        groups = [pair.loc[pair[cat_col] == category, num_col] for category in unique_categories]
        if len(groups) < 2:
            # A t-test / ANOVA needs at least two groups to compare.
            skipped.append(f"Test between {num_col} and {cat_col}: fewer than two categories with data")
            continue
        if len(groups) == 2:  # Use t-test for binary categories
            t_stat, p_value = ttest_ind(groups[0], groups[1])
        else:  # Use ANOVA for multiple categories
            f_stat, p_value = f_oneway(*groups)
        if pd.isna(p_value):
            # Undefined result (e.g. groups too small or zero variance); do not
            # let it fall through to "no investigation needed".
            skipped.append(f"Test between {num_col} and {cat_col}: p-value is undefined")
            continue
        message = f"Test between {num_col} and {cat_col}: p-value = {p_value:.4f}"
        if p_value < p_value_threshold:
            investigate.append(message)
        else:
            no_investigation.append(message)

# Print the results
print("\n=== Should Investigate Further ===")
for item in investigate:
    print(item)

print("\n=== No Further Investigation Needed ===")
for item in no_investigation:
    print(item)

# Only shown when something could not be tested, so output for clean data is
# unchanged.
if skipped:
    print("\n=== Skipped (Test Could Not Be Computed) ===")
    for item in skipped:
        print(item)


=== Should Investigate Further ===
Correlation between Blood Pressure (mmHg) and Risk Score: corr = 0.52, p-value = 0.0000
Correlation between Cholesterol (mg/dL) and Risk Score: corr = 0.52, p-value = 0.0000
Chi-Square test between Smoking Status and Lifestyle Modification Necessity: p-value = 0.0000
Chi-Square test between Chest Pain Type and Treatment Effectiveness Category: p-value = 0.0000
Chi-Square test between Treatment and Treatment Effectiveness Category: p-value = 0.0000
Chi-Square test between Cholesterol Level Category and Heart Health Status: p-value = 0.0000
Chi-Square test between Cholesterol Level Category and Lifestyle Modification Necessity: p-value = 0.0000
Chi-Square test between Blood Pressure Category and Heart Health Status: p-value = 0.0000
Chi-Square test between Blood Pressure Category and Lifestyle Modification Necessity: p-value = 0.0000
Chi-Square test between Heart Health Status and Lifestyle Modification Necessity: p-value = 0.0000
Test between Age and 

In [3]:
# What is not worth looking into? 

# Test between Age and Age Group: No
# Reason: This is a redundant check since Age Group is derived directly from Age.

# Test between Blood Pressure (mmHg) and Blood Pressure Category: No
# Reason: This is also redundant as Blood Pressure Category is directly derived from Blood Pressure.

# Test between Cholesterol (mg/dL) and Cholesterol Level Category: No
# Reason: This is redundant since Cholesterol Level Category is derived directly from Cholesterol levels.

# The remaining results listed under "=== Should Investigate Further ===" are worth looking into.