In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import chi2_contingency, pointbiserialr

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# 相关性分析

In [12]:
df = pd.read_csv('../dataset/encoded_data.csv')
data = pd.read_csv('../dataset/encoded_data.csv')
data_True = data[data['HadHeartAttack'] == 1]
data_False = data[data['HadHeartAttack'] == 0]

data = pd.concat([data_True.sample(n=len(data_False), random_state=42, replace=True), data_False])

In [13]:
# 分类函数
def classify_columns(df):
    real_numbers = []
    natural_numbers = []
    binary_classification = []

    for column in df.columns:
        if df[column].unique().shape[0] == 2:
            binary_classification.append(column)
        elif (df[column] % 1 == 0).all():
            natural_numbers.append(column)
        else:
            real_numbers.append(column)
    
    return real_numbers, natural_numbers, binary_classification

# 调用分类函数
real_numbers, natural_numbers, binary_classification = classify_columns(df)

print("实数列：", real_numbers)
print("自然数列：", natural_numbers)
print("二值分类列：", binary_classification)

实数列： ['HeightInMeters', 'WeightInKilograms', 'BMI']
自然数列： ['GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays', 'LastCheckupTime', 'SleepHours', 'RemovedTeeth', 'HadDiabetes', 'SmokerStatus', 'ECigaretteUsage', 'RaceEthnicityCategory', 'AgeCategory', 'TetanusLast10Tdap', 'CovidPos']
二值分类列： ['Sex', 'PhysicalActivities', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'ChestScan', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'HighRiskLastYear']


## 二值分类列

In [14]:
binary_data = data[binary_classification]

# 计算每个属性与 HadHeartAttack 的概率
# 计算联合概率 P(A & HadHeartAttack)
joint_prob = binary_data.copy()
joint_prob['HadHeartAttack'] = data['HadHeartAttack']  # 添加 HadHeartAttack 列用于联合计算

# 计算每一列的置信度和提升度
confidence_dict = {}
lift_dict = {}
chi2_dict = {}
p_value_dict = {}

for column in binary_data.columns:
    # 计算 P(A & HadHeartAttack)
    joint_count = len(joint_prob[(joint_prob[column] == 1) & (joint_prob['HadHeartAttack'] == 1)])
    # 计算 P(A)
    prob_A = binary_data[column].sum() / len(binary_data)
    # 计算 P(HadHeartAttack)
    prob_HadHeartAttack = data['HadHeartAttack'].sum() / len(data)
    # 计算 P(A | HadHeartAttack)
    prob_A_given_HadHeartAttack = joint_count / data['HadHeartAttack'].sum()

    # 置信度 = P(A | HadHeartAttack)
    confidence = prob_A_given_HadHeartAttack
    confidence_dict[column] = confidence

    # 提升度 = P(A & HadHeartAttack) / (P(A) * P(HadHeartAttack))
    lift = joint_count / (prob_A * prob_HadHeartAttack * len(binary_data))
    lift_dict[column] = lift
    
    # 卡方检验
    contingency_table = pd.crosstab(binary_data[column], data['HadHeartAttack'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    chi2_dict[column] = chi2
    p_value_dict[column] = p

# 创建结果的 DataFrame
result_df = pd.DataFrame({
    'Column': binary_data.columns,
    'Confidence': confidence_dict.values(),
    'Lift': lift_dict.values(),
    'Chi2 Statistic': chi2_dict.values(),
    'P-value': p_value_dict.values()
})

# 输出结果
result_df.sort_values(
    by='Lift', ascending=False, ignore_index=True, inplace=True
)
result_df.to_csv(
    '../asserts/binary_classification_correlation2.csv', index=False
)
result_df

Unnamed: 0,Column,Confidence,Lift,Chi2 Statistic,P-value
0,HadHeartAttack,1.0,2.0,465170.000009,0.0
1,HadAngina,0.504302,1.8696,129929.866636,0.0
2,HadStroke,0.187272,1.703075,28406.943638,0.0
3,HadKidneyDisease,0.140997,1.554844,14279.96101,0.0
4,HadCOPD,0.225043,1.532462,22699.617727,0.0
5,DifficultyDressingBathing,0.096532,1.522376,8592.565331,0.0
6,DifficultyWalking,0.382051,1.483485,37716.245006,0.0
7,DifficultyErrands,0.159532,1.445265,11442.211637,0.0
8,BlindOrVisionDifficulty,0.116167,1.433711,7714.040475,0.0
9,DeafOrHardOfHearing,0.199168,1.432384,14044.441839,0.0


## 自然数列

In [15]:
natural_data = data[natural_numbers]
target = data['HadHeartAttack']  # 二值属性

# 定义计算 Cramér's V 的函数
def cramers_v(chi2, n, k, r):
    return np.sqrt(chi2 / (n * min(k - 1, r - 1)))

# 初始化字典来存储卡方检验和 Cramér's V 结果
chi2_dict = {}
p_value_dict = {}
cramers_v_dict = {}

# 对每一列进行卡方检验和 Cramér's V 计算
for column in natural_data.columns:
    # 计算卡方检验的列联表
    contingency_table = pd.crosstab(natural_data[column], target)
    
    # 计算卡方检验结果
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    # 计算 Cramér's V
    n = contingency_table.sum().sum()  # 总样本数
    k = contingency_table.shape[0]  # 行数
    r = contingency_table.shape[1]  # 列数
    cramers_v_value = cramers_v(chi2, n, k, r)
    
    # 保存结果
    chi2_dict[column] = chi2
    p_value_dict[column] = p
    cramers_v_dict[column] = cramers_v_value

# 将结果汇总到一个 DataFrame 中
results_df = pd.DataFrame({
    'Column': natural_data.columns,
    'Chi2 Statistic': chi2_dict.values(),
    'P-value': p_value_dict.values(),
    'Cramér\'s V': cramers_v_dict.values()
})

# 显示结果
results_df.to_csv(
    '../asserts/natural_numbers_correlation2.csv', index=False
)
results_df

Unnamed: 0,Column,Chi2 Statistic,P-value,Cramér's V
0,GeneralHealth,66389.736622,0.0,0.377783
1,PhysicalHealthDays,28714.481597,0.0,0.248452
2,MentalHealthDays,5788.218926,0.0,0.111549
3,LastCheckupTime,14777.769409,0.0,0.178237
4,SleepHours,12177.121724,0.0,0.161795
5,RemovedTeeth,47979.510785,0.0,0.321159
6,HadDiabetes,32796.764687,0.0,0.265526
7,SmokerStatus,19438.485134,0.0,0.20442
8,ECigaretteUsage,1279.547938,4.031836e-277,0.052447
9,RaceEthnicityCategory,1944.172194,0.0,0.064649


## 实数列

In [16]:
real_data = data[real_numbers]
target = data['HadHeartAttack']  # 二值属性

# 存储结果的字典
correlation_dict = {}
p_value_dict = {}

# 对每个连续属性列进行点双列相关系数计算
for column in real_data.columns:
    # 计算点双列相关系数（Point-Biserial Correlation）
    corr, p_value = pointbiserialr(real_data[column], target)
    correlation_dict[column] = corr
    p_value_dict[column] = p_value

# 汇总结果到 DataFrame 中
results_df = pd.DataFrame({
    'Column': real_data.columns,
    'Point-Biserial Correlation': correlation_dict.values(),
    'P-value': p_value_dict.values()
})
results_df.to_csv(
    '../asserts/real_numbers_correlation2.csv', index=False
)
results_df

Unnamed: 0,Column,Point-Biserial Correlation,P-value
0,HeightInMeters,0.048349,9.51002e-239
1,WeightInKilograms,0.083338,0.0
2,BMI,0.067051,0.0
