In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('Variable_final.csv')

df = df.astype("category")
print(df.dtypes)  # 查看所有变量的数据类型


SEX               category
_AGE_GROUP        category
_RACEGR3          category
_EDUCA_GROUP      category
_INCOM_GROUP      category
_SMOKER3_GROUP    category
_BMI5CAT          category
_FRTLT1A          category
_VEGLT1A          category
_TOTINDA          category
_RFBING5          category
_ECIGSTS_GROUP    category
_PHYS14D          category
_MENT14D          category
CHECKUP1_GROUP    category
_LTASTH1          category
dtype: object


In [2]:
df['_LTASTH1'] = df['_LTASTH1'].map({1.0: 0, 2.0: 1})
df['_LTASTH1'] = df['_LTASTH1'].astype(int)


print(df['_LTASTH1'].dtype)  # 看看数据类型
print(df['_LTASTH1'].value_counts())  # 看看具体值

print(df['_LTASTH1'].isna().sum())  # 检查缺失值数量
print(df['_LTASTH1'].unique())  # 看看具体取值


int32
0    264659
1     42549
Name: _LTASTH1, dtype: int64
0
[0 1]


In [3]:
# 定义变量
demographic_vars = ["C(SEX)", "C(_AGE_GROUP)", "C(_RACEGR3)", "C(_EDUCA_GROUP)", "C(_INCOM_GROUP)"]
lifestyle_vars = ["C(_SMOKER3_GROUP)", "C(_BMI5CAT)", "C(_FRTLT1A)", "C(_VEGLT1A)", 
                  "C(_TOTINDA)", "C(_RFBING5)", "C(_ECIGSTS_GROUP)", "C(_PHYS14D)", 
                  "C(_MENT14D)", "C(CHECKUP1_GROUP)"]


all_vars = demographic_vars + lifestyle_vars

In [4]:
import numpy as np
import statsmodels.stats.weightstats as smw

def weighted_ci(data, weights, confidence=0.95):
    weighted_stats = smw.DescrStatsW(data, weights=weights)
    return weighted_stats.tconfint_mean(alpha=1-confidence)


# 先检查列名是否匹配
print("Available columns in df:", df.columns)

# 移除 "C()"，确保列名匹配
demographic_vars = ["SEX", "_AGE_GROUP", "_RACEGR3", "_EDUCA_GROUP", "_INCOM_GROUP"]
lifestyle_vars = ["_SMOKER3_GROUP", "_BMI5CAT", "_FRTLT1A", "_VEGLT1A", 
                  "_TOTINDA", "_RFBING5", "_ECIGSTS_GROUP", "_PHYS14D", 
                  "_MENT14D", "CHECKUP1_GROUP"]
asthma_vars = ['_LTASTH1']
all_vars = demographic_vars + lifestyle_vars

# 重新计算表格
table_data = []

for var in demographic_vars:
    if var not in df.columns:
        print(f"Warning: {var} not found in df, skipping...")
        continue  # 如果变量不在 df 中，跳过

    counts = df[var].value_counts(dropna=True)
    total = counts.sum()
    for category, count in counts.items():
        ci_str = weighted_ci(count, total)
        table_data.append([var, category, count, ci_str])

table_df = pd.DataFrame(table_data, columns=["Variable", "Category", "Count (n)", "Weighted % (95% CI)"])
print(table_df)



Available columns in df: Index(['SEX', '_AGE_GROUP', '_RACEGR3', '_EDUCA_GROUP', '_INCOM_GROUP',
       '_SMOKER3_GROUP', '_BMI5CAT', '_FRTLT1A', '_VEGLT1A', '_TOTINDA',
       '_RFBING5', '_ECIGSTS_GROUP', '_PHYS14D', '_MENT14D', 'CHECKUP1_GROUP',
       '_LTASTH1'],
      dtype='object')
        Variable  Category  Count (n)   Weighted % (95% CI)
0            SEX       2.0     164674  (164674.0, 164674.0)
1            SEX       1.0     142534  (142534.0, 142534.0)
2     _AGE_GROUP       2.0     119295  (119295.0, 119295.0)
3     _AGE_GROUP       3.0     101982  (101982.0, 101982.0)
4     _AGE_GROUP       1.0      85931    (85931.0, 85931.0)
5       _RACEGR3       1.0     240556  (240556.0, 240556.0)
6       _RACEGR3       5.0      22901    (22901.0, 22901.0)
7       _RACEGR3       2.0      22687    (22687.0, 22687.0)
8       _RACEGR3       3.0      14543    (14543.0, 14543.0)
9       _RACEGR3       4.0       6521      (6521.0, 6521.0)
10  _EDUCA_GROUP       2.0     290086  (290086.0,

In [5]:
# 重新计算表格
table_data = []

for var in lifestyle_vars:
    if var not in df.columns:
        print(f"Warning: {var} not found in df, skipping...")
        continue  # 如果变量不在 df 中，跳过

    counts = df[var].value_counts(dropna=True)
    total = counts.sum()
    for category, count in counts.items():
        ci_str = weighted_ci(count, total)
        table_data.append([var, category, count, ci_str])

table_df = pd.DataFrame(table_data, columns=["Variable", "Category", "Count (n)", "Weighted % (95% CI)"])
print(table_df)

          Variable  Category  Count (n)   Weighted % (95% CI)
0   _SMOKER3_GROUP       3.0     173790  (173790.0, 173790.0)
1   _SMOKER3_GROUP       2.0      88085    (88085.0, 88085.0)
2   _SMOKER3_GROUP       1.0      45333    (45333.0, 45333.0)
3         _BMI5CAT       3.0     111677  (111677.0, 111677.0)
4         _BMI5CAT       4.0      98251    (98251.0, 98251.0)
5         _BMI5CAT       2.0      92842    (92842.0, 92842.0)
6         _BMI5CAT       1.0       4438      (4438.0, 4438.0)
7         _FRTLT1A       1.0     200302  (200302.0, 200302.0)
8         _FRTLT1A       2.0     106906  (106906.0, 106906.0)
9         _VEGLT1A       1.0     257634  (257634.0, 257634.0)
10        _VEGLT1A       2.0      49574    (49574.0, 49574.0)
11        _TOTINDA       1.0     228083  (228083.0, 228083.0)
12        _TOTINDA       2.0      79125    (79125.0, 79125.0)
13        _RFBING5       1.0     262045  (262045.0, 262045.0)
14        _RFBING5       2.0      45163    (45163.0, 45163.0)
15  _ECI

In [6]:
# 重新计算表格
table_data = []

for var in asthma_vars:
    if var not in df.columns:
        print(f"Warning: {var} not found in df, skipping...")
        continue  # 如果变量不在 df 中，跳过

    counts = df[var].value_counts(dropna=True)
    total = counts.sum()
    for category, count in counts.items():
        ci_str = weighted_ci(count, total)
        table_data.append([var, category, count, ci_str])

table_df = pd.DataFrame(table_data, columns=["Variable", "Category", "Count (n)", "Weighted % (95% CI)"])
print(table_df)

   Variable  Category  Count (n)   Weighted % (95% CI)
0  _LTASTH1         0     264659  (264659.0, 264659.0)
1  _LTASTH1         1      42549    (42549.0, 42549.0)


In [7]:
import pandas as pd
import scipy.stats as stats

# 定义变量组
demographic_vars = ["SEX", "_AGE_GROUP", "_RACEGR3", "_EDUCA_GROUP", "_INCOM_GROUP"]
lifestyle_vars = ["_SMOKER3_GROUP", "_BMI5CAT", "_FRTLT1A", "_VEGLT1A", 
                  "_TOTINDA", "_RFBING5", "_ECIGSTS_GROUP", "_PHYS14D", 
                  "_MENT14D", "CHECKUP1_GROUP"]
asthma_var = '_LTASTH1'

# 计算卡方检验
chi2_results = []

for var in demographic_vars + lifestyle_vars:
    contingency_table = pd.crosstab(df[var], df[asthma_var])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    chi2_results.append({"Variable": var, "Chi2": chi2, "p-value": p, "DOF": dof})

chi2_df = pd.DataFrame(chi2_results)
print(chi2_df)

          Variable         Chi2        p-value  DOF
0              SEX  1582.113553   0.000000e+00    1
1       _AGE_GROUP   602.626632  1.384489e-131    2
2         _RACEGR3   723.322111  3.105276e-155    4
3     _EDUCA_GROUP   269.518402   1.445013e-60    1
4     _INCOM_GROUP  2436.721457   0.000000e+00    2
5   _SMOKER3_GROUP   469.187263  1.310027e-102    2
6         _BMI5CAT  2058.806052   0.000000e+00    3
7         _FRTLT1A   109.836935   1.063924e-25    1
8         _VEGLT1A    45.557854   1.481962e-11    1
9         _TOTINDA   476.989951  9.654455e-106    1
10        _RFBING5    14.783221   1.206038e-04    1
11  _ECIGSTS_GROUP  1053.012840  2.193649e-229    2
12        _PHYS14D  7112.828402   0.000000e+00    2
13        _MENT14D  4901.325805   0.000000e+00    2
14  CHECKUP1_GROUP    69.142130   9.161725e-17    1


In [8]:
import numpy as np

def cramers_v(contingency_table):
    """计算 Cramér's V 以衡量两个分类变量的关联强度"""
    chi2, _, _, _ = stats.chi2_contingency(contingency_table)
    n = np.sum(contingency_table)  # 总样本数
    min_dim = min(contingency_table.shape) - 1  # k - 1
    return np.sqrt(chi2 / (n * min_dim))

# 计算 Cramér's V
cramers_v_results = []

for var in demographic_vars + lifestyle_vars:
    contingency_table = pd.crosstab(df[var], df[asthma_var])
    v_value = cramers_v(contingency_table)
    cramers_v_results.append({"Variable": var, "Cramér's V": v_value})

# 转换为 DataFrame 并展示结果
cramers_v_df = pd.DataFrame(cramers_v_results)
print(cramers_v_df)

          Variable                                         Cramér's V
0              SEX  _LTASTH1
0    0.077317
1    0.192830
dtype: fl...
1       _AGE_GROUP  _LTASTH1
0    0.047718
1    0.119009
dtype: fl...
2         _RACEGR3  _LTASTH1
0    0.052278
1    0.130383
dtype: fl...
3     _EDUCA_GROUP  _LTASTH1
0    0.031912
1    0.079588
dtype: fl...
4     _INCOM_GROUP  _LTASTH1
0    0.095953
1    0.239309
dtype: fl...
5   _SMOKER3_GROUP  _LTASTH1
0    0.042105
1    0.105009
dtype: fl...
6         _BMI5CAT  _LTASTH1
0    0.088199
1    0.219970
dtype: fl...
7         _FRTLT1A  _LTASTH1
0    0.020372
1    0.050808
dtype: fl...
8         _VEGLT1A  _LTASTH1
0    0.013120
1    0.032722
dtype: fl...
9         _TOTINDA  _LTASTH1
0    0.042453
1    0.105879
dtype: fl...
10        _RFBING5  _LTASTH1
0    0.007474
1    0.018640
dtype: fl...
11  _ECIGSTS_GROUP  _LTASTH1
0    0.063077
1    0.157316
dtype: fl...
12        _PHYS14D  _LTASTH1
0    0.163937
1    0.408862
dtype: fl...
13        _MENT14D  