In [None]:
import pandas as pd

# Load the processed immune table and the baseline table with column names.
immune_pro = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_pro.csv")
b_cols = pd.read_csv("/Volumes/data_files/UKB_data/all_with_colnames.csv")

# Keep only the baseline rows whose `eid` also occurs in the immune table.
in_immune_table = b_cols["eid"].isin(immune_pro["eid"])
b_cols = b_cols.loc[in_immune_table]
print(b_cols.columns.tolist())

# Covariates carried forward into the downstream cells.
covariate_columns = [
    'eid',
    'sex',
    'bmi',
    'smoke_ever',
    'smoke_status',
    'alcohol_status',
    'alcohol_amount',
]
select_b_col = b_cols[covariate_columns]


In [None]:
# Build the SLE analysis table: take the SLE-related columns from the immune
# table, inner-join the selected baseline covariates on `eid`, and keep the
# analysis columns in a fixed order.
sle_source_columns = ['eid', 'age', 'ethnicity', "fp-SLE", 'SLE']
sle_analysis_columns = [
    'eid', 'sex', 'age', 'bmi', 'ethnicity',
    'smoke_ever', 'alcohol_amount', 'fp-SLE', 'SLE',
]
sle_pro = (
    immune_pro[sle_source_columns]
    .merge(select_b_col, on='eid', how='inner')
    .loc[:, sle_analysis_columns]
)

In [None]:
# Keep only rows with a strictly positive "fp-SLE" value, then split the
# remaining rows by the SLE indicator (0 -> control, 1 -> case).
has_positive_fp = sle_pro["fp-SLE"].gt(0)
sle_pro = sle_pro.loc[has_positive_fp]
print("all participant", len(sle_pro))

is_control = sle_pro["SLE"].eq(0)
sle_control = sle_pro.loc[is_control]
print("control", len(sle_control))

is_case = sle_pro["SLE"].eq(1)
sle_case = sle_pro.loc[is_case]
print("case", len(sle_case))

In [None]:
# Descriptive counts: for the whole cohort, the controls and the cases, select
# the rows matching one ethnicity / sex / smoke_ever code and report the row
# count together with its share of the respective group.


def _share(subset, group):
    """Return len(subset) / len(group), or NaN when `group` has no rows.

    Guarding the denominator keeps the summary printable even if one of the
    groups (e.g. the cases) turns out to be empty.
    """
    return len(subset) / len(group) if len(group) else float("nan")


# NOTE(review): the codes are taken at face value from the variable names
# (ethnicity 1001.0 -> "white", sex 0 -> "female") -- confirm against the
# data dictionary.
sle_pro_white = sle_pro[sle_pro['ethnicity'] == 1001.0]
sle_pro_female = sle_pro[sle_pro['sex'] == 0]

sle_control_white = sle_control[sle_control['ethnicity'] == 1001.0]
sle_control_female = sle_control[sle_control['sex'] == 0]

sle_case_white = sle_case[sle_case['ethnicity'] == 1001.0]
sle_case_female = sle_case[sle_case['sex'] == 0]

# NOTE(review): rows with smoke_ever == 0 are reported under the label
# "smoke". If 0 encodes "never smoked", these are the non-smokers -- confirm
# the coding before interpreting the printed shares.
sle_pro_smoke = sle_pro[sle_pro['smoke_ever'] == 0]
sle_control_smoke = sle_control[sle_control['smoke_ever'] == 0]
sle_case_smoke = sle_case[sle_case['smoke_ever'] == 0]

# (label, selected rows, group the share is relative to), in print order.
share_rows = [
    ("all white", sle_pro_white, sle_pro),
    ("all female", sle_pro_female, sle_pro),
    ("control white", sle_control_white, sle_control),
    ("control female", sle_control_female, sle_control),
    ("case white", sle_case_white, sle_case),
    ("case female", sle_case_female, sle_case),
    ("all smoke", sle_pro_smoke, sle_pro),
    ("control smoke", sle_control_smoke, sle_control),
    ("case smoke", sle_case_smoke, sle_case),
]
for label, subset, group in share_rows:
    print(label, len(subset), _share(subset, group))


In [None]:
from tqdm import tqdm
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency
from statsmodels.stats.multitest import multipletests

# Compare baseline characteristics between SLE controls (SLE == 0) and SLE
# cases (SLE == 1): a two-sample t-test for the columns in `t_test_columns`,
# a chi-square test of independence for the columns in `chi2_columns`, then
# multiple-testing correction across all tested variables. The table is
# written to "participants_characteristics.csv".
results = []
t_test_columns = ['age', 'bmi']
# NOTE(review): 'alcohol_amount' is tested as a categorical variable here; if
# it is a continuous quantity a t-test / rank test may fit better -- confirm.
chi2_columns = ['sex', 'ethnicity', 'smoke_ever', 'alcohol_amount']

# Group sizes do not depend on the variable under test, so report them once,
# each under the label of the group it actually describes.
print("control group: ", len(sle_control))
print("case group: ", len(sle_case))

# Build the chi-square tables from exactly the two groups the t-tests use, so
# a row whose SLE value is neither 0 nor 1 cannot add an extra table row.
compared_groups = pd.concat([sle_control, sle_case])

for column in tqdm(t_test_columns + chi2_columns, desc="Processing variables"):
    if column in t_test_columns:
        control_values = sle_control[column]
        case_values = sle_case[column]
        # Argument order is (control, case): a positive statistic means the
        # control mean is larger. Missing values are dropped per group.
        t_stat, p_value = ttest_ind(control_values, case_values, nan_policy='omit')
        results.append({
            "variable": column,
            "method": "t-test",
            "statistic": t_stat,
            "p_value": p_value
        })
    else:
        # Rows: SLE status; columns: observed categories of the variable.
        contingency_table = pd.crosstab(compared_groups["SLE"], compared_groups[column])
        chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
        results.append({
            "variable": column,
            "method": "chi2",
            "statistic": chi2_stat,
            "p_value": p_value
        })

results_df = pd.DataFrame(results)
# Index [1] of the multipletests return value holds the corrected p-values.
results_df["FDR_P-value"] = multipletests(results_df["p_value"], method="fdr_bh")[1]
results_df["Bonferroni_P-value"] = multipletests(results_df["p_value"], method="bonferroni")[1]
results_df.to_csv("participants_characteristics.csv", index=False)