# icd10 寻找SLE的患者，然后比对到蛋白质数据上

In [None]:
import pandas as pd

# Load the raw ICD-10 diagnosis table and the Olink proteomics table (both tab-separated).
icd10 = pd.read_csv('/Volumes/data_files/UKB_data/raw_data/icd10.tab', sep='\t')
proteomic = pd.read_csv('/Volumes/data_files/UKB_data/raw_data/olink.tsv', sep='\t')

# Flag column name -> ICD-10 code prefixes searched for in the '|'-separated p41270 field.
# Insertion order matters: later cells read the last six columns of the saved file.
ICD10_PREFIXES = {
    'SLE': ('M32',),                      # Systemic lupus erythematosus
    'RA': ('M05', 'M06'),                 # Rheumatoid arthritis (M05) and M06
    # NOTE(review): the original comment names Sjogren's syndrome as M35.0, but the
    # prefix 'M35' matches every M35* code. Kept as-is so results do not change
    # silently; confirm whether 'M350' was intended (the ICD-10 date cell uses 'M35' too).
    'SS': ('M35',),
    'Systemic Sclerosis': ('M34',),       # Systemic sclerosis
    'APS': ('D686',),                     # Antiphospholipid syndrome
    'Autoimmune thyroiditis': ('E063',),  # Autoimmune thyroiditis
}


def _has_icd10_prefix(diagnoses, prefixes):
    """Return 1 if any '|'-separated code in `diagnoses` starts with one of `prefixes`.

    Non-string input (e.g. NaN for participants without diagnoses) yields 0.
    """
    if not isinstance(diagnoses, str):
        return 0
    return int(any(code.startswith(prefixes) for code in diagnoses.split('|')))


for flag, prefixes in ICD10_PREFIXES.items():
    icd10[flag] = icd10['p41270'].apply(_has_icd10_prefix, args=(prefixes,))

# Per-disease case tables (names kept so other cells can keep using them).
slers = icd10[icd10['SLE'] == 1]
raers = icd10[icd10['RA'] == 1]
ssers = icd10[icd10['SS'] == 1]
Systemic_Sclerosis_er = icd10[icd10['Systemic Sclerosis'] == 1]
APS_er = icd10[icd10['APS'] == 1]
Autoimmune_thyroiditis_er = icd10[icd10['Autoimmune thyroiditis'] == 1]

case_tables = {
    'SLE': slers,
    'RA': raers,
    'SS': ssers,
    'Systemic Sclerosis': Systemic_Sclerosis_er,
    'APS': APS_er,
    'Autoimmune thyroiditis': Autoimmune_thyroiditis_er,
}

# Vectorised membership test: `isin` replaces a per-row scan of the case eid array.
for flag, cases in case_tables.items():
    proteomic[flag] = proteomic['eid'].isin(cases['eid']).astype(int)

proteomic.to_csv("/Volumes/data_files/UKB_data/processed_data/pro_sler", index=False)

In [11]:
# Participants with an ICD-10 SLE flag who also have proteomics data
# (541 people in the recorded run).
proteomic = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/pro_sler")
sle_all = proteomic.loc[proteomic['SLE'].eq(1)]
sle_all

Unnamed: 0,eid,a1bg,aamdc,aarsd1,abca2,abhd14b,abl1,abo,abraxas2,acaa1,...,znrf4,zp3,zp4,zpr1,SLE,RA,SS,Systemic Sclerosis,APS,Autoimmune thyroiditis
45,1021071,-0.1414,-0.52745,0.06600,-0.09420,-0.38320,-1.0009,-0.1367,-0.6064,-1.14930,...,-0.2405,0.2267,-0.1333,-0.3513,1,0,1,0,0,0
158,1075188,-0.1046,-0.65155,-1.43235,-0.08540,-0.40140,-0.6964,-2.0383,-0.2779,-0.00900,...,-0.1697,-0.7525,0.0609,0.1540,1,0,0,0,0,0
198,1097706,-0.0796,0.79315,,-0.26305,0.83300,,-1.9894,0.7588,1.32910,...,0.6039,0.3533,-0.3082,-0.0954,1,0,0,0,1,0
205,1099743,0.1981,-0.22985,-0.82860,0.27225,0.54620,-0.0889,2.8415,1.1232,-0.51940,...,,0.5345,,0.3562,1,1,1,0,0,0
312,1159180,0.0521,,0.54400,-0.09200,0.15410,-0.2625,0.7587,0.8438,0.09690,...,0.2569,,-0.2296,-0.2191,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52648,5840912,-0.0162,0.15445,-0.11095,0.04710,-0.00125,-0.6368,1.6961,0.3157,-0.84125,...,0.3290,2.1844,0.9185,0.6255,1,1,0,0,0,0
52698,5860742,0.1770,0.28905,0.05360,0.14385,-0.03370,0.0805,-3.1873,-0.2775,0.72470,...,-0.3364,-4.6763,-0.2716,-0.0382,1,0,0,0,0,0
52769,5893538,0.0406,-0.37105,0.05375,0.07650,-0.09920,-1.1005,0.3782,-0.3292,-1.20030,...,-0.5340,0.1885,0.0683,1.1436,1,0,0,0,0,0
52790,5902203,-0.0131,-0.15125,-0.32330,0.18055,0.00000,0.3543,-1.8423,0.4049,0.79300,...,0.9529,-0.6149,0.3848,1.1333,1,0,0,0,0,0


# Self-reported 数据获取

In [None]:
# "SLE":"1381"
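# Self-reported SLE count: 351. The original note attributes this to the full ~500k UKB cohort, but the code below restricts srd to participants with proteomics data — TODO confirm which population the 351 refers to.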

import numpy as np
# Proteomics table written by the ICD-10 flag cell; its eids define the cohort.
pro = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/pro_sler")
proteomic_eids = pro['eid']

# Self-report date table, restricted to the proteomics cohort.
srd_date = pd.read_csv("/Volumes/data_files/UKB_data/raw_data/srdTime.tab",sep='\t')
srd_date = srd_date.loc[srd_date['eid'].isin(proteomic_eids)]

# Self-reported illness code table, restricted to the proteomics cohort.
srd = pd.read_csv("/Volumes/data_files/UKB_data/raw_data/srd.tab",sep='\t')
srd = srd.loc[srd['eid'].isin(proteomic_eids)]

def find_columns_with_1464(row, code):
    """Return the suffixes of the columns in `row` whose value equals `code`.

    Despite the name, any code can be searched for. `code` is converted with
    float() before comparison. For every matching column the part of the
    column name after the first underscore is returned, e.g. "i0_a0" for
    "p20002_i0_a0".

    Parameters
    ----------
    row : pandas.Series
        One record; the index holds the column names.
    code : str or number
        Code to look for; must be convertible with float().

    Returns
    -------
    list of str
        Column-name suffixes in index order; empty when nothing matches.
    """
    return [
        "_".join(label.split("_")[1:])
        for label, value in row.items()
        if value == float(code)
    ]

# Self-reported illness code for each disease of interest.
srd_disease_code = {"RA":"1464","SLE":"1381","SS":"1382","Systemic Sclerosis":"1384","APS":"1564","Autoimmune thyroiditis":"1428"}

# Work on an independent copy: `srd` was produced by boolean filtering, and adding
# columns to such a slice triggers SettingWithCopyWarning.
srd = srd.copy()

# Scan only the columns present before any result column is added, so later diseases
# do not compare the code against the list-valued columns created for earlier ones.
source_columns = list(srd.columns)
for disease, code in srd_disease_code.items():
    srd[disease] = srd[source_columns].apply(find_columns_with_1464, axis=1, args=(code,))


# .copy() so the per-disease assignment below modifies this frame, not a view of `srd`.
immune_srd = srd[['eid','RA', 'SLE', 'SS', 'Systemic Sclerosis', 'APS', 'Autoimmune thyroiditis']].copy()
# Unwrap the lists: keep the first matching column suffix, or NaN when there is no match.
for disease in srd_disease_code.keys():
    immune_srd[disease] = immune_srd[disease].apply(lambda x: x[0] if len(x) > 0 else np.nan)
immune_srd.to_csv("/Volumes/data_files/UKB_data/processed_data/immune_srd.csv",index=False)


In [12]:
# Reload the self-report table and keep the participants with a self-reported SLE entry.
immune_srd = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_srd.csv")
sle_srd = immune_srd.dropna(subset=['SLE'])
sle_srd

Unnamed: 0,eid,RA,SLE,SS,Systemic Sclerosis,APS,Autoimmune thyroiditis
35,1712779,i0_a2,i0_a6,i0_a1,,,
240,5702638,,i0_a5,,,,
261,5948965,,i0_a1,,,,
341,2173891,,i0_a0,,,,
530,5947167,,i0_a1,,,,
...,...,...,...,...,...,...,...
52547,3046806,,i0_a1,,,,
52597,4985642,,i0_a2,,,,
52663,1358711,,i0_a0,,,,
52747,1314680,i0_a5,i0_a3,,,,


# ICD10 SLE 时间数据获取

In [None]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pro = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/pro_sler")
pro = pro.fillna(pro.median(numeric_only=True))

dates = pd.read_csv("/Volumes/data_files/UKB_data/raw_data/icd10Date.tab",sep='\t')
icd10 = pd.read_csv("/Volumes/data_files/UKB_data/raw_data/icd10.tab",sep='\t')
# Restrict to the proteomics cohort. .copy() makes this an independent frame, so the
# column assignments below do not write into a slice of `icd10` (the resulting
# SettingWithCopyWarning would otherwise be hidden by the global warnings filter).
code_icd10 = icd10[icd10['eid'].isin(pro['eid'])].copy()
# Split 'p41270' on '|' into a list of codes (missing values become ['']).
code_icd10['p41270'] = code_icd10['p41270'].fillna('').astype(str)
code_icd10['icd10_list'] = code_icd10['p41270'].str.split('|')

# Disease -> '|'-separated ICD-10 prefixes.
disease_code = {"RA":"M05|M06", "SLE":"M32","SS":"M35","Systemic Sclerosis":"M34",
                "APS":"D686","Autoimmune thyroiditis":"E063"}

# The disease flags are expected to be the last six columns of the proteomics file.
disease_column = pro.columns[-6:]
unknown_diseases = [name for name in disease_column if name not in disease_code]
if unknown_diseases:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise KeyError(f"No ICD-10 prefixes defined for trailing columns: {unknown_diseases}")

for disease in disease_column:
    codes = disease_code[disease].split('|')
    code_name = disease + "_icd10_index"
    print("code: ",codes,"code name: ",code_name)
    # Positions within the participant's code list whose code starts with any prefix.
    code_icd10[code_name] = code_icd10['icd10_list'].apply(
        lambda x, prefixes=tuple(codes): [i for i, icd_code in enumerate(x) if icd_code.startswith(prefixes)]
    )
    
def extract_dates(row, date_df,col_name):
    """Collect the p41280 date entries that correspond to a row's code positions.

    Parameters
    ----------
    row : pandas.Series
        Record holding 'eid' and, under `col_name`, a list of integer positions.
    date_df : pandas.DataFrame
        Date table with an 'eid' column and 'p41280_a<idx>' columns.
        NOTE(review): assumes array index <idx> lines up with the position of the
        code in the split p41270 list — confirm against the data export.
    col_name : str
        Name of the entry in `row` that holds the list of positions.

    Returns
    -------
    dict
        Maps 'p41280_a<idx>' to the value found for this eid. Positions without
        a matching column are skipped. Empty when there are no positions or the
        eid is absent from `date_df`.
    """
    indices = row[col_name]
    if len(indices) == 0:
        # Most participants have no matching code: skip the table scan entirely.
        return {}
    matches = date_df[date_df['eid'] == row['eid']]
    if matches.empty:
        # No date record for this participant; previously this raised IndexError.
        return {}
    date_row = matches.iloc[0]
    return {
        f'p41280_a{idx}': date_row[f'p41280_a{idx}']
        for idx in indices
        if f'p41280_a{idx}' in date_row
    }

# For every disease, look up the dates that belong to the matched code positions.
for disease in disease_column:
    code_icd10[f"{disease}_dates"] = code_icd10.apply(
        extract_dates, axis=1, args=(dates, f"{disease}_icd10_index")
    )

# Helper that picks the earliest date out of a {column: date} mapping.
def find_earliest_date(dates_dict):
    """Return the smallest non-missing value in `dates_dict`, or None.

    Missing entries (NaN/None/NaT) are ignored: mixing them with date strings
    would make `min` raise TypeError. The remaining values are compared with
    `min` as they are; for ISO 'YYYY-MM-DD' strings that ordering is
    chronological.

    Returns None when the mapping is empty or holds only missing values.
    """
    if not dates_dict:  # empty mapping
        return None
    known_dates = [value for value in dates_dict.values() if pd.notna(value)]
    if not known_dates:
        return None
    return min(known_dates)

# Reduce each per-disease date mapping to its earliest date.
for disease in disease_column:
    source_col = f"{disease}_dates"
    code_icd10[f"earliest_{source_col}"] = code_icd10[source_col].apply(find_earliest_date)

# Output columns, in the fixed order the downstream cells expect.
earliest_columns = ['eid'] + [
    f"earliest_{name}_dates"
    for name in ('SLE', 'RA', 'SS', 'Systemic Sclerosis', 'APS', 'Autoimmune thyroiditis')
]
immune_icd10_date = code_icd10[earliest_columns]
immune_icd10_date.to_csv("/Volumes/data_files/UKB_data/processed_data/immune_icd10_date.csv",index=False)


In [13]:
# Reload the earliest-date table and keep the participants that have an SLE date.
immune_icd10_date = pd.read_csv(
    "/Volumes/data_files/UKB_data/processed_data/immune_icd10_date.csv"
).dropna(subset=['earliest_SLE_dates'])
sle_date_eid = immune_icd10_date.loc[:, ['eid', 'earliest_SLE_dates']]
sle_date_eid

Unnamed: 0,eid,earliest_SLE_dates
35,1712779,2006-10-06
81,5000606,2003-09-06
159,3305003,1997-07-03
240,5702638,2013-05-01
261,5948965,2003-12-03
...,...,...
52747,1314680,2000-11-28
52773,3711114,2008-02-04
52820,4579293,2006-08-09
52866,5902203,2002-08-04


# 将上面的数据整合成一个

In [None]:
import pandas as pd

immune_icd10_date = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_icd10_date.csv")
immune_srd = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_srd.csv")
# b_cols is the pre-processed baseline table that was originally prepared in R
# (an Rdata file) and converted to CSV so it can be handled in Python.
b_cols = pd.read_csv("/Volumes/data_files/UKB_data/all_with_colnames.csv")
b_cols = b_cols[b_cols["eid"].isin(immune_icd10_date["eid"])]
select_b_col = b_cols[['eid','sex','ethnicity','bmi', 'date_attend', 'birth_year', 'birth_month']]

# Rename by explicit mapping rather than by position: assigning a nested list to
# `.columns` builds a one-level MultiIndex and silently depends on column order.
immune_diseases = ['SLE', 'RA', 'SS', 'Systemic Sclerosis', 'APS', 'Autoimmune thyroiditis']
icd10_renames = {f"earliest_{name}_dates": f"icd10_{name}_dates" for name in immune_diseases}
srd_renames = {name: f"srd_{name}" for name in immune_diseases}

immune_basic = (
    immune_icd10_date.rename(columns=icd10_renames)
    .merge(immune_srd.rename(columns=srd_renames), on="eid", how="outer")
    .merge(select_b_col, on="eid", how="outer")
)
immune_basic.to_csv("/Volumes/data_files/UKB_data/processed_data/immune_basic.csv", index=False)

In [16]:
# Reload the merged table and keep only the SLE-related and baseline columns.
sle_basic_columns = ["eid","icd10_SLE_dates","srd_SLE",'sex', 'ethnicity', 'bmi', 'date_attend', 'birth_year', 'birth_month']
immune_basic = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_basic.csv")
immune_basic = immune_basic.loc[:, sle_basic_columns]
immune_basic

Unnamed: 0,eid,icd10_SLE_dates,srd_SLE,sex,ethnicity,bmi,date_attend,birth_year,birth_month
0,1000044,,,1.0,1001.0,30.3114,2009-11-16,1957.0,8.0
1,1000102,,,0.0,1001.0,25.6596,2009-12-11,1944.0,12.0
2,1000228,,,0.0,1001.0,23.3401,2009-10-27,1950.0,8.0
3,1000333,,,0.0,1001.0,22.6304,2009-04-24,1955.0,9.0
4,1000405,,,0.0,1001.0,23.9512,2008-09-18,1941.0,7.0
...,...,...,...,...,...,...,...,...,...
53008,6024526,,,1.0,4002.0,21.0925,2009-01-21,1966.0,4.0
53009,6024720,,,1.0,1001.0,28.3124,2009-07-13,1946.0,5.0
53010,6024802,,,1.0,1003.0,22.8571,2009-06-19,1939.0,8.0
53011,6024868,,,1.0,1001.0,28.2167,2009-07-20,1961.0,10.0


# 随访时间计算

In [None]:
import pandas as pd
import numpy as np

immune = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_basic.csv")
# The list was previously bound to the misspelt name `sease`, so the loop below
# only ran when a `disease` variable had leaked in from another cell.
disease = ['SLE','RA','SS', 'Systemic Sclerosis', 'APS', 'Autoimmune thyroiditis']
disease_date = ['icd10_SLE_dates', 'icd10_RA_dates', 'icd10_SS_dates', 'icd10_Systemic Sclerosis_dates', 'icd10_APS_dates', 'icd10_Autoimmune thyroiditis_dates']

# Follow-up length in years from the attendance date to the fixed end date 2023-10.
immune['date_attend'] = pd.to_datetime(immune['date_attend'], errors='coerce')
immune["fp-len"] = (pd.to_datetime("2023-10") - immune['date_attend']).dt.days / 365.25

for d, d_date in zip(disease, disease_date):
    immune[d_date] = pd.to_datetime(immune[d_date], errors='coerce')
    # Years from attendance to the disease date (disease date minus date_attend);
    # negative when the disease date precedes attendance, NaN when there is no date.
    years_to_disease = (immune[d_date] - immune['date_attend']).dt.days / 365.25
    # Rows without a disease date get the full follow-up length instead.
    immune[f"fp-{d}"] = years_to_disease.fillna(immune["fp-len"])

# Age at attendance: year difference, minus one when the attendance month is
# earlier than the birth month.
immune['age'] = (immune['date_attend'].dt.year - immune['birth_year']) - \
                ((immune['date_attend'].dt.month < immune['birth_month']).astype(int))

immune.to_csv("/Volumes/data_files/UKB_data/processed_data/immune_basic_fp.csv", index=False)

In [23]:
# Count the participants whose SLE follow-up time is strictly positive.
immune = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_basic_fp.csv")
immune = immune.loc[:, ['eid','fp-len','fp-SLE']]
positive_follow_up = immune['fp-SLE'].gt(0)
sle_fp_eid = immune.loc[positive_follow_up, 'eid']
print("SLE 文章中的case+control人数：",len(sle_fp_eid))


SLE 文章中的case+control人数： 52747


In [24]:
# Merge proteomics + ICD-10 + self-report + follow-up data into one file, immune_pro.csv.
immune_basic = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_basic_fp.csv")
pro = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/pro_sler")
immune_pro = pro.merge(immune_basic, on='eid', how='left')
# Persist the merged table: it is read back on the next line, and without this
# write the file only exists if it was produced outside the notebook.
immune_pro.to_csv("/Volumes/data_files/UKB_data/processed_data/immune_pro.csv", index=False)

pro = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_pro.csv")
# Fill missing numeric values with the column median.
pro = pro.fillna(pro.median(numeric_only=True))
sle_patient = pro[pro['SLE'] == 1]
sle_patient_eid = sle_patient['eid']
# `sle_srd` comes from the earlier cell that reloads immune_srd.csv.
sle_srd_eid = sle_srd['eid']

print("sle_patient num",len(sle_patient_eid))
print("sle_srd num",len(sle_srd_eid))

  pro = pd.read_csv("/Volumes/data_files/UKB_data/processed_data/immune_pro.csv")


sle_patient num 541
sle_srd num 351


In [26]:
# Keep participants with strictly positive SLE follow-up, then split by the SLE flag.
pro = pro.loc[pro['fp-SLE'].gt(0)]
sle_flag = pro['SLE']
sle_case = pro[sle_flag.eq(1)]
sle_control = pro[sle_flag.eq(0)]
print("sle case+control num",len(pro['eid']))
print("sle case num",len(sle_case))
print("sle control num",len(sle_control))


sle case+control num 52747
sle case num 275
sle control num 52472
