In [None]:
import pandas as pd
import numpy as np
import os

# Tab-separated source tables to scan. Each one is expected to carry an 'eid'
# column: it is requested through `usecols` and is the merge key below.
data_path_list = [
    '/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/core_category/fields_group_1.tsv',
    '/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/core_category/fields_group_39.tsv',
    '/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/third_category/third_group_5.tsv',
    '/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/third_category/third_group_6.tsv',
]
# Columns to extract: eight single-valued fields plus array items 1-20 of p22009.
fields_ids = [
    'p31', 'p21022', 'p22001', 'p22006', 'p22019', 'p22020', 'p22021', 'p22027',
] + [f'p22009_a{i}' for i in range(1, 21)]

# For each file, inspect the header first and then load only the wanted columns.
wanted_fields = set(fields_ids)
loaded_fields = set()  # fields already taken from an earlier file
data_frames = []
for file_path in data_path_list:
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    # nrows=0 parses the header only, so this check does not read the data rows
    columns = pd.read_csv(file_path, sep='\t', nrows=0).columns.tolist()
    # keep the file's own column order instead of an unordered set
    overlap_columns = [col for col in columns if col in wanted_fields]
    if not overlap_columns:
        print(f"No relevant fields found in {file_path}")
        continue
    # A field that was already loaded from an earlier file is not loaded again:
    # merging two frames that share a non-key column would rename it to
    # '<field>_x' / '<field>_y' and the plain field name would disappear.
    new_columns = [col for col in overlap_columns if col not in loaded_fields]
    if not new_columns:
        print(f"All relevant fields in {file_path} were already loaded from an earlier file")
        continue
    loaded_fields.update(new_columns)
    df = pd.read_csv(file_path, sep='\t', usecols=['eid'] + new_columns, header=0)
    data_frames.append(df)

# Report requested fields that no existing file provided, so that a missing
# column is noticed here rather than as a KeyError in a later cell.
missing_fields = [field for field in fields_ids if field not in loaded_fields]
if missing_fields:
    print(f"Requested fields not found in any file: {missing_fields}")

# Merge all data frames on 'eid'; the outer join keeps every eid seen in any file.
if data_frames:
    merged_df = data_frames[0]
    for df in data_frames[1:]:
        merged_df = pd.merge(merged_df, df, on='eid', how='outer')

    # save the merged dataframe to a new file
    output_file = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/qc_covars.tsv'
    merged_df.to_csv(output_file, sep='\t', index=False, header=True)
    print(f"Merged data saved to {output_file}")
else:
    print("No data was loaded; no output file was written")

In [None]:
# sample QC
import pandas as pd

# Input: the merged covariate table; output: the subset of rows that pass QC.
qc_covars_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/qc_covars.tsv"
output_file_qced = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/qc_covars_qced.tsv'

covars = pd.read_csv(qc_covars_path, sep='\t')

# One boolean mask per criterion, so each rule can be inspected on its own.
# NOTE(review): presumably these are UK Biobank fields (31 vs 22001 sex
# concordance, 22019 / 22027 exclusion flags, 22021 kinship) — confirm the
# field meanings against the data dictionary.
p31_equals_p22001 = covars["p31"].eq(covars["p22001"])  # a missing value on either side fails
p22019_missing = covars["p22019"].isna()
p22021_not_10 = covars["p22021"].ne(10)  # a missing p22021 is unequal to 10, so it passes
p22027_missing = covars["p22027"].isna()

# A row is kept only when all four criteria hold.
keep_rows = p31_equals_p22001 & p22019_missing & p22021_not_10 & p22027_missing
covars_qced = covars[keep_rows]

# Write the retained rows and report how many there are.
covars_qced.to_csv(output_file_qced, sep='\t', index=False, header=True)
print(f"QCed covariates saved to {output_file_qced}")
print(f"Number of samples after QC: {len(covars_qced)}")

In [None]:
# split qc_covars_qced into two cohorts: white British and Chinese
import pandas as pd

# ---- Inputs ----
white_british_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/cohort/white_british_sample_ids.txt"
chinese_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/cohort/chinese_sample_ids.txt"
qc_covars_qced_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/qc_covars_qced.tsv"

# ---- Outputs: one full covariate table (.tsv) and one FID/IID list (.txt) per cohort ----
output_file_eur_qced = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_white_british_qced.tsv'
output_file_eas_qced = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_chinese_qced.tsv'
output_list_eur_qced = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_white_british_qced.txt'
output_list_eas_qced = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_chinese_qced.txt'

# Cohort ID lists are read as headerless, tab-separated, two-column files (FID, IID).
id_columns = ['FID', "IID"]
white_british_ids = pd.read_csv(white_british_path, header=None, names=id_columns, sep='\t')
chinese_ids = pd.read_csv(chinese_path, header=None, names=id_columns, sep='\t')

# QC-passed covariates; FID and IID are both copies of 'eid' so the table can
# be joined against the ID lists on the (FID, IID) pair.
qc_covars_qced = pd.read_csv(qc_covars_qced_path, sep='\t').assign(
    FID=lambda frame: frame['eid'],
    IID=lambda frame: frame['eid'],
)

# White British cohort: inner join on the ID list, then keep only rows where
# both p22006 and p22020 equal 1.
# NOTE(review): presumably p22006 is the genetic ethnic grouping flag and
# p22020 the "used in genetic PCs" flag — confirm against the data dictionary.
merge_df_eur_qced = white_british_ids.merge(qc_covars_qced, on=id_columns, how='inner')
eur_keep = merge_df_eur_qced['p22006'].eq(1) & merge_df_eur_qced['p22020'].eq(1)
merge_df_eur_qced = merge_df_eur_qced[eur_keep]

# Chinese cohort: inner join on the ID list only, with no additional field filter.
merge_df_eas_qced = chinese_ids.merge(qc_covars_qced, on=id_columns, how='inner')

# Sanity check: warn if any row is entirely NA once the identifier columns
# (FID, IID, eid) are set aside.
identifier_columns = ['FID', 'IID', 'eid']
check_eur = merge_df_eur_qced.drop(columns=identifier_columns).isna().all(axis=1)
check_eas = merge_df_eas_qced.drop(columns=identifier_columns).isna().all(axis=1)
if check_eur.any():
    print("Warning: There are full NA rows in the merged white British DataFrame.")
if check_eas.any():
    print("Warning: There are full NA rows in the merged Chinese DataFrame.")

# Write the white British cohort: full table with header, then a headerless FID/IID list.
merge_df_eur_qced.to_csv(output_file_eur_qced, sep='\t', index=False, header=True)
merge_df_eur_qced.loc[:, ['FID', 'IID']].to_csv(output_list_eur_qced, sep='\t', index=False, header=False)
print(f"Merged data with white British sample IDs after QC saved to {output_file_eur_qced}")
print(f"Number of samples in white British cohort after QC: {len(merge_df_eur_qced)}")

# Write the Chinese cohort in the same two formats.
merge_df_eas_qced.to_csv(output_file_eas_qced, sep='\t', index=False, header=True)
merge_df_eas_qced.loc[:, ['FID', 'IID']].to_csv(output_list_eas_qced, sep='\t', index=False, header=False)
print(f"Merged data with Chinese sample IDs after QC saved to {output_file_eas_qced}")
print(f"Number of samples in Chinese cohort after QC: {len(merge_df_eas_qced)}")