In [None]:
# 5_get_pheno.py
# This script merges multiple phenotype files from the UK Biobank into a single file.

import pandas as pd
import numpy as np
import os

# Input TSV files to scan for the requested fields. Every file lives in the
# same directory and differs only by its group number, so the list is built
# from those two pieces.
_core_category_dir = '/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/core_category'
_field_group_numbers = (1, 2, 17, 37, 39, 73, 75, 76, 83, 84, 85, 86, 87, 88, 91, 92)
data_path_list = [
    f'{_core_category_dir}/fields_group_{group_number}.tsv'
    for group_number in _field_group_numbers
]

# Column names to extract, written as p<field>_i<instance>[_a<array>].
# The names are generated group by group; fields in the same group share the
# same number of instances (and array slots).
fields_ids = (
    # four instances, no array index
    [f'{field}_i{inst}' for field in ('p48', 'p50') for inst in range(4)]
    # four instances, two array slots each
    + [
        f'{field}_i{inst}_a{arr}'
        for field in ('p102', 'p4079', 'p4080')
        for inst in range(4)
        for arr in range(2)
    ]
    # four instances, no array index
    + [
        f'{field}_i{inst}'
        for field in ('p20116', 'p20117', 'p21001', 'p21002')
        for inst in range(4)
    ]
    # three instances
    + [
        f'{field}_i{inst}'
        for field in ('p30000', 'p30010', 'p30020', 'p30080',
                      'p30120', 'p30130', 'p30140', 'p30150')
        for inst in range(3)
    ]
    # two instances
    + [
        f'{field}_i{inst}'
        for field in ('p30620', 'p30650', 'p30670', 'p30690', 'p30700', 'p30730',
                      'p30740', 'p30760', 'p30780', 'p30870', 'p30880')
        for inst in range(2)
    ]
)

# Read, from every input file that exists, the participant id column plus
# whichever of the requested field columns that file actually provides.
data_frames = []
for file_path in data_path_list:
    if os.path.exists(file_path):
        # Header-only read: learn the column names without loading the data.
        columns = list(pd.read_csv(file_path, sep='\t', nrows=0).columns)
        overlap_columns = set(fields_ids) & set(columns)
        if overlap_columns:
            wanted_columns = ['eid', *overlap_columns]
            df = pd.read_csv(
                file_path,
                sep='\t',
                usecols=wanted_columns,
                header=0,
                na_values=['NA', '.', '-9', ''],
            )
            data_frames.append(df)
        else:
            print(f"No overlapping columns found in {file_path}.")
    else:
        print(f"File {file_path} does not exist.")

# Outer-join the per-file tables on 'eid' so a participant present in any
# file is kept, then write the combined table as one TSV.
if not data_frames:
    print("No data frames to merge. Please check the input files.")
else:
    merge_df = data_frames[0]
    for df in data_frames[1:]:
        merge_df = merge_df.merge(df, on='eid', how='outer')

    output_file = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/trait_ukb.tsv"
    merge_df.to_csv(output_file, sep='\t', index=False, header=True, na_rep='')
    print(f"Data merged successfully and saved to {output_file}.")


In [None]:
# 7_merge_ukb_pheno.py

# merge all instances and arrays into a single column
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, rankdata
import os
import re

# Apply seaborn's "whitegrid" style to every figure drawn below.
sns.set(style="whitegrid")

def inverse_normal_transform(x):
    """Apply a rank-based inverse normal transformation to a series.

    Non-missing values are ranked (ties receive their average rank), each
    rank ``r`` is turned into the quantile ``(r - 0.5) / n`` where ``n`` is
    the number of non-missing values, and that quantile is passed through
    the inverse CDF of the standard normal distribution. Missing values stay
    missing and keep their position.

    Args:
        x: pandas Series of numeric values. NaN and ``None`` entries are
            treated as missing.

    Returns:
        A float pandas Series with the same index as ``x``.
    """
    # Coerce to float first so integer and object-dtype input (for example
    # numbers mixed with None) can be tested with np.isnan.
    values = np.asarray(x, dtype=float)
    is_na = np.isnan(values)
    ranks = rankdata(values[~is_na], method='average')
    # Start from all-NaN so missing positions need no separate assignment.
    transformed = np.full(values.shape, np.nan)
    transformed[~is_na] = norm.ppf((ranks - 0.5) / len(ranks))
    return pd.Series(transformed, index=x.index)

def change_category_to_binary(x):
    """Collapse a three-level categorical series onto the two codes 1 and 2.

    Every -3 is first replaced by NaN. If exactly three distinct non-missing
    values remain, the two smallest are recoded to 1 and the largest to 2.
    With any other number of distinct values the series is returned with
    only the -3 replacement applied.

    Note that, despite the function name, the output codes are 1/2 rather
    than 0/1.

    Args:
        x: pandas Series of category codes.

    Returns:
        A pandas Series sharing the index of ``x``.
    """
    cleaned = x.replace(-3, np.nan)
    levels = cleaned.dropna().unique()
    if len(levels) == 3:
        low, middle, high = sorted(levels)
        # The two lower levels share code 1; the top level becomes code 2.
        cleaned = cleaned.map({low: 1, middle: 1, high: 2})
    return pd.Series(cleaned, index=cleaned.index)

def is_categorical(series, max_n_unique=20, min_n_unique=2):
    """Decide whether a series should be handled as a categorical variable.

    A series counts as categorical when, ignoring missing values, all of the
    following hold:
      * every distinct value is integer-valued (integer dtype, or floats
        such as 2.0),
      * the number of distinct values lies between ``min_n_unique`` and
        ``max_n_unique`` (both inclusive),
      * the distinct values are not just a subset of {0, 1}.

    Args:
        series: pandas Series of numeric values.
        max_n_unique: largest number of distinct values still considered
            categorical.
        min_n_unique: smallest number of distinct values considered
            categorical.

    Returns:
        True if the series is categorical by the rules above, else False.
    """
    observed = series.dropna()
    unique_vals = set(observed.unique())

    # Integer dtypes qualify directly; other dtypes qualify only if every
    # distinct value is a whole number.
    if observed.dtype.kind not in 'iu' and not all(float(v).is_integer() for v in unique_vals):
        return False
    # Series made only of 0/1 are deliberately not reported as categorical.
    if unique_vals.issubset({0, 1}):
        return False
    return min_n_unique <= len(unique_vals) <= max_n_unique

def process_one_phenotype(df, pheno_prefix, output_dir='/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/'):
    """Collapse one phenotype's instance/array columns, transform it, and plot it.

    Columns named ``<pheno_prefix>_i<N>`` or ``<pheno_prefix>_i<N>_a<M>`` are
    combined into a single raw column holding, for each row, the first
    non-missing value in column order. If ``is_categorical`` reports the raw
    column as categorical it is recoded with ``change_category_to_binary``;
    otherwise it is passed through ``inverse_normal_transform``. A two-panel
    figure (raw vs. transformed) is saved as
    ``<output_dir>/distribution/<pheno_prefix>_dist.png``.

    Args:
        df: DataFrame with an ``eid`` column and the phenotype columns.
        pheno_prefix: column prefix of the phenotype, e.g. ``'p48'``.
        output_dir: directory under which the ``distribution`` figure folder
            is created.

    Returns:
        DataFrame with columns ``eid``, ``<pheno_prefix>_raw`` and
        ``<pheno_prefix>_int``, or None when no column matches the prefix.
    """
    # Escape the prefix so regex metacharacters in it cannot alter the match.
    pattern = re.compile(rf"^{re.escape(pheno_prefix)}_i\d+(_a\d+)?$")
    cols = [col for col in df.columns if pattern.match(col)]

    if not cols:
        print(f"No columns found for prefix {pheno_prefix}.")
        return None
    print(f"[{pheno_prefix}] Processing columns: {cols}")

    # Back-filling along the row moves the first non-missing value into the
    # left-most column, which is then taken as the collapsed phenotype.
    pheno_raw = df[cols].bfill(axis=1).iloc[:, 0]
    categorical = is_categorical(pheno_raw)
    n_no_missing = pheno_raw.notna().sum()

    result = pd.DataFrame({
        'eid': df['eid'],
        f'{pheno_prefix}_raw': pheno_raw
    })

    # savefig does not create missing folders, so make sure the target exists.
    figure_dir = os.path.join(output_dir, 'distribution')
    os.makedirs(figure_dir, exist_ok=True)
    figure_path = os.path.join(figure_dir, f'{pheno_prefix}_dist.png')

    if categorical:
        counts = pheno_raw.value_counts(dropna=True).to_dict()
        pheno_int = change_category_to_binary(pheno_raw)
        result[f'{pheno_prefix}_int'] = pheno_int
        print(f"[Categorical] Non-missing: {n_no_missing}, Category counts: {counts}")

        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        sns.countplot(x=pheno_raw)
        plt.title(f"{pheno_prefix} Raw Distribution (Categorical)")

        plt.subplot(1, 2, 2)
        sns.countplot(x=pheno_int)
        plt.title(f"{pheno_prefix} Transform Distribution (Categorical)")

        plt.xlabel("Category")
        plt.ylabel("Count")
    else:
        print(f"[Continuous] Non-missing: {n_no_missing}, Mean: {pheno_raw.mean()}, Std: {pheno_raw.std()}")
        pheno_int = inverse_normal_transform(pheno_raw)
        result[f'{pheno_prefix}_int'] = pheno_int

        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        sns.histplot(pheno_raw, kde=True, bins=50, color='skyblue')
        plt.title(f"{pheno_prefix} Raw Distribution")

        plt.subplot(1, 2, 2)
        sns.histplot(pheno_int, kde=True, bins=50, color='salmon')
        plt.title(f"{pheno_prefix} Inverse Normal Distribution")

    plt.tight_layout()
    plt.savefig(figure_path)
    plt.close()

    return result

def batch_process(df, pheno_prefixes, output_dir='/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/'):
    """Process every listed phenotype and write the combined table.

    Each prefix is handed to ``process_one_phenotype``; prefixes for which it
    returns None are skipped. The remaining per-phenotype tables are
    outer-joined on ``eid`` and written to
    ``<output_dir>/trait_ukb_processed.tsv``.

    Args:
        df: DataFrame holding ``eid`` and the raw phenotype columns.
        pheno_prefixes: iterable of phenotype column prefixes.
        output_dir: directory for the figures and the output TSV.
    """
    per_phenotype = (
        process_one_phenotype(df, prefix, output_dir) for prefix in pheno_prefixes
    )
    processed = [table for table in per_phenotype if table is not None]

    if not processed:
        print("No valid phenotypes processed. Please check the input data.")
        return

    combined, *remaining = processed
    for table in remaining:
        combined = combined.merge(table, on='eid', how='outer')

    output_file = os.path.join(output_dir, 'trait_ukb_processed.tsv')
    combined.to_csv(output_file, sep='\t', index=False, header=True, na_rep='')
    print(f"Processed data saved to {output_file}.")


if __name__ == "__main__":
    # Load the merged wide table; these markers are read as missing values.
    missing_markers = ['NA', '.', '-9', '']
    trait_path = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/trait_ukb.tsv'
    trait_df = pd.read_csv(trait_path, sep='\t', na_values=missing_markers)

    # Field numbers to process; each becomes the column prefix 'p<number>'.
    field_numbers = (
        48, 50, 102, 4079, 4080, 20116, 20117, 21001, 21002,
        30000, 30010, 30020, 30080, 30120, 30130, 30140,
        30150, 30620, 30650, 30670, 30690, 30700,
        30730, 30740, 30760, 30780, 30870, 30880,
    )
    pheno_prefixes = [f'p{number}' for number in field_numbers]

    batch_process(trait_df, pheno_prefixes)

In [None]:
## QC trait data and split into EUR and EAS
import pandas as pd

# Input files: the sample lists of the two cohorts and the processed traits.
white_british_list_path = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_white_british_qced.txt'
chinese_list_path = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_chinese_qced.txt'
trait_path = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/trait_ukb_processed.tsv'

id_cols = ['FID', 'IID']

# The sample lists are read without a header row; columns are named FID / IID.
white_british_list = pd.read_csv(white_british_list_path, sep='\t', header=None, names=list(id_cols))
chinese_list = pd.read_csv(chinese_list_path, sep='\t', header=None, names=list(id_cols))

# Both id columns of the trait table are copies of 'eid'.
trait_df = pd.read_csv(trait_path, sep='\t', na_values=['NA', '.', '-9', ''])
for id_col in id_cols:
    trait_df[id_col] = trait_df['eid']

# Keep only the samples that appear in each cohort's list.
trait_white_british = white_british_list.merge(trait_df, on=id_cols, how='inner')
trait_chinese = chinese_list.merge(trait_df, on=id_cols, how='inner')

non_trait_cols = {'FID', 'IID', 'eid'}
trait_cols_eur = [col for col in trait_white_british.columns if col not in non_trait_cols]
trait_cols_eas = [col for col in trait_chinese.columns if col not in non_trait_cols]

# to_csv options shared by every output file (quoting=3 is csv.QUOTE_NONE).
tsv_options = dict(sep='\t', index=False, header=True, na_rep='', quoting=3, encoding='utf-8')

# One file per trait and cohort, holding FID, IID and the non-missing values.
cohort_exports = [
    ('White British', trait_white_british, trait_cols_eur,
     '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/White_British'),
    ('Chinese', trait_chinese, trait_cols_eas,
     '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/Chinese'),
]
for cohort_label, cohort_df, cohort_traits, cohort_dir in cohort_exports:
    for trait in cohort_traits:
        df_trait = cohort_df[id_cols + [trait]].dropna(subset=[trait])
        output_path = f'{cohort_dir}/{trait}.txt'
        df_trait.to_csv(output_path, **tsv_options)
        print(f"Trait {trait} for {cohort_label} saved to {output_path}, {df_trait.shape[0]} samples.")

# check the number of samples
print(f"Number of white British samples: {trait_white_british.shape[0]}")
print(f"Number of Chinese samples: {trait_chinese.shape[0]}")

# Save the full per-cohort tables
trait_white_british.to_csv('/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/trait_ukb_white_british.txt', **tsv_options)
trait_chinese.to_csv('/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/trait_ukb_chinese.txt', **tsv_options)

In [None]:
# 7_merge_ukb_pheno.py

# plot the raw and inverse-normal-transformed distribution of each processed trait, separately for the Chinese and White British cohorts
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, rankdata
import os
import re

# Apply seaborn's "whitegrid" style to every figure drawn below.
sns.set(style="whitegrid")

def inverse_normal_transform(x):
    """Apply a rank-based inverse normal transformation to a series.

    Non-missing values are ranked (ties receive their average rank), each
    rank ``r`` is turned into the quantile ``(r - 0.5) / n`` where ``n`` is
    the number of non-missing values, and that quantile is passed through
    the inverse CDF of the standard normal distribution. Missing values stay
    missing and keep their position.

    Args:
        x: pandas Series of numeric values. NaN and ``None`` entries are
            treated as missing.

    Returns:
        A float pandas Series with the same index as ``x``.
    """
    # Coerce to float first so integer and object-dtype input (for example
    # numbers mixed with None) can be tested with np.isnan.
    values = np.asarray(x, dtype=float)
    is_na = np.isnan(values)
    ranks = rankdata(values[~is_na], method='average')
    # Start from all-NaN so missing positions need no separate assignment.
    transformed = np.full(values.shape, np.nan)
    transformed[~is_na] = norm.ppf((ranks - 0.5) / len(ranks))
    return pd.Series(transformed, index=x.index)

def change_category_to_binary(x):
    """Collapse a three-level categorical series onto the two codes 1 and 2.

    Every -3 is first replaced by NaN. If exactly three distinct non-missing
    values remain, the two smallest are recoded to 1 and the largest to 2.
    With any other number of distinct values the series is returned with
    only the -3 replacement applied.

    Note that, despite the function name, the output codes are 1/2 rather
    than 0/1.

    Args:
        x: pandas Series of category codes.

    Returns:
        A pandas Series sharing the index of ``x``.
    """
    cleaned = x.replace(-3, np.nan)
    levels = cleaned.dropna().unique()
    if len(levels) == 3:
        low, middle, high = sorted(levels)
        # The two lower levels share code 1; the top level becomes code 2.
        cleaned = cleaned.map({low: 1, middle: 1, high: 2})
    return pd.Series(cleaned, index=cleaned.index)

def is_categorical(series, max_n_unique=20, min_n_unique=2):
    """Decide whether a series should be handled as a categorical variable.

    A series counts as categorical when, ignoring missing values, all of the
    following hold:
      * every distinct value is integer-valued (integer dtype, or floats
        such as 2.0),
      * the number of distinct values lies between ``min_n_unique`` and
        ``max_n_unique`` (both inclusive),
      * the distinct values are not just a subset of {0, 1}.

    Args:
        series: pandas Series of numeric values.
        max_n_unique: largest number of distinct values still considered
            categorical.
        min_n_unique: smallest number of distinct values considered
            categorical.

    Returns:
        True if the series is categorical by the rules above, else False.
    """
    observed = series.dropna()
    unique_vals = set(observed.unique())

    # Integer dtypes qualify directly; other dtypes qualify only if every
    # distinct value is a whole number.
    if observed.dtype.kind not in 'iu' and not all(float(v).is_integer() for v in unique_vals):
        return False
    # Series made only of 0/1 are deliberately not reported as categorical.
    if unique_vals.issubset({0, 1}):
        return False
    return min_n_unique <= len(unique_vals) <= max_n_unique

def process_one_phenotype(df, pheno_prefix, output_dir):
    """Plot the distribution of one already-processed phenotype.

    Looks up the ``<pheno_prefix>_..._raw`` column (and, for non-categorical
    phenotypes, the ``..._int`` column) in ``df`` and saves a figure as
    ``<output_dir>/<pheno_prefix>_dist.png``: a count plot of the raw values
    when ``is_categorical`` reports the raw column as categorical, otherwise
    side-by-side histograms of the raw and the ``_int`` values.

    Args:
        df: DataFrame with an ``eid`` column and the processed phenotype
            columns.
        pheno_prefix: column prefix of the phenotype, e.g. ``'p48'``.
        output_dir: directory the figure is written to; created if missing.

    Returns:
        None. When a required column is absent a message is printed and no
        figure is written.
    """
    # Match on "<prefix>_" rather than the bare prefix so that a prefix which
    # is the beginning of a longer field id cannot pick up that field's columns.
    column_stub = f"{pheno_prefix}_"
    pheno_cols = [col for col in df.columns if col.startswith(column_stub)]
    raw_cols = [col for col in pheno_cols if col.endswith("_raw")]
    if not raw_cols:
        print(f"No columns found for prefix {pheno_prefix}.")
        return None

    pheno_data = df[['eid'] + pheno_cols]
    print(f"[{pheno_prefix}] Processing columns: {pheno_data.columns.tolist()}")

    raw_data = pheno_data[raw_cols].iloc[:, 0]
    categorical = is_categorical(raw_data)

    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    figure_path = os.path.join(output_dir, f'{pheno_prefix}_dist.png')

    if categorical:
        plt.figure(figsize=(10, 4))
        sns.countplot(x=raw_data)
        plt.title(f"{pheno_prefix} Raw Distribution (Categorical)")
        plt.xlabel("Category")
        plt.ylabel("Count")
    else:
        int_cols = [col for col in pheno_cols if col.endswith("_int")]
        if not int_cols:
            print(f"No '_int' column found for prefix {pheno_prefix}.")
            return None
        int_data = pheno_data[int_cols].iloc[:, 0]

        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        sns.histplot(raw_data, kde=True, bins=50, color='skyblue')
        plt.title(f"{pheno_prefix} Raw Distribution")

        plt.subplot(1, 2, 2)
        sns.histplot(int_data, kde=True, bins=50, color='salmon')
        plt.title(f"{pheno_prefix} Inverse Normal Distribution")

    plt.tight_layout()
    plt.savefig(figure_path)
    plt.close()
    return None

def batch_process(df, pheno_prefixes, output_dir='/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/'):
    """Run ``process_one_phenotype`` once for every listed phenotype prefix.

    Args:
        df: DataFrame passed unchanged to ``process_one_phenotype``.
        pheno_prefixes: iterable of phenotype column prefixes, handled in
            the given order.
        output_dir: directory passed to ``process_one_phenotype``.
    """
    for pheno_prefix in pheno_prefixes:
        process_one_phenotype(df, pheno_prefix, output_dir)


if __name__ == "__main__":
    # Load the per-cohort trait tables; these markers are read as missing.
    missing_markers = ['NA', '.', '-9', '']
    chinese_df = pd.read_csv('/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/trait_ukb_chinese.txt', sep='\t', na_values=missing_markers)
    white_british_df = pd.read_csv('/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/trait_ukb_white_british.txt', sep='\t', na_values=missing_markers)

    # Field numbers to plot; each becomes the column prefix 'p<number>'.
    field_numbers = (
        48, 50, 102, 4079, 4080, 20116, 20117, 21001, 21002,
        30000, 30010, 30020, 30080, 30120, 30130, 30140,
        30150, 30620, 30650, 30670, 30690, 30700,
        30730, 30740, 30760, 30780, 30870, 30880,
    )
    pheno_prefixes = [f'p{number}' for number in field_numbers]

    # Chinese cohort first, then White British, each into its own folder.
    plot_jobs = (
        (chinese_df, '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/Chinese_distribution/'),
        (white_british_df, '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/White_British_distribution/'),
    )
    for cohort_df, figure_dir in plot_jobs:
        batch_process(cohort_df, pheno_prefixes, figure_dir)