In [None]:
import pandas as pd
import numpy as np
import os

# Tab-separated input tables that are scanned for the requested fields.
# Files that do not exist are reported and skipped.
data_path_list = [
    "/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/core_category/fields_group_34.tsv",
    "/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/third_category/third_group_27.tsv",
    "/data1/jiapl_group/lishuhua/UKB/Phenotype/fields/third_category/third_group_30.tsv"
]

# Column names to extract (matched exactly against each file's header).
# The previous trailing "p" entry was an incomplete ID and has been dropped.
# NOTE(review): the downstream cell looks for columns named
# `<field>_i<N>[_a<N>]`; exact matching here only keeps un-suffixed columns.
# Confirm which naming the input files actually use.
field_ids = ["p41202", "p41203", "p41204", "p41205", "p41270", "p41271"]

output_file = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB_ICD/pheno/icd_pheno.tsv"

data_frames = []
loaded_columns = set()  # fields already taken from an earlier file
for file_path in data_path_list:
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        continue

    # Read only the header row to learn which columns the file provides.
    columns = pd.read_csv(file_path, sep='\t', nrows=0).columns.tolist()
    if 'eid' not in columns:
        # Without the join key the file cannot be merged; `usecols` would raise.
        print(f"No 'eid' column found in {file_path}; skipping.")
        continue

    # Keep the file's own column order so the selection is deterministic.
    overlap_columns = [col for col in columns if col in field_ids]
    if not overlap_columns:
        print(f"No overlapping columns found in {file_path}.")
        continue

    # Loading the same field from two files would make the outer merge emit
    # `_x` / `_y` suffixed duplicates, so only take fields not seen before.
    new_columns = [col for col in overlap_columns if col not in loaded_columns]
    if not new_columns:
        print(f"All requested columns in {file_path} were already loaded from an earlier file.")
        continue

    df = pd.read_csv(
        file_path,
        sep='\t',
        usecols=['eid'] + new_columns,
        header=0,
        na_values=['NA', '.', '-9', ''],
    )
    loaded_columns.update(new_columns)
    data_frames.append(df)

if data_frames:
    # Outer-join everything on `eid` so rows present in only some files are kept.
    merge_df = data_frames[0]
    for df in data_frames[1:]:
        merge_df = pd.merge(merge_df, df, on='eid', how='outer')

    # Make sure the destination directory exists before writing.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    merge_df.to_csv(output_file, sep='\t', index=False, header=True, na_rep='')
    print(f"Data merged successfully and saved to {output_file}.")
else:
    print("No data frames to merge. Please check the input files.")

In [None]:
# 7_merge_ukb_pheno.py

# merge all instances and arrays into a single column
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, rankdata
import os
import re

# Use seaborn's "whitegrid" plot style for the figures saved below.
sns.set(style="whitegrid")

def inverse_normal_transform(x):
    """Rank-based inverse normal transform of a pandas Series.

    Non-missing values are ranked (ties receive their average rank), the
    ranks are turned into quantiles ``(rank - 0.5) / n`` and those quantiles
    are passed through the standard normal inverse CDF. Missing values stay
    NaN at their original positions.

    Args:
        x: Numeric pandas Series; its index is reused for the result.

    Returns:
        Float pandas Series with the same index as ``x``.
    """
    raw = np.array(x)
    missing = np.isnan(raw)
    observed = ~missing

    # Rank only the observed entries; n is the number of observed entries.
    observed_ranks = rankdata(raw[observed], method='average')
    quantiles = (observed_ranks - 0.5) / len(observed_ranks)

    # Start from an all-NaN vector so missing positions need no extra step.
    out = np.full(raw.shape, np.nan, dtype=float)
    out[observed] = norm.ppf(quantiles)
    return pd.Series(out, index=x.index)

def change_category_to_binary(x):
    """Recode a two-valued series as 0/1.

    Every ``-3`` is first replaced by NaN. If exactly two distinct non-missing
    values remain, the smaller one is mapped to 0 and the larger one to 1.
    Series with any other number of distinct values are returned with only
    the ``-3`` replacement applied.

    Args:
        x: pandas Series of category codes.

    Returns:
        pandas Series with the same index as ``x``.
    """
    # Treat the -3 code as missing before counting categories.
    x = x.replace(-3, np.nan)

    unique_vals = x.dropna().unique()
    if len(unique_vals) == 2:
        # Smaller value -> 0, larger value -> 1.
        low, high = sorted(unique_vals)
        x = x.map({low: 0, high: 1})
    return x

def is_categorical(series, max_n_unique=20, min_n_unique=2):
    """Decide whether a series should be treated as a categorical variable.

    A series counts as categorical when, after dropping missing values,
      * all of its values are integer-valued (integer dtype, or values whose
        ``float()`` conversion is a whole number),
      * it has between ``min_n_unique`` and ``max_n_unique`` distinct values
        (inclusive), and
      * its values are not already confined to ``{0, 1}``.

    Args:
        series: pandas Series to inspect.
        max_n_unique: Largest number of distinct values still considered
            categorical.
        min_n_unique: Smallest number of distinct values considered
            categorical.

    Returns:
        bool: True if the series is categorical by the rules above.
    """
    observed = series.dropna()
    unique_vals = set(observed.unique())

    # Signed/unsigned integer dtypes are integer-valued by construction;
    # everything else has to be checked value by value.
    if observed.dtype.kind not in 'iu':
        try:
            all_integral = all(float(v).is_integer() for v in unique_vals)
        except (TypeError, ValueError):
            # Values that cannot be read as numbers (e.g. text codes) are not
            # treated as categorical here instead of aborting the caller.
            return False
        if not all_integral:
            return False

    n_unique = len(unique_vals)
    return (min_n_unique <= n_unique <= max_n_unique
            and not unique_vals.issubset({0, 1}))

def process_one_phenotype(df, pheno_prefix, output_dir='/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB_ICD/pheno/'):
    """Collapse one phenotype's instance/array columns and transform it.

    Columns named ``<pheno_prefix>_i<N>`` or ``<pheno_prefix>_i<N>_a<N>`` are
    combined into a single series by taking, for each row, the first
    non-missing value in column order. Whether the series is categorical is
    decided by ``is_categorical``:

      * categorical: the transformed column is the output of
        ``change_category_to_binary`` and count plots are drawn;
      * otherwise: the transformed column is the output of
        ``inverse_normal_transform`` and histograms are drawn.

    A two-panel figure (raw vs. transformed) is saved to
    ``<output_dir>/distribution/<pheno_prefix>_dist.png``; the directory is
    created if it does not exist.

    Args:
        df: DataFrame with an ``eid`` column and the phenotype columns.
        pheno_prefix: Column-name prefix of the phenotype, matched literally.
        output_dir: Directory under which the ``distribution`` folder lives.

    Returns:
        DataFrame with columns ``eid``, ``<pheno_prefix>_raw`` and
        ``<pheno_prefix>_int``, or None when no column matches the prefix.
    """
    # Escape the prefix so regex metacharacters in it are matched literally.
    pattern = re.compile(rf"^{re.escape(pheno_prefix)}_i\d+(_a\d+)?$")
    cols = [col for col in df.columns if pattern.match(col)]

    if not cols:
        print(f"No columns found for prefix {pheno_prefix}.")
        return None
    print(f"[{pheno_prefix}] Processing columns: {cols}")

    # Row-wise back-fill, then take the first column: first non-missing value.
    pheno_raw = df[cols].bfill(axis=1).iloc[:, 0]
    n_no_missing = pheno_raw.notna().sum()
    categorical = is_categorical(pheno_raw)

    result = pd.DataFrame({
        'eid': df['eid'],
        f'{pheno_prefix}_raw': pheno_raw
    })

    # The figure folder may not exist yet; savefig does not create it.
    figure_dir = os.path.join(output_dir, 'distribution')
    os.makedirs(figure_dir, exist_ok=True)
    figure_path = os.path.join(figure_dir, f'{pheno_prefix}_dist.png')

    if categorical:
        counts = pheno_raw.value_counts(dropna=True).to_dict()
        pheno_int = change_category_to_binary(pheno_raw)
        print(f"[Categorical] Non-missing: {n_no_missing}, Category counts: {counts}")
    else:
        print(f"[Continuous] Non-missing: {n_no_missing}, Mean: {pheno_raw.mean()}, Std: {pheno_raw.std()}")
        pheno_int = inverse_normal_transform(pheno_raw)
    result[f'{pheno_prefix}_int'] = pheno_int

    fig, (ax_raw, ax_int) = plt.subplots(1, 2, figsize=(10, 4))
    try:
        if categorical:
            sns.countplot(x=pheno_raw, ax=ax_raw)
            ax_raw.set_title(f"{pheno_prefix} Raw Distribution (Categorical)")

            sns.countplot(x=pheno_int, ax=ax_int)
            ax_int.set_title(f"{pheno_prefix} Transform Distribution (Categorical)")
            ax_int.set_xlabel("Category")
            ax_int.set_ylabel("Count")
        else:
            sns.histplot(pheno_raw, kde=True, bins=50, color='skyblue', ax=ax_raw)
            ax_raw.set_title(f"{pheno_prefix} Raw Distribution")

            sns.histplot(pheno_int, kde=True, bins=50, color='salmon', ax=ax_int)
            ax_int.set_title(f"{pheno_prefix} Inverse Normal Distribution")

        fig.tight_layout()
        fig.savefig(figure_path)
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # figures do not accumulate when many phenotypes are processed.
        plt.close(fig)

    return result

def batch_process(df, pheno_prefixes, output_dir='/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB_ICD/pheno/'):
    """Run ``process_one_phenotype`` for every prefix and save the joined table.

    Prefixes for which ``process_one_phenotype`` returns None are skipped. The
    remaining per-phenotype tables are outer-joined on ``eid`` and written as
    a tab-separated file ``icd_pheno_2.tsv`` inside ``output_dir``. If nothing
    was processed, a message is printed and no file is written.

    Args:
        df: DataFrame handed to ``process_one_phenotype`` unchanged.
        pheno_prefixes: Iterable of phenotype column prefixes.
        output_dir: Directory for the output table (also passed through to
            ``process_one_phenotype``).
    """
    processed = []
    for prefix in pheno_prefixes:
        one_pheno = process_one_phenotype(df, prefix, output_dir)
        if one_pheno is None:
            continue
        processed.append(one_pheno)

    # Nothing usable: report and stop before touching the filesystem.
    if not processed:
        print("No valid phenotypes processed. Please check the input data.")
        return

    combined, *remaining = processed
    for frame in remaining:
        combined = combined.merge(frame, on='eid', how='outer')

    output_file = os.path.join(output_dir, 'icd_pheno_2.tsv')
    combined.to_csv(output_file, sep='\t', index=False, header=True, na_rep='')
    print(f"Processed data saved to {output_file}.")


if __name__ == "__main__":
    # Read the merged table (the same path the first cell writes its output to);
    # 'NA', '.', '-9' and empty strings are parsed as missing values.
    trait_path = '/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB_ICD/pheno/icd_pheno.tsv'
    trait_df = pd.read_csv(trait_path, sep='\t', na_values=['NA', '.', '-9', ''])
    
    # Column-name prefixes to process, one phenotype per prefix.
    pheno_prefixes = ["p41202", "p41203", "p41204", "p41205", "p41270", "p41271"]

    # output_dir is left at batch_process's default.
    batch_process(trait_df, pheno_prefixes)