In [None]:
#!/bin/bash
# --- Stage 1: initial QC ---
# Applies the same PLINK 2 QC filters to every input dataset and writes the
# filtered binary fileset next to its input as "<prefix>_qc".
echo "Stage 1: Performing initial QC on each dataset individually..."

PLINK2_BIN="/data1/jiapl_group/lishuhua/software/general/plink2"

# Binary-fileset prefixes of all datasets that go through QC.
# Note that the first prefix already ends in "_qc", so its output is "CAS_qc_qc".
DATASETS=(
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_for_PCA/CAS_qc"
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/Chinese_for_PCA/merged"
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/1kg/geno/geno_for_PCA/all_phase3_qc_auto_only"
)

#######################################
# Run the per-dataset QC filters with PLINK 2.
# Arguments:
#   $1 - plink2 executable to invoke
#   $2 - input fileset prefix; the result is written to "<$2>_qc"
# Returns:
#   exit status of plink2
#######################################
run_initial_qc() {
    local plink2_bin=$1
    local prefix=$2

    # Every expansion is quoted so that a prefix containing spaces or glob
    # characters still reaches plink2 as one single argument.
    "${plink2_bin}" --bfile "${prefix}" \
        --autosome \
        --maf 0.05 \
        --geno 0.02 \
        --mind 0.02 \
        --hwe 1e-6 \
        --snps-only just-acgt \
        --make-bed \
        --out "${prefix}_qc"
}

for PREFIX in "${DATASETS[@]}"; do
    echo " -> Processing: ${PREFIX}"
    run_initial_qc "${PLINK2_BIN}" "${PREFIX}"
done

echo "Stage 1 complete."

# --- Stage 2: SNP harmonization ---
echo -e "\nStage 2: SNP Harmonization..."

echo " -> Step 2a: Finding intersection of SNPs across all datasets..."
real_data_dir="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data"
merge_dir="${real_data_dir}/Merge_CAS_UKB_1kg"

# QC'ed .bim files and the ID list derived from each one (same order in both arrays).
qc_bim_files=(
    "${real_data_dir}/CAS/geno/CAS_for_PCA/CAS_qc_qc.bim"
    "${real_data_dir}/UKB/geno/Chinese_for_PCA/merged_qc.bim"
    "${real_data_dir}/1kg/geno/geno_for_PCA/all_phase3_qc_auto_only_qc.bim"
)
snp_id_lists=(
    "${merge_dir}/CAS.snps"
    "${merge_dir}/UKB_EAS.snps"
    "${merge_dir}/1kg_eas.snps"
)

# Pull the SNP ID (second column) out of every QC'ed .bim file.
for idx in "${!qc_bim_files[@]}"; do
    awk '{print $2}' "${qc_bim_files[idx]}" > "${snp_id_lists[idx]}"
done

# comm expects sorted input, so each ID list is sorted in place first.
for id_list in "${snp_id_lists[@]}"; do
    sort "${id_list}" -o "${id_list}"
done

# Three-way intersection as two chained pairwise intersections:
# (CAS and UKB) first, then that result against 1KG.
comm -12 "${snp_id_lists[0]}" "${snp_id_lists[1]}" > "${merge_dir}/temp_common.snps"
comm -12 "${merge_dir}/temp_common.snps" "${snp_id_lists[2]}" > "${merge_dir}/common_snps.txt"

echo " Found $(wc -l < "${merge_dir}/common_snps.txt") common SNPs."

echo " -> Step 2b: Identifying and preparing to remove ambiguous A/T, C/G SNPs..."
merge_dir="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg"
ref_bim="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/1kg/geno/geno_for_PCA/all_phase3_qc_auto_only_qc.bim"
ambiguous_list="${merge_dir}/ambiguous_snps.txt"
common_list="${merge_dir}/common_snps.txt"
final_list="${merge_dir}/final_snps_to_keep.txt"

#######################################
# Print every ID of an ID list that does not appear in an exclusion list.
# Arguments:
#   $1 - file with the IDs to drop, one per line
#   $2 - file with the candidate IDs, one per line
# Outputs:
#   the surviving IDs on stdout, in their original order
# Returns:
#   exit status of grep (1 when no ID survives)
#######################################
drop_listed_ids() {
    local exclude_list=$1
    local id_list=$2

    # -F: IDs are literal strings, not regexes.
    # -x: an ID must match the WHOLE line; without it "rs123" in the exclusion
    #     list would also throw away "rs1234", "rs12345", ...
    grep -vxFf "${exclude_list}" -- "${id_list}"
}

# 1KG serves as the reference for spotting strand-ambiguous SNPs: a variant is
# ambiguous when the alleles in .bim columns 5 and 6 are A/T, T/A, C/G or G/C.
awk '($5=="A" && $6=="T") || ($5=="T" && $6=="A") || ($5=="C" && $6=="G") || ($5=="G" && $6=="C") {print $2}' "${ref_bim}" > "${ambiguous_list}"

# Remove the ambiguous SNPs from the shared SNP list to obtain the final whitelist.
drop_listed_ids "${ambiguous_list}" "${common_list}" > "${final_list}"

# Report the number of shared SNPs that were really dropped. The ambiguous list
# covers all of 1KG, so its raw line count would overstate what was removed.
echo " After removing $(grep -cxFf "${ambiguous_list}" -- "${common_list}") ambiguous SNPs, we have $(wc -l < "${final_list}") high-quality SNPs to keep."

echo " -> Step 2c: Extracting final SNPs and harmonizing alleles..."

plink2_bin="/data1/jiapl_group/lishuhua/software/general/plink2"
real_data_dir="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data"
merge_dir="${real_data_dir}/Merge_CAS_UKB_1kg"
keep_list="${merge_dir}/final_snps_to_keep.txt"
ref_allele_file="${merge_dir}/reference_alleles.txt"
kg_final="${real_data_dir}/1kg/geno/geno_for_PCA/1kg_eas_final"

# Reference cohort (1KG) first: it is only restricted to the final SNP list.
"${plink2_bin}" \
    --bfile "${real_data_dir}/1kg/geno/geno_for_PCA/all_phase3_qc_auto_only_qc" \
    --extract "${keep_list}" \
    --make-bed \
    --out "${kg_final}"

# Reference-allele table taken from the 1KG result: SNP ID (.bim column 2)
# followed by the allele stored in .bim column 6.
awk '{print $2, $6}' "${kg_final}.bim" > "${ref_allele_file}"

# The two study cohorts are handled identically: extract the final SNPs and,
# in the same run, force their reference allele to the one listed for 1KG.
# "2 1" tells --ref-allele that the allele is in column 2 and the ID in column 1.
cohort_inputs=(
    "${real_data_dir}/CAS/geno/CAS_for_PCA/CAS_qc_qc"
    "${real_data_dir}/UKB/geno/Chinese_for_PCA/merged_qc"
)
cohort_outputs=(
    "${real_data_dir}/CAS/geno/CAS_for_PCA/CAS_final"
    "${real_data_dir}/UKB/geno/Chinese_for_PCA/merged_final"
)
for idx in "${!cohort_inputs[@]}"; do
    "${plink2_bin}" \
        --bfile "${cohort_inputs[idx]}" \
        --extract "${keep_list}" \
        --ref-allele force "${ref_allele_file}" 2 1 \
        --make-bed \
        --out "${cohort_outputs[idx]}"
done

echo "Stage 2 complete. All datasets are now harmonized."

# --- Stage 3: merge and verify ---
echo -e "\nStage 3: Merging harmonized data and re-running sanity check PCA..."

real_data_dir="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data"
merge_dir="${real_data_dir}/Merge_CAS_UKB_1kg"
merge_list="${merge_dir}/merge_list.txt"

# Merge list: one fileset prefix per line, covering the two study cohorts.
# The 1KG fileset is not listed because it is the --bfile base of the merge.
printf '%s\n' \
    "${real_data_dir}/CAS/geno/CAS_for_PCA/CAS_final" \
    "${real_data_dir}/UKB/geno/Chinese_for_PCA/merged_final" \
    > "${merge_list}"

# Run the merge with the PLINK 1.9 binary. After the strict harmonization above
# it is expected to succeed directly or to report only a handful of missnp variants.
merge_args=(
    --bfile "${real_data_dir}/1kg/geno/geno_for_PCA/1kg_eas_final"
    --merge-list "${merge_list}"
    --make-bed
    --out "${merge_dir}/ALL_merged_rigorous_qc"
)
/data1/jiapl_group/lishuhua/software/general/plink "${merge_args[@]}"

In [None]:
#######################################
# Drop the variants listed in a .missnp file from every fileset and merge the
# cleaned filesets again.
# Arguments:
#   $1 - plink (1.9) executable to invoke
#   $2 - .missnp file written by the failed first merge attempt
#   $3 - merge-list file to (re)write
#   $4 - output prefix of the merged fileset
#   $5 - base fileset prefix (becomes --bfile of the merge)
#   $6... - further fileset prefixes (become the merge list)
# Outputs:
#   "<prefix>_no_missnp" filesets, the merge list and the merged fileset
# Returns:
#   1 without running PLINK when the .missnp file is missing or empty,
#   otherwise the exit status of the final merge
#######################################
remerge_without_missnp() {
    local plink_bin=$1
    local missnp_file=$2
    local merge_list=$3
    local merged_out=$4
    shift 4
    local -a cleaned=()
    local prefix

    # Without a .missnp file there is nothing to exclude. Running on anyway would
    # make every --exclude call fail and could merge stale *_no_missnp filesets
    # left over from an earlier run.
    if [[ ! -s "${missnp_file}" ]]; then
        echo "No variant list found at ${missnp_file}; nothing to exclude, skipping the re-merge." >&2
        return 1
    fi

    for prefix in "$@"; do
        "${plink_bin}" --bfile "${prefix}" \
            --exclude "${missnp_file}" \
            --make-bed \
            --out "${prefix}_no_missnp"
        cleaned+=("${prefix}_no_missnp")
    done

    # Rebuild the merge list from every cleaned fileset except the base one.
    printf '%s\n' "${cleaned[@]:1}" > "${merge_list}"

    # Second merge attempt, now on filesets without the problematic variants.
    "${plink_bin}" --bfile "${cleaned[0]}" \
        --merge-list "${merge_list}" \
        --make-bed \
        --out "${merged_out}"
}

remerge_without_missnp \
    "/data1/jiapl_group/lishuhua/software/general/plink" \
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc-merge.missnp" \
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/merge_list.txt" \
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc_final" \
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/1kg/geno/geno_for_PCA/1kg_eas_final" \
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_for_PCA/CAS_final" \
    "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/Chinese_for_PCA/merged_final"

In [None]:
# step8: LD pruning in merged dataset
# plink --bfile /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc_final --indep-pairwise 50 5 0.2 --out /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc_final_pruned
# plink2 --bfile /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc_final --extract /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc_final_pruned.prune.in --make-bed --out /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc_final_for_pca

# step9: calculate PCA !!! PLINK2 !!!
# plink2 --bfile /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/ALL_merged_rigorous_qc_final_for_pca --pca 10 --out /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg/new_pca_results

In [None]:
#!/bin/bash
### Another way to do PCA ###
# --- Run PCA and projection (latest syntax) ---

echo -e "\nStep 2: Starting PCA Projection Analysis (Latest PLINK 2 Syntax)..."

# 2a. LD pruning: this step is unchanged.
echo " -> Stage A: Defining PCA space on 1KG reference panel..."
echo " --> 2a. Performing LD Pruning on 1KG samples..."
merge_dir="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg"

# Pruning runs on the merged fileset but only on the samples listed in
# 1kg_samples.txt; the result is written under the 1kg_ref_pruned_2 prefix.
pruning_args=(
    --bfile "${merge_dir}/ALL_merged_rigorous_qc_final"
    --keep "${merge_dir}/1kg_samples.txt"
    --indep-pairwise 50 5 0.2
    --out "${merge_dir}/1kg_ref_pruned_2"
)
/data1/jiapl_group/lishuhua/software/general/plink2 "${pruning_args[@]}"

# 2b. Compute the PC loadings on the reference samples.
# The command uses the "allele-wts" modifier of --pca (not "approx_wts", which
# earlier notes and the log message named): alongside the PCA itself it writes
# the per-allele weights (loadings) that the projection step reads back as
# 1kg_ref_pca_2.eigenvec.allele.
# --freq counts is requested in the same run; the projection step reads its
# output back as 1kg_ref_pca_2.acount.
# NOTE: --out is a relative prefix, so all 1kg_ref_pca_2.* files are written to
# the current working directory, unlike the other outputs of this workflow.
echo " --> 2b. Calculating PC loadings from 1KG reference using new 'allele-wts' syntax..."
merge_dir="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg"
/data1/jiapl_group/lishuhua/software/general/plink2 --bfile "${merge_dir}/ALL_merged_rigorous_qc_final" \
    --extract "${merge_dir}/1kg_ref_pruned_2.prune.in" \
    --keep "${merge_dir}/1kg_samples.txt" \
    --freq counts \
    --nonfounders \
    --pca 10 allele-wts \
    --out 1kg_ref_pca_2

echo "Stage 2: Projecting local cohorts using '--score variance-standardize'..."

merge_dir="/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Merge_CAS_UKB_1kg"

# Score the samples listed in CAS_UKB_samples.txt with the reference allele
# weights, reusing the reference allele counts via --read-freq.
# Both 1kg_ref_pca_2.* inputs are relative paths, i.e. looked up in the current
# working directory.
# NOTE(review): "2 6" and "7-16" assume that the .eigenvec.allele file has the
# variant ID in column 2, the scored allele in column 6 and PC1-PC10 in columns
# 7-16. Confirm against the file header, as the layout depends on the PLINK 2
# build and column settings.
projection_args=(
    --bfile "${merge_dir}/ALL_merged_rigorous_qc_final"
    --keep "${merge_dir}/CAS_UKB_samples.txt"
    --read-freq 1kg_ref_pca_2.acount
    --score 1kg_ref_pca_2.eigenvec.allele 2 6 header-read no-mean-imputation variance-standardize
    --score-col-nums 7-16
    --out "${merge_dir}/CAS_UKB_pcs_projected_2"
)
/data1/jiapl_group/lishuhua/software/general/plink2 "${projection_args[@]}"