# 1. read metadata

In [None]:
import pandas as pd
from cyvcf2 import VCF, Writer
import subprocess
import os
import random

# Every relative path used below (e.g. save_dir) resolves against this directory.
os.chdir('/mnt/qmtang/EvoFill_data/20251211_chr22/')

# Source VCF that all sample subsets are cut from.
KGP_vcf = "/mnt/qmtang/EvoFill_data/20251204_chr22/1kGP_hg38_chr22_maf0.01_snps.vcf.gz"

# External VCF2Dis executable and the pedigree table with per-sample populations.
VCF2DIS = "/home/qmtang/GitHub/VCF2Dis/bin/VCF2Dis"
KGP_ped = "/mnt/NAS/Omics/DNA/1kGP/meta/20130606_g1k.ped"


# Output directory; exist_ok avoids the check-then-create race and makes
# re-running this cell a no-op.
save_dir = "./data"
os.makedirs(save_dir, exist_ok=True)
# Seed the module-level RNG so the shuffles performed later are reproducible.
random.seed(42)

# Display annotations for the 26 population codes, keyed by code.
# Each entry holds a Chinese name ("CHN"), a region label ("Region") and a
# short note ("说明"). The CHN and Region fields are looked up when building
# the per-population sample-count table.
# Entries for GWD, STU, KHV, PJL and ASW were corrected to match the
# 1000 Genomes population descriptions (Mandinka / Sri Lankan Tamil in the UK /
# Kinh in Ho Chi Minh City / Punjabi in Lahore, Pakistan / Southwest US).
pop_anno: dict[str, dict[str, str]] = {
    "GWD":  {"CHN":"冈比亚西部人",        "Region":"非洲/冈比亚",            "说明":"西非撒哈拉以南，曼德语支（曼丁卡人）"},
    "CHS":  {"CHN":"中国南方汉族",        "Region":"亚洲/中国南方",         "说明":"中国汉族南方群体"},
    "TSI":  {"CHN":"托斯卡纳人",          "Region":"欧洲/意大利",           "说明":"意大利中部托斯卡纳地区"},
    "PUR":  {"CHN":"波多黎各人",          "Region":"美洲/波多黎各",         "说明":"拉丁裔，欧洲-非洲-美洲土著混合"},
    "JPT":  {"CHN":"日本东京人",          "Region":"亚洲/日本",             "说明":"本州岛东部，大和族"},
    "IBS":  {"CHN":"西班牙伊比利亚人",    "Region":"欧洲/西班牙",           "说明":"伊比利亚半岛土著"},
    "YRI":  {"CHN":"尼日利亚约鲁巴人",    "Region":"非洲/尼日利亚",         "说明":"西非尼日尔-刚果语系"},
    "GIH":  {"CHN":"印度古吉拉特人",      "Region":"亚洲/印度（古吉拉特）", "说明":"印度西部，印欧语系"},
    "FIN":  {"CHN":"芬兰人",              "Region":"欧洲/芬兰",             "说明":"北欧乌拉尔语系，遗传瓶颈明显"},
    "CEU":  {"CHN":"北欧白人（犹他）",    "Region":"欧洲/西北欧→美国",      "说明":"西北欧移民后裔（美国犹他州）"},
    "ITU":  {"CHN":"印度泰卢固人",        "Region":"亚洲/印度（安得拉）",   "说明":"印度东南部，德拉维语系"},
    "STU":  {"CHN":"斯里兰卡泰米尔人",    "Region":"亚洲/斯里兰卡→英国",    "说明":"斯里兰卡泰米尔人（采样于英国），德拉维语系"},
    "KHV":  {"CHN":"越南京族人",          "Region":"亚洲/越南",             "说明":"越南南部（胡志明市采样），南亚语系→越芒语支"},
    "CHB":  {"CHN":"中国北方汉族",        "Region":"亚洲/中国北方",         "说明":"中国汉族北方群体"},
    "LWK":  {"CHN":"肯尼亚卢赫雅人",      "Region":"非洲/肯尼亚",           "说明":"东非班图语支"},
    "ESN":  {"CHN":"尼日利亚埃桑人",      "Region":"非洲/尼日利亚",         "说明":"尼日尔-刚果语系，西非"},
    "ACB":  {"CHN":"加勒比非裔",          "Region":"美洲/加勒比→非洲",      "说明":"非洲奴隶贸易后裔（加勒比地区）"},
    "PJL":  {"CHN":"巴基斯坦旁遮普人",    "Region":"亚洲/巴基斯坦（拉合尔）","说明":"巴基斯坦旁遮普省拉合尔，印欧语系"},
    "GBR":  {"CHN":"英国英格兰人",        "Region":"欧洲/英国",             "说明":"大不列颠岛土著"},
    "CLM":  {"CHN":"哥伦比亚麦德林人",    "Region":"美洲/哥伦比亚",         "说明":"拉丁裔，欧洲-美洲土著-非洲混合"},
    "CDX":  {"CHN":"中国西双版纳傣族人",  "Region":"亚洲/中国云南",         "说明":"傣泰民族，侗台语系"},
    "MSL":  {"CHN":"塞拉利昂门德人",      "Region":"非洲/塞拉利昂",         "说明":"西非曼德语支"},
    "PEL":  {"CHN":"秘鲁利马人",          "Region":"美洲/秘鲁",             "说明":"拉丁裔，克丘亚-欧洲混合"},
    "BEB":  {"CHN":"孟加拉国人",          "Region":"亚洲/孟加拉国",         "说明":"孟加拉湾沿岸，印欧语系"},
    "MXL":  {"CHN":"墨西哥洛杉矶人",      "Region":"美洲/墨西哥→美国",      "说明":"拉丁裔，美洲土著-欧洲混合"},
    "ASW":  {"CHN":"非裔美国人",          "Region":"美洲/美国",             "说明":"非洲奴隶贸易后裔（美国西南部）"},
}


In [None]:
# Load the pedigree table (tab-separated).
ped_df = pd.read_csv(KGP_ped, sep='\t')

# Collect the sample IDs listed in the VCF header, then release the reader.
vcf = VCF(KGP_vcf)
vcf_samples = set(vcf.samples)
vcf.close()

# Keep only the pedigree rows whose individual is also present in the VCF.
in_vcf_rows = ped_df['Individual ID'].isin(vcf_samples)
ped_df = ped_df[in_vcf_rows]

# Count samples per population and show the counts next to the Chinese name
# and region taken from pop_anno (the trailing expression is the cell output).
pop_counts = ped_df['Population'].value_counts()
pop_table = pop_counts.rename_axis("Population").to_frame("Samples")
pop_table.assign(
    CHN=pop_table.index.map(lambda p: pop_anno[p]["CHN"]),
    Region=pop_table.index.map(lambda p: pop_anno[p]["Region"]),
)

# 2. splitting samples

In [None]:
minor_pop_list = ["CDX"]  # populations to split off from the rest (the "minor" group)

In [None]:
# Partition individuals by whether their population is in minor_pop_list.
is_minor = ped_df['Population'].isin(minor_pop_list)
major_samples = ped_df[~is_minor]['Individual ID'].tolist()
minor_samples = ped_df[is_minor]['Individual ID'].tolist()

# Shuffle in place, then take the first 100 as validation and the rest as training.
random.shuffle(major_samples)
major_val, major_train = major_samples[:100], major_samples[100:]

major_val_list   = os.path.join(save_dir, "major_pops_val.samples.txt")
major_train_list = os.path.join(save_dir, "major_pops_train.samples.txt")

# One sample ID per line in each list file.
for list_path, members in ((major_val_list, major_val), (major_train_list, major_train)):
    with open(list_path, 'w') as f:
        f.writelines(s + '\n' for s in members)

In [7]:
def sample_vcf(in_vcf, sample_list, out_vcf, threads=8):
    """Subset a VCF to the samples named in a list file, then index the result.

    Runs ``bcftools view -S`` to write a bgzip-compressed VCF and ``tabix`` to
    index it.

    Args:
        in_vcf: Path of the VCF to read.
        sample_list: Path of the sample list file handed to ``bcftools view -S``.
        out_vcf: Path of the compressed VCF to write.
        threads: Value passed to ``bcftools --threads``.

    Raises:
        subprocess.CalledProcessError: If ``bcftools`` or ``tabix`` exits non-zero.
    """
    # Options are placed before the positional input so parsing does not rely
    # on the platform's getopt permuting arguments.
    subprocess.run([
        "bcftools", "view", "-S", sample_list,
        "--threads", str(threads),
        "-Oz", "-o", out_vcf, in_vcf,
    ], check=True)
    # -f: replace an index left behind by an earlier run; without it tabix
    # refuses to overwrite and re-running the cell would raise.
    subprocess.run(["tabix", "-f", "-p", "vcf", out_vcf], check=True)

# Cut the validation and training subsets of the major populations from the source VCF.
major_val_vcf   = os.path.join(save_dir, "major_pops_val.vcf.gz")
major_train_vcf = os.path.join(save_dir, "major_pops_train.vcf.gz")
for samples_file, subset_vcf in (
    (major_val_list, major_val_vcf),
    (major_train_list, major_train_vcf),
):
    sample_vcf(KGP_vcf, samples_file, subset_vcf)

In [None]:
def mask_sites(in_vcf, out_vcf, missing_rate=0.9, seed=42):
    """Randomly blank out whole variant sites and write a bgzipped, indexed VCF.

    Each record of ``in_vcf`` is drawn independently: with probability
    ``missing_rate`` the genotype of every sample at that site is replaced by
    a missing call (alleles ``-1, -1`` with the phased flag set); otherwise the
    record is written as read.

    Args:
        in_vcf: Path of the VCF to read.
        out_vcf: Path of the compressed output; must end in ``.gz``.
        missing_rate: Per-site probability of masking all genotypes.
        seed: Seed handed to ``random.seed``. This reseeds the module-level
            generator, so later ``random`` calls continue from the state left
            here (one draw per record).

    Raises:
        ValueError: If ``out_vcf`` does not end in ``.gz``.
        subprocess.CalledProcessError: If ``bgzip`` or ``tabix`` exits non-zero.
    """
    if not out_vcf.endswith(".gz"):
        raise ValueError(f"out_vcf must end with '.gz', got {out_vcf!r}")
    random.seed(seed)
    # Strip only the trailing suffix; str.replace would also rewrite a ".gz"
    # occurring elsewhere in the path.
    tmp = out_vcf.removesuffix(".gz")
    vcf = VCF(in_vcf)
    try:
        n_samples = len(vcf.samples)
        w = Writer(tmp, vcf)
        try:
            for var in vcf:
                if random.random() < missing_rate:
                    # Mask the whole site: every sample becomes missing.
                    var.genotypes = [[-1, -1, True] for _ in range(n_samples)]
                # Unmasked records are written untouched (no decode/re-encode).
                w.write_record(var)
        finally:
            w.close()
    finally:
        vcf.close()
    subprocess.run(["bgzip", "-f", tmp], check=True)
    # -f: overwrite an index from a previous run instead of failing.
    subprocess.run(["tabix", "-f", "-p", "vcf", out_vcf], check=True)

# Site-masked (90%) copy of the major-population validation VCF.
major_val_mask_vcf = os.path.join(save_dir, "major_pops_val.mask90p.vcf.gz")
mask_sites(in_vcf=major_val_vcf, out_vcf=major_val_mask_vcf, missing_rate=0.9)

In [10]:
# Shuffle the minor samples in place and carve off ~10% (at least one) as the
# few-shot set; everything after that is the remainder.
random.shuffle(minor_samples)
n_minor   = len(minor_samples)
fewshot_n = max(1, int(n_minor * 0.1))
fewshot_sam, remain_sam = minor_samples[:fewshot_n], minor_samples[fewshot_n:]

# Paths of the sample-list files.
fewshot_list, remain_list, minor_all_list = (
    os.path.join(save_dir, list_name)
    for list_name in (
        "minor_fewshot.samples.txt",
        "minor_remain.samples.txt",
        "minor_pops_all.samples.txt",
    )
)

# Write a sample list as plain text.
def write_samples(sam_list, file_path):
    """Write each entry of ``sam_list`` to ``file_path``, one per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_path, 'w') as handle:
        handle.writelines(sample_id + '\n' for sample_id in sam_list)

write_samples(fewshot_sam, fewshot_list)
write_samples(remain_sam,  remain_list)
write_samples(minor_samples, minor_all_list)

# 4.1 All minor-population samples.
minor_all_vcf = os.path.join(save_dir, "minor_pops_all.vcf.gz")
sample_vcf(KGP_vcf, minor_all_list, minor_all_vcf)

# 4.2 All minor-population samples with 90% of sites masked.
minor_all_mask_vcf = os.path.join(save_dir, "minor_pops_all.mask90p.vcf.gz")
mask_sites(minor_all_vcf, minor_all_mask_vcf, missing_rate=0.9)

# 4.3 Few-shot subset (all sites kept).
fewshot_vcf = os.path.join(save_dir,"minor_pops_fewshot.vcf.gz")
sample_vcf(KGP_vcf, fewshot_list, fewshot_vcf)

# 4.4 Remaining (non-few-shot) subset with 90% of sites masked.
# The remainder has to be cut out by its own sample list first: masking
# minor_all_vcf here would just reproduce the 4.2 file and leave the few-shot
# samples inside the "remain" output.
remain_vcf = os.path.join(save_dir, "minor_pops_remain.vcf.gz")
sample_vcf(KGP_vcf, remain_list, remain_vcf)
remain_mask_vcf = os.path.join(save_dir,  "minor_pops_90pct.mask90p.vcf.gz")
mask_sites(remain_vcf, remain_mask_vcf, missing_rate=0.9)

In [11]:
def calc_evo_matrix(vcf_path, out_tsv, VCF2DIS_BIN):
    """Run VCF2Dis on a VCF and rewrite its output as a labelled square TSV.

    The executable writes its result to ``out_tsv``. That file is then read
    back (skipping the first line, using the first column as row labels), the
    whitespace-stripped row labels are applied to both rows and columns, and
    the table is saved over ``out_tsv`` with a header row and an index column.

    Args:
        vcf_path: Path passed to the tool via ``-InPut``.
        out_tsv: Path passed via ``-OutPut``; overwritten with the labelled table.
        VCF2DIS_BIN: Path of the VCF2Dis executable.
    """
    command = [VCF2DIS_BIN, "-InPut", vcf_path, "-OutPut", out_tsv]
    subprocess.run(command, check=True)

    matrix = pd.read_csv(out_tsv, sep='\t', header=None, skiprows=1, index_col=0)
    labels = [label.strip() for label in matrix.index]
    matrix.columns = labels
    matrix.index = labels
    matrix.to_csv(out_tsv, sep='\t', header=True, index=True)

# Distance matrices for the major-population training set and the minor few-shot set.
for source_vcf, tsv_name in (
    (major_train_vcf, "evo_mat_major_pops_train.tsv"),
    (fewshot_vcf, "evo_mat_minor_fewshot.tsv"),
):
    calc_evo_matrix(source_vcf, os.path.join(save_dir, tsv_name), VCF2DIS)

Total Sample Number to construct p-distance matrix is [ 3009 ]
Start To Cal ...
Start To Create P_distance ...
P_distance is created done ...
	Rscript	vistreecode.r	./data/evo_mat_major_pops_train.tsv
	or see more at
		[ https://github.com/hewm2008/VCF2Dis/blob/main/OtherWay2ConstructTree.md ]	./data/evo_mat_major_pops_train.tsv
Total Sample Number to construct p-distance matrix is [ 9 ]
Start To Cal ...
Start To Create P_distance ...
P_distance is created done ...
	Rscript	vistreecode.r	./data/evo_mat_minor_fewshot.tsv
	or see more at
		[ https://github.com/hewm2008/VCF2Dis/blob/main/OtherWay2ConstructTree.md ]	./data/evo_mat_minor_fewshot.tsv


In [None]:
def calc_evo_matrix(vcf_path, out_tsv, VCF2DIS_BIN):
    """Compute a distance table with VCF2Dis and relabel it in place.

    Invokes ``VCF2DIS_BIN -InPut vcf_path -OutPut out_tsv``, reloads the
    written file (first line skipped, first column used as row labels),
    assigns the stripped row labels to both axes, and writes the table back to
    ``out_tsv`` as a tab-separated file with header and index.
    """
    subprocess.run(
        [VCF2DIS_BIN, "-InPut", vcf_path, "-OutPut", out_tsv],
        check=True,
    )
    table = pd.read_csv(out_tsv, sep='\t', index_col=0, header=None, skiprows=1)
    names = [raw_name.strip() for raw_name in table.index]
    table.columns = names
    table.index = names
    table.to_csv(out_tsv, sep='\t', header=True, index=True)

# Distance matrix for the augmented aDNA population VCF.
adna_vcf = "/mnt/qmtang/EvoFill_data/20251211_chr22/augment/hg38_chr22_aDNA_augment_pop.vcf.gz"
adna_tsv = "/mnt/qmtang/EvoFill_data/20251211_chr22/augment/evo_mat_aDNA.tsv"
calc_evo_matrix(adna_vcf, adna_tsv, VCF2DIS)

Total Sample Number to construct p-distance matrix is [ 8953 ]
Start To Cal ...
Start To Create P_distance ...
P_distance is created done ...
	Rscript	vistreecode.r	/mnt/qmtang/EvoFill_data/20251211_chr22/augment/evo_mat_major_pops_train.tsv
	or see more at
		[ https://github.com/hewm2008/VCF2Dis/blob/main/OtherWay2ConstructTree.md ]	/mnt/qmtang/EvoFill_data/20251211_chr22/augment/evo_mat_major_pops_train.tsv
