In [1]:
import os, logging
import pandas as pd
import numpy as np
from tqdm import tqdm
import subprocess
import vcfpy

from helpers import get_ip_address, has_write_permission, save_preprocessed_data

### Set variables

In [2]:
# Data directory for each known host, keyed by that host's IP address.
data_locations = dict([
    ('223.195.111.48', '/project/datacamp/team11/data'),
    ('147.47.44.229', '/home/jinhyun/data/1kGP'),
])

# Chromosome labels "1" .. "22", as strings so they can be spliced into file names.
chr_list = list(map(str, range(1, 23)))

# Genotype dict for converting a GT string -> integer code (0..3, in the order listed).
gt_dict = {gt: code for code, gt in enumerate(("0|0", "0|1", "1|0", "1|1"))}

### Validation checks

In [3]:
# Resolve this machine's data directory from its IP address. The address is
# looked up once so the value used for the lookup is the same one reported in
# the error message. Unknown hosts fall back to a placeholder path that fails
# the existence check below.
host_ip = get_ip_address()
data_path = data_locations.get(host_ip, '/not_found')
sample_annotation_file = os.path.join(data_path, "igsr-1000 genomes 30x on grch38.tsv")
output_path = os.path.join(data_path, "preprocessed")

# Fail early, with a readable message, before any expensive processing starts.
assert os.path.exists(data_path), f"Data path not exists: {data_path} OR IP setting is incorrect: {host_ip}"
assert os.path.isfile(sample_annotation_file), f"File not exists : {sample_annotation_file}"
assert has_write_permission(data_path), f"You do not have write permission for {data_path}"

# exist_ok=True avoids the check-then-create race of a separate exists() test,
# and raises FileExistsError if the path is already taken by a regular file.
os.makedirs(output_path, exist_ok=True)

assert has_write_permission(output_path), f"You do not have write permission for {output_path}"


In [4]:
def count_vcf_line_count(filename):
    """Count the non-header lines of an uncompressed VCF file.

    A header line is any line that starts with ``#``. Every other line is
    counted, including blank lines and a final line without a trailing
    newline, which matches what ``grep -v '^#' <file> | wc -l`` reports.

    The file is read directly rather than through a shell pipeline, so paths
    containing spaces or shell metacharacters are handled correctly and a
    missing or unreadable file raises instead of being reported as 0 lines.

    Args:
        filename: Path to the VCF file.

    Returns:
        int: Number of lines that do not start with ``#``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: no decoding work, and unexpected bytes cannot raise.
    with open(filename, "rb") as vcf_file:
        return sum(1 for line in vcf_file if not line.startswith(b"#"))

def preprocess_vcf(vcf_file_name, sample_annotation_file, output_file_prefix):
    """Convert the informative SNVs of one VCF file into a genotype matrix and a variant table.

    Each record is kept only if it is an SNV (``record.is_snv()``) and at least
    one sample has a genotype code other than 0. Genotype strings are encoded
    with the module-level ``gt_dict``; any GT value that is not a key of
    ``gt_dict`` gets the extra code ``len(gt_dict)``.

    The resulting matrix has one row per sample, ordered like the
    ``"Sample name"`` column of the annotation file, and one column per kept
    variant. The matrix and the variant table (CHROM, POS, REF, ALT as strings)
    are handed to ``save_preprocessed_data``.

    Args:
        vcf_file_name: Path to the VCF file to read.
        sample_annotation_file: Path to a tab-separated file with a
            ``"Sample name"`` column that defines the row order of the matrix.
        output_file_prefix: Prefix passed through to ``save_preprocessed_data``.

    Raises:
        KeyError: If the annotation file lists a sample that is not in the VCF
            header. This is checked before the records are parsed.
        AssertionError: If the number of parsed records differs from the
            non-header line count, or a record has an unexpected number of calls.
    """
    vcf_line_count = count_vcf_line_count(vcf_file_name)
    print(f"----- processing {vcf_file_name} file with {vcf_line_count} variants -----")

    unknown_gt_code = len(gt_dict)  # code for any GT string not present in gt_dict
    variant_info_header = ['CHROM', 'POS', 'REF', 'ALT']
    variant_info_rows = []  # one [CHROM, POS, REF, ALT] list per kept variant

    status_counter = {
        "total_variant" : 0,
        "total_snps" : 0,
        "total_meaningful_snps" : 0
    }

    reader = vcfpy.Reader.from_path(vcf_file_name)
    try:
        sample_names_from_vcf = reader.header.samples.names

        # Resolve the sample ordering up front so that a mismatch between the
        # annotation file and the VCF header fails before the long parse.
        sample_annotation = pd.read_csv(sample_annotation_file, sep="\t")
        sample_name_to_idx = {name : idx for idx, name in enumerate(sample_names_from_vcf)}
        missing_samples = [name for name in sample_annotation["Sample name"] if name not in sample_name_to_idx]
        if missing_samples:
            raise KeyError(
                f"{len(missing_samples)} sample(s) in {sample_annotation_file} are missing from "
                f"{vcf_file_name}, e.g. {missing_samples[:5]}"
            )
        indices_for_sort_sample = [sample_name_to_idx[name] for name in sample_annotation["Sample name"]]

        # Preallocate for the worst case (every line kept); trimmed after the loop.
        gt_mat_raw = np.zeros((vcf_line_count, len(sample_names_from_vcf)), dtype=np.int8)

        for record in tqdm(reader, total = vcf_line_count):
            status_counter["total_variant"] += 1
            if not record.is_snv():
                continue
            status_counter["total_snps"] += 1

            gt_int = [gt_dict.get(call.data.get('GT'), unknown_gt_code) for call in record.calls]
            if all(v == 0 for v in gt_int):
                # Every sample has code 0, so the variant carries no signal.
                continue

            assert len(gt_int) == gt_mat_raw.shape[1]
            gt_mat_raw[status_counter["total_meaningful_snps"], :] = gt_int

            # Join multiple ALT alleles into one comma-separated field so that
            # each variant always yields exactly four values.
            alt_field = ",".join(str(alt.value) for alt in record.ALT)
            variant_info_rows.append([record.CHROM, str(record.POS), record.REF, alt_field])
            status_counter["total_meaningful_snps"] += 1
    finally:
        # Release the file handle even if parsing fails part-way.
        reader.close()

    assert status_counter["total_variant"] == vcf_line_count, (
        f"parsed {status_counter['total_variant']} records but counted {vcf_line_count} non-header lines"
    )
    print(status_counter)

    # Drop the unused preallocated rows, then reorder to (samples, variants)
    # following the annotation file's sample order.
    gt_mat = gt_mat_raw[:status_counter["total_meaningful_snps"], :].transpose()[indices_for_sort_sample,:]
    # Build the table once from the collected rows instead of assigning row by row.
    variant_info_df = pd.DataFrame(variant_info_rows, columns = variant_info_header, dtype = str)

    save_preprocessed_data(gt_mat, variant_info_df, output_file_prefix)
    print(f"sample annotations (#samples, ) : {sample_annotation.shape}")



### Read each vcf file and convert to matrix format

In [5]:
## Smoke test: uncomment one of the calls below to run the pipeline on a small VCF
#preprocess_vcf("/home/jinhyun/data/1kGP/temp.vcf", 
#               os.path.join(data_path, "temp_igsr.tsv"), 
#               os.path.join(output_path, "chr_test"))
    
#preprocess_vcf(os.path.join(data_path, "test.vcf"), 
#               sample_annotation_file, 
#               os.path.join(output_path, "chrT"))

----- processing /home/jinhyun/data/1kGP/test.vcf file with 4889 variants -----


100%|██████████| 4889/4889 [01:33<00:00, 52.43it/s]


{'total_variant': 4889, 'total_snps': 4243, 'total_meaningful_snps': 4243}
genotype matrix shape (#samples, #features) : (3202, 4243) -> saved to /home/jinhyun/data/1kGP/preprocessed/chrT_matrix.npy
variant info dataframe (#features, ): (4243, 4) -> saved to /home/jinhyun/data/1kGP/preprocessed/chrT_variant.csv
sample annotations (#samples, ) : (3202, 9)


In [None]:
# Preprocess every chromosome listed in chr_list, from the last entry to the first.
# The loop variable is named `chrom` rather than `chr` so that the built-in chr()
# is not shadowed in the notebook's global namespace for later cells.
for chrom in reversed(chr_list):
    vcf_file_name = os.path.join(data_path, f"1kGP_high_coverage_Illumina.chr{chrom}.filtered.SNV_INDEL_SV_phased_panel.vcf")
    if not os.path.exists(vcf_file_name):
        # A missing chromosome file is skipped with a warning; the run continues.
        logging.warning(f"can not find vcf file for chromosome {chrom}")
        continue

    preprocess_vcf(vcf_file_name,
        sample_annotation_file,
        os.path.join(output_path, f"chr{chrom}"))