This notebook implements the processing pipelines, from alignment to variant calling. The tools were installed on WSL (Windows Subsystem for Linux), while the data is retrieved from within this notebook. This setup necessitates separate Windows and WSL paths for the same locations, depending on where each path is used.

In [1]:
from pprint import pprint as pp
from tqdm.notebook import tqdm
import numpy as np
import os
import re
import covid_utilities as cu

In [3]:
# Load the sample index from sorted_samples.json below cu.COVID_PATH.
# The main loop of this notebook iterates it as country -> info and reads
# info["months"][month]["counts"] and info["months"][month]["samples"].
data = cu.load_json(cu.COVID_PATH + r"\sorted_samples.json")

In [4]:
# Every data directory is defined twice: once as a Windows path (used by the
# Python code in this notebook) and once as the matching WSL path (used inside
# the shell commands that run in WSL).

# Windows-side directories
vaf_dir = f"{cu.DATA_PATH}\\vafs\\"
vcf_dir = f"{cu.DATA_PATH}\\vcfs\\"
samples_path = f"{cu.DATA_PATH}\\split_genomics\\"

# WSL-side counterparts
vaf_dir_wsl = f"{cu.DATA_PATH_WSL}vafs/"
vcf_dir_wsl = f"{cu.DATA_PATH_WSL}vcfs/"
samples_path_wsl = f"{cu.DATA_PATH_WSL}split_genomics/"

In [6]:
def extract_vafs_and_depth(file: str) -> tuple[list[float], int]:
    """Collect the VAF values and the mean read depth of a single-sample VCF.

    Only tab-separated records with exactly 10 columns (one sample column)
    are evaluated; header lines starting with ``#`` are skipped.

    Args:
        file: Path of the VCF file to read.

    Returns:
        A tuple ``(vafs, depth)``. ``vafs`` holds every value of the sample's
        ``VAF`` FORMAT field in file order (missing values ``.`` are left
        out). ``depth`` is the mean of the INFO ``DP`` values of all records,
        rounded to an integer with ``numpy.rint``.

    Raises:
        ValueError: If no record carries an INFO ``DP`` entry, so that no
            mean depth can be computed.
    """
    vafs: list[float] = []
    depths: list[int] = []
    with open(file, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            columns = line.rstrip("\r\n").split("\t")
            if len(columns) != 10:
                continue

            # Look VAF up by its FORMAT key instead of assuming it is always
            # the fourth of exactly four fields.
            format_keys = columns[8].split(":")
            sample_values = columns[9].split(":")
            if "VAF" in format_keys:
                vaf_index = format_keys.index("VAF")
                # Trailing sample fields may be omitted, hence the bounds check.
                if vaf_index < len(sample_values):
                    vafs.extend(
                        float(value)
                        for value in sample_values[vaf_index].split(",")
                        if value != "."
                    )

            # Match the INFO key exactly so that other keys that merely start
            # with "DP" (e.g. DP4, DPR) are never mistaken for the depth.
            for entry in columns[7].split(";"):
                key, _, value = entry.partition("=")
                if key == "DP":
                    depths.append(int(value))
                    break

    if not depths:
        raise ValueError(f"no INFO/DP values found in {file!r}")
    return vafs, int(np.rint(np.mean(depths)))

In [7]:
def combine_fastas(output_path: str, input_samples: list[str]):
    """Concatenate the FASTA files of the given samples into one file.

    Each sample is read from ``samples_path + sample + ".fa"`` and appended to
    ``output_path`` in the given order, followed by a newline. An existing
    file at ``output_path`` is overwritten.
    """
    with open(output_path, "w") as combined:
        for sample_id in input_samples:
            fasta_file = samples_path + sample_id + ".fa"
            with open(fasta_file, "r") as source:
                content = source.read()
            combined.write(content + "\n")

In [12]:
# bcftools and hisat2 are not added to the PATH and thus have to be called via their full paths

def create_vcf_command(infile: str, outfile: str, counts: int, name: str, threads: int = 6):
    """Run the alignment-to-calling pipeline for one combined FASTA file in WSL.

    The steps are chained with ``&&`` into one shell command that is started
    through ``wsl``: mafft alignment against ``NC_045512.2.fa`` (skipped when
    an aligned FASTA for ``name`` is already stored), hisat2 mapping,
    ``samtools sort``, ``bcftools mpileup``, ``bcftools call`` and
    ``bcftools +fill-tags``. A non-zero return code of the launched command
    is printed; nothing is returned.

    Args:
        infile: Combined FASTA file handed to mafft inside the WSL shell.
        outfile: WSL path of the resulting VCF without the ``.vcf`` suffix.
        counts: Sample count of the batch; more than 1000 switches mafft to
            ``--6merpair --addfragments``, fewer than 100 adds
            ``-k 50 --score-min L,0,-2`` to the hisat2 call.
        name: Base name under which the aligned FASTA is stored and looked up.
        threads: Thread count passed to every tool that accepts one.
    """
    reference = "NC_045512.2.fa"
    # Single-quoted full paths of the tools that are not on the PATH.
    hisat2 = f"'{cu.SOURCE_PATH_WSL}hisat2-2.2.1/hisat2'"
    bcftools = f"'{cu.SOURCE_PATH_WSL}bcftools-1.14/bcftools'"
    # WSL view of the directory that is checked for cached alignments below.
    # NOTE(review): assumes cu.DATA_PATH and cu.DATA_PATH_WSL address the same
    # directory - confirm in covid_utilities.
    stored_alignment = f"{cu.DATA_PATH_WSL}aligned_samples/{name}.fa"

    commands = []
    if name + ".fa" in os.listdir(cu.DATA_PATH + "\\aligned_samples"):
        # An earlier run already stored the alignment; reuse it.
        aligned_file = stored_alignment
    else:
        aligned_file = "temp_aligned.fa"
        commands.append("echo 'Aligning....'")
        if counts > 1000:
            commands.append(
                f"mafft --6merpair --thread {threads} --keeplength --addfragments {infile} {reference}"
                f" | tee {aligned_file} > /dev/null"
            )
        else:
            commands.append(
                f"mafft --thread {threads} --auto --add {infile} {reference}"
                f" | tee {aligned_file} > /dev/null"
            )
        commands.append("echo 'Copying aligned fasta....'")
        # Store the alignment exactly where the cache check above looks for it.
        commands.append(f"cp {aligned_file} {stored_alignment}")

    commands.append("echo 'Creating SAM....'")
    # The whole command must be one f-string: an f prefix on only the first
    # fragment of a "+" concatenation leaves the placeholders in the other
    # fragments uninterpolated.
    hisat2_command = (
        f"{hisat2} -x {reference} -f {aligned_file} -S temp_all.sam --threads {threads}"
    )
    if counts < 100:
        hisat2_command += " -k 50 --score-min L,0,-2"
    commands.append(hisat2_command)

    commands.append("echo 'Sorting....'")
    commands.append(
        f"samtools sort --verbosity 10 --threads {threads} temp_all.sam -o temp_all-sorted.bam"
    )
    commands.append("echo 'Pileup....'")
    commands.append(
        f"{bcftools} mpileup -a FORMAT/AD,INFO/AD -d 120000 --threads {threads}"
        f" -f {reference} temp_all-sorted.bam | tee temp_pileup > /dev/null"
    )
    commands.append("echo 'Calling....'")
    commands.append(f"{bcftools} call --ploidy 1 -mA temp_pileup | tee temp_called > /dev/null")
    commands.append("echo 'Annotating....'")
    commands.append(f"{bcftools} +fill-tags temp_called -Ov -o {outfile + '.vcf'} -- -t AF,FORMAT/VAF")

    # "start /wait" blocks until the spawned console running the WSL command has finished.
    returncode = os.system('start /wait cmd /c wsl -- eval "{}"\n'.format(" && ".join(commands)))
    if returncode != 0:
        print(returncode)

In [None]:
# Loops over all countries and months, creates the VCF file for every month
# with at least two samples and immediately extracts the VAFs to write them
# into a separate .vaf file. Existing VCF and VAF files are not recreated.

for country, info in tqdm(list(data.items())):
    print(country)
    country = country.replace(" ", "-")
    for month, month_samples in tqdm(list(info["months"].items())):
        counts = month_samples["counts"]
        print(month, counts, sep="\t")
        if counts < 2:
            continue
        name = "m_{}_{}_{}".format(country, month, counts)
        temp_combined = name + ".temp.fa"
        vcf_path = vcf_dir + name + ".vcf"

        if not os.path.exists(vcf_path):  # VCF has not been created already
            if not os.path.exists(temp_combined):
                combine_fastas(temp_combined, month_samples["samples"])
            create_vcf_command(temp_combined, vcf_dir_wsl + name, counts, name)

        # VAF file names end in the average depth, which is only known after
        # extraction, so any number is accepted. Country and month are escaped
        # because they are data, not regex syntax.
        vaf_pattern = re.compile(
            r"m_{}_{}_[0-9]+\.vaf".format(re.escape(country), re.escape(str(month)))
        )
        vaf_exists = any(vaf_pattern.match(entry) for entry in os.listdir(vaf_dir))
        if not vaf_exists:
            if os.path.exists(vcf_path):
                vafs, avg_depth = extract_vafs_and_depth(vcf_path)
                vaf_name = "m_{}_{}_{}".format(country, month, avg_depth)
                with open(vaf_dir + vaf_name + ".vaf", "w") as f:
                    f.write("\n".join(map(str, vafs)))
            else:
                # The pipeline failed (its return code was printed above);
                # skip this batch instead of aborting the whole run.
                print("Skipping {}: {} was not created".format(name, vcf_path))

        # Best-effort cleanup: the temporary FASTA only exists when the VCF
        # had to be created in this iteration.
        try:
            os.remove(temp_combined)
        except OSError:
            pass
    print("----------------------------------------------------")