# NCLDV filtration

NCLDV filtration is an automatic tool that filters candidate NCLDVs based on their core gene density.

Citation:

> "Genome-resolved year-round dynamics reveal a broad range of giant virus microdiversity" In Prep. 

In [1]:
import os
import subprocess
import seaborn
import math

ModuleNotFoundError: No module named 'seaborn'

In [36]:
# Input: a directory containing the .fna files of the bins.
# wkdir is the current working directory of the notebook.
wkdir = os.getcwd()

# Replace "test_bins" with the name of your own bin directory.
# The example uses a set of 20 bins from a coastal metagenome.
bin_subdir = "test_bins"
bindir = os.path.join(wkdir, bin_subdir)

In [38]:
# Gene calling: run Prodigal in metagenomic mode (-p meta) on every .fna file
# in `bindir` and write the predicted proteins to prodigal_output/<bin>.faa.
# Prodigal must be available on PATH in the current environment.

prodigal_output_dir = os.path.join(wkdir, "prodigal_output")
os.makedirs(prodigal_output_dir, exist_ok=True)

# Sorted so the bins are processed in a reproducible order.
for bin_file in sorted(os.listdir(bindir)):
    if not bin_file.endswith(".fna"):
        continue
    input_path = os.path.join(bindir, bin_file)
    base_name = os.path.splitext(bin_file)[0]
    output_faa_path = os.path.join(prodigal_output_dir, f"{base_name}.faa")
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True raises CalledProcessError on a
    # non-zero exit instead of silently leaving a missing/partial .faa.
    subprocess.run(
        ["prodigal", "-i", input_path, "-a", output_faa_path, "-p", "meta"],
        check=True,
    )

In [39]:
# For convenience, the IDs of the Prodigal output are renamed to match the bin names.
# All protein sequences are saved in "all_cds.faa";
# the original-header -> new-ID mapping is saved in "faa_name_change.csv".
outfile1_path = os.path.join(wkdir, "faa_name_change.csv")
outfile2_path = os.path.join(wkdir, "all_cds.faa")

with open(outfile1_path, "w") as outfile1, open(outfile2_path, "w") as outfile2:
    for path, dir_list, file_list in os.walk(prodigal_output_dir):
        # Sorted so the order of bins in the outputs is reproducible.
        for filename in sorted(file_list):
            # Only protein FASTA files of bins are processed; other files
            # containing "bin" have no matching "<corename>.faa" to open.
            if "bin" not in filename or not filename.endswith(".faa"):
                continue
            # Strip only the trailing extension, not every ".faa" in the name.
            corename = filename[: -len(".faa")]
            # Open relative to the directory currently being walked so files
            # in sub-directories are found as well.
            with open(os.path.join(path, filename), "r") as infile:
                context = infile.read()
            n = 1
            for tmp in context.split(">"):
                # The text before the first ">" (normally empty) is skipped.
                if not tmp.strip():
                    continue
                lines = tmp.split("\n")
                outfile1.write(f"{lines[0]},{corename}_{n}\n")
                # One record per header line + one sequence line, with no
                # leading blank line and a terminating newline.
                outfile2.write(f">{corename}_{n}\n{''.join(lines[1:])}\n")
                n += 1

In [55]:
# Rename the Prodigal protein IDs so that they carry the bin name.
# The renamed protein sequences are written to "all_cds.faa".
# The old-header -> new-ID mapping can be checked in "faa_name_change.csv".

outfile1_path = os.path.join(wkdir, "faa_name_change.csv")
outfile2_path = os.path.join(wkdir, "all_cds.faa")

with open(outfile1_path, "w") as outfile1, open(outfile2_path, "w") as outfile2:
    for root, dirs, files in os.walk(prodigal_output_dir):
        for filename in files:
            # Skip everything that is not a bin's protein FASTA file.
            if "bin" not in filename or not filename.endswith('.faa'):
                continue
            corename = filename.rsplit('.', 1)[0]
            with open(os.path.join(root, filename), "r") as infile:
                chunks = infile.read().strip().split(">")
                # chunks[0] is whatever precedes the first ">" and is ignored;
                # records are numbered from 1 within each bin.
                for i in range(1, len(chunks)):
                    fields = chunks[i].split("\n")
                    header = fields[0]
                    residues = ''.join(fields[1:])
                    new_id = f"{corename}_{i}"
                    outfile1.write(f"{header},{new_id}\n")
                    outfile2.write(f">{new_id}\n{residues}\n")

In [56]:
# Per-NCVOG weights, reflecting how conserved each NCVOG is per NCLDV family.
# The values come from an in-house analysis of reference genomes.
ncvog_weights = dict(
    NCVOG0022=0.9,
    NCVOG0023=1.1,
    NCVOG0037=0.5,
    NCVOG0038=1.1,
    NCVOG0052=0.9,
    NCVOG0076=1.0,
    NCVOG0236=0.8,
    NCVOG0249=1.0,
    NCVOG0261=0.7,
    NCVOG0262=1.0,
    NCVOG0271=0.9,
    NCVOG0272=1.0,
    NCVOG0273=0.8,
    NCVOG0274=0.9,
    NCVOG0276=0.8,
    NCVOG1060=0.6,
    NCVOG1117=0.7,
    NCVOG1127=0.4,
    NCVOG1164=1.0,
    NCVOG1353=0.8,
)

In [None]:
# Execute hmmsearch once per hallmark NCVOG HMM profile against all renamed
# proteins (outfile2_path). hmmsearch must be available on PATH and the
# profiles are read from hallmark_hmm/<NCVOG>.hmm.
# The tabular hits are written to hmmout/<NCVOG>.out.

NCVOGs_list = [
    "NCVOG0022", "NCVOG0023", "NCVOG0037", "NCVOG0038", "NCVOG0052", "NCVOG0076", "NCVOG0236", "NCVOG0249",
    "NCVOG0261", "NCVOG0262", "NCVOG0271", "NCVOG0272", "NCVOG0273", "NCVOG0274", "NCVOG0276", "NCVOG1060",
    "NCVOG1117", "NCVOG1127", "NCVOG1164", "NCVOG1353"
]

# The HMMER output location gets its own variable. Re-assigning
# `prodigal_output_dir` here would make the hit-counting step list hmmout/
# (which holds no bin files) and end up with an empty result table.
hmm_output_dir = "hmmout"
os.makedirs(hmm_output_dir, exist_ok=True)
for ncvo in NCVOGs_list:
    hmm_file = os.path.join("hallmark_hmm", f"{ncvo}.hmm")
    output_file = os.path.join(hmm_output_dir, f"{ncvo}.out")
    # Argument list (no shell) so paths are passed verbatim; check=True
    # raises if hmmsearch fails instead of leaving a missing/partial table.
    subprocess.run(
        [
            "hmmsearch", "--cpu", "10", "--notextw", "--incE", "1e-5",
            "--tblout", output_file, hmm_file, outfile2_path,
        ],
        check=True,
    )

In [52]:
# Parse the hmmsearch tabular output and count the hits per bin.
# info_dict maps each bin name to a list of hit counts, one entry per NCVOG
# in the same order as NCVOGs_list.
info_dict = {
    filename.rsplit(".", 1)[0]: [0] * len(NCVOGs_list)
    for filename in os.listdir(prodigal_output_dir)
    if "bin" in filename and filename.endswith(".faa")
}

for i, ncvo in enumerate(NCVOGs_list):
    with open(f"hmmout/{ncvo}.out", "r") as infile:
        for line in infile:
            # Skip comment lines and blank lines.
            if line.startswith("#") or not line.strip():
                continue
            # The first column is the target protein ID, which the renaming
            # step wrote as "<bin name>_<n>". Matching the exact bin name
            # (instead of a substring test on the whole line) keeps e.g.
            # "bin_1" from being credited with hits of "bin_10_5".
            target_name = line.split(None, 1)[0]
            bin_name = target_name.rsplit("_", 1)[0]
            if bin_name in info_dict:
                info_dict[bin_name][i] += 1

In [51]:
# Summarize the result and write it to "summary_core_genes.csv".
# A density_index of 5.75 is recommended as the cut-off for potential NCLDV bins.
# Columns: bin ID, presence/absence (1/0) of each NCVOG, genome size in bp,
# weighted sum of the present NCVOGs, and the density index
# weighted_sum / (log10(genome_size) - 4).
summary_file_path = os.path.join(wkdir, "summary_core_genes.csv")
with open(summary_file_path, "w") as outfile:
    outfile.write("ID," + ",".join(NCVOGs_list) + ",genome_size,weighted_sum,density_index\n")
    for binID in info_dict:
        # Genome size = total number of sequence characters in the bin's FASTA.
        genomesize = 0
        fastafile = os.path.join(bindir, f"{binID}.fna")
        with open(fastafile, "r") as f:
            for line in f:
                if not line.startswith(">"):
                    genomesize += len(line.strip())
        # Hit counts are reduced to presence (1) / absence (0) per NCVOG.
        binary_hits = [1 if hit > 0 else 0 for hit in info_dict[binID]]
        weighted_sum = sum(
            present * ncvog_weights[ncvog]
            for ncvog, present in zip(NCVOGs_list, binary_hits)
        )
        # log10(size) - 4 is zero at exactly 10,000 bp, negative below it and
        # undefined for an empty FASTA. The index is reported as nan in those
        # cases instead of raising or producing a meaningless value.
        size_term = math.log10(genomesize) - 4 if genomesize > 0 else 0.0
        density_index = weighted_sum / size_term if size_term > 0 else float("nan")
        # One row: bin ID, binary hits, genome size, weighted sum, density index.
        row_values = binary_hits + [genomesize, weighted_sum, density_index]
        outfile.write(f"{binID}," + ",".join(map(str, row_values)) + "\n")