In [None]:
# Compile the zhunt3 C source with gcc, linking the math library (-lm),
# and write the binary to ./zhunt3.
!gcc zhunt3-alan.c -lm -o zhunt3
# Make the resulting binary executable for all users.
!chmod a+x zhunt3

In [None]:
# Invoke the freshly built binary with --help.
# NOTE(review): presumably prints the usage / meaning of the positional arguments — confirm.
!./zhunt3 --help

In [None]:
import os

from Bio import SeqIO

# Genome assembly FASTA that holds every scaffold in a single file.
# The name `file` is kept because a later cell parses it again.
file = "../ncbi_dataset/data/GCF_000787575.1/GCF_000787575.1_Asub_2.0_genomic.fna"

# Split the multi-record FASTA: each record gets its own file, named after
# the record id, inside a dedicated `scaffolds` directory.
os.makedirs("scaffolds", exist_ok=True)

for scaffold_record in SeqIO.parse(file, "fasta"):
    scaffold_path = f"scaffolds/{scaffold_record.id}.fna"
    with open(scaffold_path, "w") as handle:
        SeqIO.write(scaffold_record, handle, "fasta")


In [None]:
# run zhunt3 on all files in scaffolds directory in parallel (8 threads)
import os
import subprocess
import concurrent.futures

def run_zhunt3(file):
    """Run ``./zhunt3`` on one file inside ``scaffolds/``.

    Parameters
    ----------
    file : str
        File name (not a path) of an input located in the ``scaffolds``
        directory.

    Returns
    -------
    int
        Exit status of the zhunt3 process.

    Notes
    -----
    The three positional arguments ("12", "8", "12") are passed through
    unchanged; see ``./zhunt3 --help`` for their meaning.
    """
    completed = subprocess.run(["./zhunt3", "12", "8", "12", f"scaffolds/{file}"])
    return completed.returncode


# Only feed the per-scaffold FASTA inputs to zhunt3. Its result files
# (e.g. "<name>.fna.Z-SCORE") live in the same directory, so an unfiltered
# listing would re-process them as inputs when this cell is run again.
files = sorted(name for name in os.listdir("scaffolds") if name.endswith(".fna"))

# Create a ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved; materialising the iterator makes failures (such as a missing
    # ./zhunt3 binary) visible instead of being silently discarded.
    exit_codes = list(executor.map(run_zhunt3, files))

# NOTE(review): assumes zhunt3 follows the usual 0-on-success exit convention —
# confirm against the C source. Reported as a warning only, not raised.
failed = [name for name, code in zip(files, exit_codes) if code != 0]
if failed:
    print(f"zhunt3 exited with a non-zero status for {len(failed)} file(s), e.g. {failed[:10]}")

In [None]:
import pandas as pd
def combine_overlapping_regions(zhunt_filtered) -> pd.DataFrame:
    """Merge rows whose [Start, End] intervals overlap or touch.

    Rows are sorted by ``Start``; a row is folded into the preceding merged
    region when that region's ``End`` is >= the row's ``Start``. The merged
    region's ``End`` becomes the larger of the two ``End`` values. Every
    other column (e.g. ``Score``) keeps the value of the first row of the
    merged region.

    Parameters
    ----------
    zhunt_filtered : pd.DataFrame
        Frame with at least ``Start`` and ``End`` columns. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame sorted by ``Start`` with a fresh 0..n-1 index and one row
        per merged region.
    """
    ordered = zhunt_filtered.sort_values("Start").reset_index(drop=True)
    if len(ordered) < 2:
        # Nothing to merge; return the sorted copy as-is (keeps dtypes intact).
        return ordered

    starts = ordered["Start"].to_numpy()
    ends = ordered["End"].to_numpy()

    # Single linear sweep instead of dropping/reindexing the frame on every
    # merge, which made the original loop quadratic in the number of rows.
    kept_rows = []     # positional index of the first row of each merged region
    merged_ends = []   # running End of each merged region
    for pos in range(len(ordered)):
        if kept_rows and merged_ends[-1] >= starts[pos]:
            merged_ends[-1] = max(merged_ends[-1], ends[pos])
        else:
            kept_rows.append(pos)
            merged_ends.append(ends[pos])

    merged = ordered.iloc[kept_rows].reset_index(drop=True)
    merged["End"] = merged_ends
    return merged

In [None]:
# combine all the output files into one bed file with columns as scaffold, start, end, score
# output files have the same name as input files with .Z-SCORE extension
# 

import pandas as pd


# all_files = os.listdir("scaffolds")
# output_files = [file for file in all_files if file.endswith(".Z-SCORE")]

# Only regions with a Score strictly above this value are kept.
SCORE_THRESHOLD = 1000

# Build one filtered + merged frame per scaffold and concatenate once at the
# end. Growing `final` with pd.concat inside the loop copies every earlier row
# on each iteration, and concatenating onto an empty frame is deprecated in
# recent pandas.
per_scaffold = []

for record in SeqIO.parse(file, "fasta"):
    scaffold = record.id
    output_file_name = f"scaffolds/{scaffold}.fna.Z-SCORE"
    # The first line of the zhunt3 output is skipped; the remaining
    # whitespace-separated columns are named explicitly.
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True.
    zhunt = pd.read_csv(
        output_file_name,
        skiprows=1,
        names=["Start","End","1","2","3","Score","Seq","4"],
        sep=r"\s+",
    )
    zhunt["Scaffold"] = scaffold
    zhunt = zhunt[["Scaffold","Start","End","Score"]]
    # filter by score
    zhunt = zhunt[zhunt["Score"] > SCORE_THRESHOLD]
    # combine regions if they overlap
    zhunt = combine_overlapping_regions(zhunt)

    per_scaffold.append(zhunt)

if per_scaffold:
    final = pd.concat(per_scaffold, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns.
    final = pd.DataFrame(columns=["Scaffold","Start","End","Score"])

In [None]:
# Preview the first 10 rows of the combined table.
final.head(10)

In [None]:
# Number of rows in the combined table.
len(final)

In [None]:
# save the final dataframe to a bed file
# Written tab-separated, without a header row and without the index column.
# NOTE(review): BED is conventionally 0-based, half-open; the Start/End values are
# written exactly as read from the zhunt3 output — confirm the coordinate convention.
final.to_csv("zhunt.bed", sep="\t", index=False, header=False)

In [None]:
# !./zhunt3 12 8 12 "../ncbi_dataset/data/GCF_000787575.1/GCF_000787575.1_Asub_2.0_genomic.fna"