In [None]:
from utils.bedtools_utils import bbtools_kmercountexact
from pathlib import Path
from Bio import SeqIO
import argparse
import re


test = True
if test:
    input_fasta_dir = Path('/home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/')
    params_min_fasta_size = 35
    params_kmer_size = 35
    params_min_kmer_counts = 35
    params_threads = 5
    output_seeds_dir = Path('/home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/')
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-fasta-dir', required=True, help="directory with the sequences useed to make seeds")    
    parser.add_argument('--params-min-fasta-size', default=35, help='Minimum number of reads reqeried in each fasta')
    parser.add_argument('--params-kmersize', default=30, help='Size of K-mer')
    parser.add_argument('--params-threads', default=5)
    parser.add_argument('--output-seeds-dir', required=True, help='Output directory with the seeds tables in tsv format')
    args = parser.parse_args()

    input_fasta_dir        = args.input_fasta_dir
    params_min_fasta_size  = args.params_min_fasta_size
    params_kmer_size       = args.params_kmer_size
    params_min_kmer_counts = args.params_min_kmer_counts
    params_threads         = args.params_threads
    output_seeds_dir       = args.output_seeds_dir

output_seeds_dir.mkdir(exist_ok=True)


# Loop over files in the input directory
for fasta in input_fasta_dir.iterdir():
    
    if not fasta.is_file():
        continue

    if not re.search(r"\.(fasta|fa|fastq)$", fasta.name):
        print(f"❌ Skipping non-FASTA/Q file: {fasta}")
        continue
    
    # Count the number of entries in the sequence file
    q = sum(1 for _ in SeqIO.parse(fasta, "fasta"))  # or "fastq" depending on format
    
    if q < params_min_fasta_size:
        print(f"⚠️ Skipping {fasta.name}: only {q} entries (min is {params_min_fasta_size})")
        continue

    # Remove file extension using regex
    file_name = re.sub(r"\.(fasta|fa|fastq)$", "", fasta.name)
    
    output_seed_tsv = output_seeds_dir / f"{file_name}.tsv"
    
    bbtools_kmercountexact(
        infile=fasta,
        outfile=output_seed_tsv,
        k=params_kmer_size,
        min_counts=params_min_kmer_counts,
        khist=None,
        make_fasta=False,
        threads=params_threads,
        extra=None,
        exe="kmercountexact.sh")
    

❌ Skipping non-FASTA/Q file: /home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/.snakemake_timestamp
⚠️ Skipping cluster_2.fasta: only 0 entries (min is 35)
▶ Running: kmercountexact.sh in=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta k=35 threads=5 mincount=35 fastadump=f out=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/cluster_1.tsv overwrite=true


java -ea -Xmx170951m -Xms170951m -cp /usr/local/bin/current/ jgi.KmerCountExact in=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta k=35 threads=5 mincount=35 fastadump=f out=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/cluster_1.tsv overwrite=true
Executing jgi.KmerCountExact [in=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta, k=35, threads=5, mincount=35, fastadump=f, out=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/cluster_1.tsv, overwrite=true]

Set threads to 5
Initial:
Memory: max=179281m, total=179281m, free=179195m, used=86m

Executing ukmer.KmerTableSetU [in=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta, k=35, threads=5, mincount=35, fastadump=f, out=/home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/cluster_1.tsv, overwrite=true]

K was changed from 35 to 34
Initial:
Ways=31, initialSize=128000, prefilter=f, preall


# Old script used to generate seeds

In [7]:
############################################ Loading libraries and user arguments ###########################################

import os
from utils.blast_utils import unnoise_coords, makeblast_db, blastn, alignment_absolute_start_end, select_internal_aligns, retrieve_single_alignments
import pandas as pd
import matplotlib.pyplot as plt
import argparse
from Bio import SeqIO
from pathlib import Path  

test = True
if test:
    print('Warning! Developer mode activated')
    input_fasta = '/home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta'
    param_min_instances = 1
    param_threads = 90
    params_cluster_identity = 0.97
    params_cluster_coverage = 0.97
    params_select_internals_alns = True
    params_retrive_sinlge_aln = False
    
    output_dir = Path('/home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/')
    work_name = 'single_95_95'
    work_name = 'all_93_93'
    work_name = 'all_97_97'
     
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-fasta')
    parser.add_argument('--input-clust-names', type=float, default=95)
    parser.add_argument('--input-inner-fasta-tsv', type=int, default=50)
    parser.add_argument('--param-threads', type=int, default=20)
    parser.add_argument('--output-blocks-fasta')
    
    # Capture the parsed arguments in a variable (args)
    args = parser.parse_args()
    
    input_fasta = args.input_fasta
    input_inner_fasta_tsv = args.input_inner_fasta_tsv
    param_threads = args.param_threads
    output_blocks_fasta = args.output_blocks_fasta
    
if work_name:
    output_dir = output_dir / work_name    

output_dir.mkdir(exist_ok=True, parents=True)

############################################ Preparing sequences for iterative block finding ###########################################

output_reciprocal_blast_tsv = output_dir / "blastn.infasta_reciprocal.tsv"
infasta_db_dir = os.path.join(os.path.dirname(output_reciprocal_blast_tsv), 'blast_db', 'infasta_db') #'/home/jpereira/OEs/OE1.v2/Data/blocks/blast_db/infasta_db' 
os.makedirs(infasta_db_dir, exist_ok=True)

## Make a reciprocal blast using blastn function from blast utils
infasta_db = makeblast_db(seqs_path=input_fasta, db_out=infasta_db_dir, remove_old_db=True)
blastn(blast_input_seqs=input_fasta, blast_db_file=infasta_db, blast_output_table_tsv=output_reciprocal_blast_tsv,
             num_threads=100, reward=2, gap_extend=2, gapopen=4, penalty=-2, word_size=15)


##### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

############################################ Preparing sequences for iterative block finding ###########################################




blast_df = pd.read_csv(output_reciprocal_blast_tsv, sep='\t')
blast_df.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 
                    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']

blast_size = blast_df.shape[0]
print(f'Org_size: {blast_size}')
s0_df = blast_df[(blast_df['pident'] > 90) & (blast_df['length'] > 15)]
print(f'Filtered pident 90%: {s0_df.shape[0]/blast_size}')
s1_df = alignment_absolute_start_end(s0_df)
s2_df = select_internal_aligns(df=s1_df,border=10) 

# Filter by query alignment interval
if params_select_internals_alns:
    s3_df = s2_df# blast.fuse_contained_intervals(df=s2_df, start_col='qstart', end_col='qend', group_cols=['qseqid', 'sseqid'], keep_highest_score_col='bitscore')
else:
    s3_df = s1_df

##### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


############################################ Iterative Block finding ###########################################

import warnings
warnings.filterwarnings('ignore')

import shutil
import pandas as pd
from utils.bedtools_utils import bedtools_getfasta, coords_to_regions, blast_to_bed
from utils.vsearch_utils import vsearch_dereplication, vsearch_clustersize, vsearch_sortbysize


iteration_dir = output_dir /  'block_iterations'
print(f"Iteration dir: {iteration_dir}")
if os.path.isdir(iteration_dir): 
    print("Removieng previous iteration dir")
    shutil.rmtree(iteration_dir)
iteration_dir.mkdir(exist_ok=True)

iteration = 0
num_blocks = []
while iteration < 2:
    
    print(f"\n###### ITERATION {iteration} ######\n")

    in_fasta                         = iteration_dir / f'{iteration}'     / 'centroids.fasta'
    blast_db                         = iteration_dir / f'{iteration + 1}' / 'blast_db'
    reciprocal_blast_tsv             = iteration_dir / f'{iteration + 1}' / 'blastn.reciprocal.tsv'
    coords_bed                       = iteration_dir / f'{iteration + 1}' / 'regions.bed'
    extracted_regions_fasta          = iteration_dir / f'{iteration + 1}' / 'regions.fasta'
    retrieve_coords_bed              = iteration_dir / f'{iteration}'     / 'retrieve' / 'regions.bed'
    retrieve_extracted_regions_fasta = iteration_dir / f'{iteration}'     / 'retrieve' / 'regions.fasta'
    derep_fasta                      = iteration_dir / f'{iteration + 1}' / 'derep.fasta'
    sorted_fasta                     = iteration_dir / f'{iteration + 1}' / 'sorted.fasta'
    centroids_fasta                  = iteration_dir / f'{iteration + 1}' / 'centroids.fasta'
    clusters_uc                      = iteration_dir / f'{iteration + 1}' / 'clusters.uc'
    
    iteration_path = os.path.dirname(clusters_uc)
    iteration_log =  os.path.join(iteration_path, 'logs')
    
    (clusters_uc.parent).mkdir( exist_ok=True)
    (retrieve_coords_bed.parent).mkdir(exist_ok=True, parents=True)
    
    ## Make a reciprocal blast using blastn function from blast utils
    if iteration == 0:
        blast_df = s3_df
        in_fasta = input_fasta
    else:
        blast_db = makeblast_db(seqs_path=in_fasta, db_out=blast_db, remove_old_db=True, log_file=iteration_log)
        
        blastn(blast_input_seqs=in_fasta,
               blast_db_file=blast_db,
               blast_output_table_tsv=reciprocal_blast_tsv,
               num_threads=80, reward=1, gap_extend=5, gapopen=5,
               penalty=-1, word_size=10, log_file=iteration_log)
        
        blast_df = pd.read_csv(reciprocal_blast_tsv, sep='\t')
        blast_df.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 
                    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']

    blast_df = blast_df[blast_df['pident'] > 90]
    blast_df = blast_df[blast_df['length'] > 20]
    
    if params_retrive_sinlge_aln:
        # Retrive aligmentes that only map with itself
        retrive_df, blast_df = retrieve_single_alignments(blast_df)
        if retrive_df.shape[0] != 0:
            print(f"Retrieving sinlge mapped blocks iteration: {iteration}")
            retrive_bed_df = blast_to_bed(retrive_df)
            retrive_bed_df = retrive_bed_df.drop_duplicates()
            retrive_bed_df.to_csv(retrieve_coords_bed, sep='\t', header=False, index=False)
            bedtools_getfasta(fasta_path=in_fasta, bed_path=retrieve_coords_bed, extracted_regions_path=retrieve_extracted_regions_fasta, show_command=True, log_file=iteration_log)
    else:
        print(f"Retrieving all blocks iteration: {iteration}")
        retrive_bed_df = blast_to_bed(blast_df)
        retrive_bed_df = retrive_bed_df.drop_duplicates()
        retrive_bed_df.to_csv(retrieve_coords_bed, sep='\t', header=False, index=False)
        bedtools_getfasta(fasta_path=in_fasta, bed_path=retrieve_coords_bed, extracted_regions_path=retrieve_extracted_regions_fasta, show_command=True, log_file=iteration_log)
    
    print(f"Number of Retrived Seeds in iteration {iteration}: {len(retrive_bed_df)}")

        
    if blast_df.empty:
        print("Descomposition finished, all blocks are independent")
        break

    unnoised_df = blast_df.groupby('qseqid').apply(lambda x: unnoise_coords(x, radious=15))
    unnoised_df = unnoised_df.reset_index().drop(columns='level_1')

    # Filter coordinates with low number of instances 
    unnoised_df = unnoised_df[unnoised_df['instances'] > param_min_instances]

    # Apply the function across groups
    get_fasta_df = unnoised_df.groupby('qseqid', group_keys=False).apply(coords_to_regions)
    get_fasta_df.to_csv(coords_bed, sep='\t', header=False, index=False)
    bedtools_getfasta(fasta_path=in_fasta, bed_path=coords_bed, extracted_regions_path=extracted_regions_fasta, show_command=True, log_file=iteration_log)
    vsearch_dereplication(input_fasta=extracted_regions_fasta, derep_fasta=derep_fasta, min_seq_length=32, log_file=iteration_log)
    vsearch_sortbysize(derep_fasta=derep_fasta, sorted_fasta=sorted_fasta, log_file=iteration_log)
    vsearch_clustersize( sorted_fasta=sorted_fasta, centroids_fasta=centroids_fasta, uc_file=clusters_uc,query_cov=params_cluster_coverage,
                        target_cov=params_cluster_coverage, id_thresh=params_cluster_identity, min_seq_length=32, log_file=iteration_log,
                        use_both_strands=True,use_sizein=True)
    
    
    iteration += 1

##### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -   



Building a new DB, current time: 06/27/2025 18:13:50
New DB name:   /home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/all_97_97/blast_db/infasta_db/cluster_1.fasta
New DB title:  /home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 209 sequences in 0.00580883 seconds.


makeblastdb ran successfully.
Running: blastn -query /home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta -db /home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/all_97_97/blast_db/infasta_db/cluster_1.fasta -out /home/jpereira/OEs/Results/OE1/NamSeqs/Data/make_seeds/all_97_97/blastn.infasta_reciprocal.tsv -num_threads 100 -reward 2 -gapextend 2 -gapopen 4 -penalty -2 -word_size 15 -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen'
blastn ran successfully.
Org_size: 2604

In [6]:
retrive_bed_df.drop_duplicates()

Unnamed: 0,qseqid,qstart,qend
0,Seq1;size=92,1,45
8,Seq1;size=92,1,44
12,Seq1;size=92,3,45
13,Seq1;size=92,1,43
16,Seq1;size=92,1,42
...,...,...,...
33766,Seq988;size=1,44,85
33767,Seq988;size=1,50,85
33768,Seq988;size=1,44,79
33769,Seq988;size=1,52,89
