In [1]:
############################################ Loading libraries and user arguments ###########################################

import os
from utils.blast_utils import unnoise_coords, makeblast_db, blastn, alignment_absolute_start_end, select_internal_aligns, retrieve_single_alignments
import pandas as pd
import matplotlib.pyplot as plt
import argparse
from Bio import SeqIO
from pathlib import Path  

test = True
if test:
    print('Warning! Developer mode activated')
    input_fasta = '/home/jpereira/OEs/Results/OE1/NamSeqs/Data/seed_extension/all_97_97/blocks.fasta'
    input_fasta = '/home/jpereira/OEs/Results/OE1/NamSeqs/Data/seed_extension/all_97_97_slice_Mp_B/blocks.fasta'
    input_fasta = '/home/jpereira/OEs/Blocks_Namasivayam.fa'
    param_min_instances = 0 # Must be always 0 when you are working with extended seeds
    param_threads = 20
    params_cluster_identity = 0.85
    params_cluster_coverage = 0.85
    params_select_internals_alns = False # Must be always false when you are working with extended seeds
    params_retrive_sinlge_aln = False
    
    output_dir = Path('/home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/')
    #work_name = 'single_95_95'
    work_name = 'all_95_95'
    work_name = 'all_93_93'
    work_name = 'all_97_97'
    work_name = 'all_97_97_slice_Mp_B'
    work_name = 'all_97_97_slice_Mp_B.2'
    work_name = 'all_97_97_s70'
    work_name = 'kmer30'
    work_name = 'NamBlocks_trim'
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-fasta')
    parser.add_argument('--input-clust-names', type=float, default=95)
    parser.add_argument('--input-inner-fasta-tsv', type=int, default=50)
    parser.add_argument('--param-threads', type=int, default=20)
    parser.add_argument('--output-blocks-fasta')
    
    # Capture the parsed arguments in a variable (args)
    args = parser.parse_args()
    
    input_fasta = args.input_fasta
    input_inner_fasta_tsv = args.input_inner_fasta_tsv
    param_threads = args.param_threads
    output_blocks_fasta = args.output_blocks_fasta
    
if work_name:
    output_dir = output_dir / work_name    

output_dir.mkdir(exist_ok=True, parents=True)


##### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


output_reciprocal_blast_tsv = output_dir / "blastn.infasta_reciprocal.tsv"
infasta_db_dir = os.path.join(os.path.dirname(output_reciprocal_blast_tsv), 'blast_db', 'infasta_db') #'/home/jpereira/OEs/OE1.v2/Data/blocks/blast_db/infasta_db' 
os.makedirs(infasta_db_dir, exist_ok=True)

## Make a reciprocal blast using blastn function from blast utils
infasta_db = makeblast_db(seqs_path=input_fasta, db_out=infasta_db_dir, remove_old_db=True)
blastn(blast_input_seqs=input_fasta, blast_db_file=infasta_db, blast_output_table_tsv=output_reciprocal_blast_tsv,
             num_threads=100, reward=2, gap_extend=2, gapopen=4, penalty=-2, word_size=15)


############################################ Preparing sequences for iterative block finding ###########################################

blast_df = pd.read_csv(output_reciprocal_blast_tsv, sep='\t')
blast_df.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 
                    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']

blast_size = blast_df.shape[0]
print(f'Org_size: {blast_size}')
s0_df = blast_df[(blast_df['pident'] > 90) & (blast_df['length'] > 15)]
print(f'Selected pident 90%: {s0_df.shape[0]/blast_size}')
s1_df = alignment_absolute_start_end(s0_df)
s2_df = select_internal_aligns(df=s1_df,border=10) 

# Filter by query alignment interval
if params_select_internals_alns:
    s3_df = s2_df# blast.fuse_contained_intervals(df=s2_df, start_col='qstart', end_col='qend', group_cols=['qseqid', 'sseqid'], keep_highest_score_col='bitscore')
else:
    s3_df = s1_df

##### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



Building a new DB, current time: 08/29/2025 13:57:15
New DB name:   /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/blast_db/infasta_db/Blocks_Namasivayam.fa
New DB title:  /home/jpereira/OEs/Blocks_Namasivayam.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 24 sequences in 0.00122404 seconds.


makeblastdb ran successfully.
Running: blastn -query /home/jpereira/OEs/Blocks_Namasivayam.fa -db /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/blast_db/infasta_db/Blocks_Namasivayam.fa -out /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/blastn.infasta_reciprocal.tsv -num_threads 100 -reward 2 -gapextend 2 -gapopen 4 -penalty -2 -word_size 15 -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen'
blastn ran successfully.
Org_size: 31
Selected pident 90%: 1.0


In [2]:
from Bio import SeqIO

def get_fasta_from_bed(fasta_file, bed_file, output_file):
    # Read the FASTA file into a dictionary for quick access
    fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
    
    with open(bed_file, 'r') as bed, open(output_file, 'w') as out:
        for line in bed:
            if line.strip() == "" or line.startswith("#"):
                continue  # Skip empty lines and comments

            chrom, start, end = line.strip().split()[:3]
            start, end = int(start), int(end)
            
            if chrom not in fasta_dict:
                raise ValueError(f"Sequence {chrom} not found in FASTA file.")
            
            seq = fasta_dict[chrom].seq[start:end]
            out.write(f">{chrom}:{start}-{end}\n{seq}\n")


In [3]:



############################################ Iterative Block finding ###########################################

import warnings
warnings.filterwarnings('ignore')

import shutil
import pandas as pd
from utils.bedtools_utils import bedtools_getfasta, coords_to_regions, blast_to_bed
from utils.vsearch_utils import vsearch_dereplication, vsearch_clustersize, vsearch_sortbysize


iteration_dir = output_dir /  'block_iterations'
print(f"Iteration dir: {iteration_dir}")
if os.path.isdir(iteration_dir): 
    print("Removieng previous iteration dir")
    shutil.rmtree(iteration_dir)
iteration_dir.mkdir(exist_ok=True)

iteration = 0
num_blocks = []
while iteration < 2:
    
    print(f"\n###### ITERATION {iteration} ######\n")

    in_fasta                         = iteration_dir / f'{iteration}'     / 'centroids.fasta'
    blast_db                         = iteration_dir / f'{iteration + 1}' / 'blast_db'
    reciprocal_blast_tsv             = iteration_dir / f'{iteration + 1}' / 'blastn.reciprocal.tsv'
    coords_bed                       = iteration_dir / f'{iteration + 1}' / 'regions.bed'
    extracted_regions_fasta          = iteration_dir / f'{iteration + 1}' / 'regions.fasta'
    retrieve_coords_bed              = iteration_dir / f'{iteration}'     / 'retrieve' / 'regions.bed'
    retrieve_extracted_regions_fasta = iteration_dir / f'{iteration}'     / 'retrieve' / 'regions.fasta'
    derep_fasta                      = iteration_dir / f'{iteration + 1}' / 'derep.fasta'
    sorted_fasta                     = iteration_dir / f'{iteration + 1}' / 'sorted.fasta'
    centroids_fasta                  = iteration_dir / f'{iteration + 1}' / 'centroids.fasta'
    clusters_uc                      = iteration_dir / f'{iteration + 1}' / 'clusters.uc'
    
    iteration_path = os.path.dirname(clusters_uc)
    iteration_log =  os.path.join(iteration_path, 'logs')
    
    (clusters_uc.parent).mkdir( exist_ok=True)
    (retrieve_coords_bed.parent).mkdir(exist_ok=True, parents=True)
    
    ## Make a reciprocal blast using blastn function from blast utils
    if iteration == 0:
        blast_df = s3_df
        in_fasta = input_fasta
    else:
        blast_db = makeblast_db(seqs_path=in_fasta, db_out=blast_db, remove_old_db=True, log_file=iteration_log)
        
        blastn(blast_input_seqs=in_fasta,
               blast_db_file=blast_db,
               blast_output_table_tsv=reciprocal_blast_tsv,
               num_threads=80, reward=1, gap_extend=5, gapopen=5,
               penalty=-1, word_size=10, log_file=iteration_log)
        
        blast_df = pd.read_csv(reciprocal_blast_tsv, sep='\t', header=None)
        blast_df.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 
                    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']
        
    if iteration == 0:
        blast_df0 = blast_df.copy()

    blast_df = blast_df[blast_df['pident'] > 90]
    blast_df = blast_df[blast_df['length'] > 20]
    
    if params_retrive_sinlge_aln:
        # Retrive aligmentes that only map with itself
        retrive_df, blast_df = retrieve_single_alignments(blast_df)
        if retrive_df.shape[0] != 0:
            print(f"Retrieving sinlge mapped blocks iteration: {iteration}")
            retrive_bed_df = blast_to_bed(retrive_df)
            retrive_bed_df = retrive_bed_df.drop_duplicates()
            retrive_bed_df.to_csv(retrieve_coords_bed, sep='\t', header=False, index=False)
            #bedtools_getfasta(fasta_path=in_fasta, bed_path=retrieve_coords_bed, extracted_regions_path=retrieve_extracted_regions_fasta, show_command=True, log_file=iteration_log)
            get_fasta_from_bed(in_fasta, retrieve_coords_bed, retrieve_extracted_regions_fasta)
    else:
        print(f"Retrieving all blocks iteration: {iteration}")
        retrive_bed_df = blast_to_bed(blast_df)
        retrive_bed_df = retrive_bed_df.drop_duplicates()
        retrive_bed_df.to_csv(retrieve_coords_bed, sep='\t', header=False, index=False)
        #bedtools_getfasta(fasta_path=in_fasta, bed_path=retrieve_coords_bed, extracted_regions_path=retrieve_extracted_regions_fasta, show_command=True, log_file=iteration_log)
        get_fasta_from_bed(in_fasta, retrieve_coords_bed, retrieve_extracted_regions_fasta)
    
    print(f"Number of Retrived Seeds in iteration {iteration}: {len(retrive_bed_df)}")

        
    if blast_df.empty:
        print("Descomposition finished, all blocks are independent")
        break

    unnoised_df = blast_df.groupby('qseqid').apply(lambda x: unnoise_coords(x, radious=10))
    unnoised_df = unnoised_df.reset_index().drop(columns='level_1')

    # Filter coordinates with low number of instances 
    unnoised_df = unnoised_df[unnoised_df['instances'] > param_min_instances]
    
    if iteration == 0:
        unnoised_df0 = unnoised_df.copy()


    # Apply the function across groups
    get_fasta_df = unnoised_df.groupby('qseqid', group_keys=False).apply(coords_to_regions)
    get_fasta_df.to_csv(coords_bed, sep='\t', header=False, index=False)
    #bedtools_getfasta(fasta_path=in_fasta, bed_path=coords_bed, extracted_regions_path=extracted_regions_fasta, show_command=True, log_file=iteration_log)
    get_fasta_from_bed(in_fasta, coords_bed, extracted_regions_fasta)
    print(extracted_regions_fasta)
    vsearch_dereplication(input_fasta=extracted_regions_fasta, derep_fasta=derep_fasta, min_seq_length=20, log_file=iteration_log)
    vsearch_sortbysize(derep_fasta=derep_fasta, sorted_fasta=sorted_fasta, log_file=iteration_log)
    vsearch_clustersize( sorted_fasta=sorted_fasta, centroids_fasta=centroids_fasta, uc_file=clusters_uc,query_cov=params_cluster_coverage,
                        target_cov=params_cluster_coverage, id_thresh=params_cluster_identity, min_seq_length=20, log_file=iteration_log,
                        use_both_strands=True,use_sizein=True)
    
    
    iteration += 1

##### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -   


Iteration dir: /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/block_iterations

###### ITERATION 0 ######

Retrieving all blocks iteration: 0
Number of Retrived Seeds in iteration 0: 28
/home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/block_iterations/1/regions.fasta
Executing command: vsearch --derep_fulllength /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/block_iterations/1/regions.fasta --output /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/block_iterations/1/derep.fasta --sizeout --relabel Seq --minseqlength 20
Executing command: vsearch --sortbysize /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/block_iterations/1/derep.fasta --output /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/block_iterations/1/sorted.fasta
Executing command: vsearch --cluster_size /home/jpereira/OEs/Results/OE1/NamSeqs/Data/TrimNam/NamBlocks_trim/block_iterations/1/sorted.fasta --id 0.85 --

In [56]:
blast_df = pd.read_csv(reciprocal_blast_tsv, sep='\t')
blast_df


Unnamed: 0,Seq1;size=4,Seq1;size=4.1,100.000,82,0,0.1,1,82.1,1.1,82.2,8.50e-35,131,82.3,82.4
0,Seq1;size=4,Seq1;size=4,100.000,10,0,0,10,19,19,10,1.900000e+00,17.4,82,82
1,Seq1;size=4,Seq18;size=1,100.000,10,0,0,53,62,72,63,1.900000e+00,17.4,82,83
2,Seq2;size=4,Seq2;size=4,100.000,111,0,0,1,111,1,111,1.810000e-48,177.0,111,111
3,Seq3;size=4,Seq3;size=4,100.000,160,0,0,1,160,1,160,1.140000e-71,255.0,160,160
4,Seq3;size=4,Seq27;size=1,84.211,19,3,0,71,89,154,172,1.600000e-01,22.2,160,334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Seq36;size=1,Seq22;size=1,92.308,13,1,0,657,669,22,10,9.700000e+00,19.0,1001,89
118,Seq39;size=1,Seq39;size=1,100.000,133,0,0,1,133,1,133,7.040000e-59,212.0,133,133
119,Seq39;size=1,Seq36;size=1,73.333,30,8,0,74,103,753,724,4.200000e-02,23.8,133,1001
120,Seq39;size=1,Seq24;size=1,100.000,10,0,0,88,97,38,47,3.400000e+00,17.4,133,355


In [45]:
unnoised_df0.sort_values('coords').tail(10)

Unnamed: 0,qseqid,coords,instances
71,block_25,742,2.0
57,block_23,781,4.0
36,block_19,881,13.0
68,block_25,886,11.0
35,block_19,965,15.0
67,block_25,970,15.0
56,block_23,977,8.0
79,block_27,1235,2.0
78,block_27,1440,5.0
77,block_27,1601,9.0


In [32]:
len(unnoised_df0['qseqid'].unique())
unnoised_df0.groupby('qseqid', group_keys=False).apply(coords_to_regions)

Unnamed: 0,id,start,end
0,block_1,1,85
0,block_10,1,70
1,block_10,70,89
2,block_10,89,171
0,block_11,1,145
...,...,...,...
1,block_8,162,173
2,block_8,173,306
3,block_8,306,323
4,block_8,323,405


In [34]:
extracted_regions_fasta

PosixPath('/home/jpereira/OEs/Results/OE1/NamSeqs/Data/define_blocks/all_97_97_slice_Mp_B.2/block_iterations/2/regions.fasta')