In [6]:
# Imports
from Bio import pairwise2, Align, SeqIO
from multiprocessing import Pool
from psutil import cpu_count
from functools import partial
import numpy as np

from time import time

def _nwalign_pct_id(a,b):
    """
    Wrapper for pairwise2.align.globaldx() that can be used by partials()
    Computes global alignment score with free end gaps, and returns percent identity.
    """
    if a.seq != b.seq:
        # Compute alignment and store aligned sequences
        align = pairwise2.align.globalxs(
            a.seq, b.seq, open=-5, extend=-5, penalize_end_gaps=False,
            one_alignment_only=True
        )[0]

        # Store aligned sequences
        a = align.seqA
        b = align.seqB
        denom = align.end - align.start

        # Compute percent identity
        ident = 0
        for i,j in zip(a,b):
            if i == j: ident += 1
        p_id = ident/denom
    else:
        p_id = 1

    return p_id

def fasta_removeredundants_multi(file: str, identthreshold: float=0.95, maxtime=0):
    # Read proteins from file
    ptns = list( SeqIO.parse(file, 'fasta') )[:6]
    N = len(ptns)

    # Timing
    tic = time()
    stopped = False

    # Loop through list of proteins
    avoid = []
    numaligns = 0
    for i in range(N-1):
        if i in avoid:
            continue
        with Pool(cpu_count()) as pool:
            identities = pool.map(partial(_nwalign_pct_id, ptns[i]), ptns[i+1:])

        # Get indices of proteins that exceed threshold
        identities = np.array(identities)
        ind = i+1 + np.where( identities > identthreshold )[0]
        avoid.extend(ind.tolist())
        
        # Timing
        numaligns += N-i
        if maxtime !=0 and time()-tic > maxtime:
            print(f"---WARNING: I only processed up to the {i:d}th protein")
            stopped = True
            break

    # Timing
    toc = time()-tic;
    numtotalaligns = N*(N-1)/2;
    print("Completed {:d} of {:.2f} comparisons in {:.2f}secs ({:.2f}mins) ({:.3f}secs per comparison)"\
            .format(numaligns, numtotalaligns, toc, toc/60, toc/numaligns))
    if stopped:
        print("Full comparison would have taken around {:.2f}mins.".format(toc/numaligns * numtotalaligns/60))

    return avoid

In [None]:
def fasta_removeredundants_loop(file: str, identthreshold: float=0.90):
    """
    Remove Redundant Proteins From BLAST Results (FASTA file)

    This function takes a fasta file containing aligned sequences from BLAST
    and writes a file with the extension 'nonredundant.fasta' that contains
    the non-redundant proteins from the fasta file.
    It removes entries thate are more than 'identthreshold' similar to any prior
    entry in the file.
    Similarity is calculated based on global alignment scores with free end gaps.
    The function returns the fraction of sequences removed.

    Parameters
    ----------
    file: str
        The file containing BLAST results
    identthreshold: float
        Threshold for similarity between sequences
    """

    # Read proteins from file
    ptns = list( SeqIO.parse(file, 'fasta') )
    ptns = ptns[:6]
    N = len(ptns)

    # Timing
    tic = time()

    # Loop through list of proteins
    avoid = []
    for i in range(N-1):
        if i in avoid:
            continue
        for j in range(i+1, N):
            if ptns[i].seq != ptns[j].seq:
                pct_identity = _nwalign_pct_id(ptns[i], ptns[j])

                if pct_identity > identthreshold:
                    # Avoid samples
                    avoid.append(j)
                
    # Timing
    toc = time() - tic
    print(f'That took {toc:.2f}secs ({toc/60:.2f}mins)')

    return avoid

In [7]:
fasta_removeredundants_multi("blastresult.fasta", 0.90, 10)

Completed 5019 of 506521.00 comparisons in 10.06secs (0.17mins) (0.002secs per comparison)
Full comparison would have taken around 16.92mins.


[1, 4]