**Fasta similarity filter**

Takes a FASTA file as input and runs a pairwise alignment of each sequence against all the others, printing each sequence's % similarity to its nearest neighbour. It then removes any sequence whose similarity to an already-kept sequence is at or above a defined cutoff (95% by default), and writes the remainder to a new FASTA file. Finally, it repeats the comparison on the filtered sequences and prints their closest matches again.

In [None]:
%pip install biopython

from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# Function to calculate sequence similarity
def calculate_similarity(seq1, seq2):
    """Return how similar seq1 is to seq2, as a percentage of seq1's length.

    The score comes from ``pairwise2.align.globalxx`` (a global alignment;
    see the Biopython documentation for its scoring scheme) and is divided
    by ``len(seq1)``. The result is therefore NOT symmetric:
    ``calculate_similarity(a, b)`` and ``calculate_similarity(b, a)`` differ
    whenever the two sequences have different lengths.

    Args:
        seq1: Query sequence (string); its length is the denominator.
        seq2: Sequence to compare against (string).

    Returns:
        The similarity as a number in percent, or 0.0 if either sequence
        is empty.
    """
    # An empty sequence cannot be aligned and would otherwise cause an
    # IndexError (no alignments returned) or a division by zero.
    if not seq1 or not seq2:
        return 0.0
    # Only the best score is needed; score_only=True avoids materialising
    # every equally-scoring alignment, which can be very numerous.
    alignment_score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    return (alignment_score / len(seq1)) * 100

# Ask the user for the input FASTA file via the Colab upload widget
from google.colab import files
uploaded = files.upload()

# Load protein FASTA file: the first uploaded name is used as the path
input_file = list(uploaded)[0]
sequences = list(SeqIO.parse(input_file, "fasta"))

# Run Clustal Omega on the uploaded file.
# NOTE(review): the code in this cell never reads output.aln back.
clustal_output = "output.aln"
clustalomega_cline = ClustalOmegaCommandline(
    infile=input_file,
    outfile=clustal_output,
    verbose=True,
    auto=True,
    force=True,
)
stdout, stderr = clustalomega_cline()

# Filter sequences based on similarity
# Similarity cutoff in percent: a sequence is dropped when it is at least this
# similar to a sequence that has already been kept.
cutoff_percentage = 95

# Greedy pass in input order: the first member of each group of near-identical
# sequences is kept and later members are discarded.
filtered_sequences = []
for record in sequences:
    is_redundant = any(
        calculate_similarity(str(record.seq), str(kept.seq)) >= cutoff_percentage
        for kept in filtered_sequences
    )
    if not is_redundant:
        filtered_sequences.append(record)

# Write filtered sequences to a new FASTA file
filtered_file = "filtered.fasta"
SeqIO.write(filtered_sequences, filtered_file, "fasta")


def _print_closest_matches(records):
    """Print, for each record, its highest similarity % to any other record.

    A record is excluded from its own comparison by position rather than by
    ID, so two distinct records that happen to share an ID are still compared
    with each other. A record with no other record to compare to prints 0.00%.
    """
    for index, record in enumerate(records):
        closest_similarity = max(
            (
                calculate_similarity(str(record.seq), str(other.seq))
                for other_index, other in enumerate(records)
                if other_index != index
            ),
            default=0,
        )
        print(f"{record.id}: {closest_similarity:.2f}%")


# Print original files with similarity percentages
print("Original files with similarity % to their closest match:")
_print_closest_matches(sequences)

# Print filtered sequences with similarity percentages
print("\nFiltered sequences with similarity % to their closest match in the filtered list:")
_print_closest_matches(filtered_sequences)


In [None]:
%pip install biopython
%apt-get install clustalo
import os
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# Function to calculate sequence similarity
def calculate_similarity(seq1, seq2):
    """Return how similar seq1 is to seq2, as a percentage of seq1's length.

    The score comes from ``pairwise2.align.globalxx`` (a global alignment;
    see the Biopython documentation for its scoring scheme) and is divided
    by ``len(seq1)``. The result is therefore NOT symmetric:
    ``calculate_similarity(a, b)`` and ``calculate_similarity(b, a)`` differ
    whenever the two sequences have different lengths.

    Args:
        seq1: Query sequence (string); its length is the denominator.
        seq2: Sequence to compare against (string).

    Returns:
        The similarity as a number in percent, or 0.0 if either sequence
        is empty.
    """
    # An empty sequence cannot be aligned and would otherwise cause an
    # IndexError (no alignments returned) or a division by zero.
    if not seq1 or not seq2:
        return 0.0
    # Only the best score is needed; score_only=True avoids materialising
    # every equally-scoring alignment, which can be very numerous.
    alignment_score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    return (alignment_score / len(seq1)) * 100

# Ask the user for the input FASTA file via the Colab upload widget
from google.colab import files
uploaded = files.upload()

# Load protein FASTA file: the first uploaded name is used as the path
input_file = list(uploaded)[0]
sequences = list(SeqIO.parse(input_file, "fasta"))

# Run Clustal Omega on the uploaded file.
# NOTE(review): the code in this cell never reads output.aln back.
clustal_output = "output.aln"
clustalomega_cline = ClustalOmegaCommandline(
    infile=input_file,
    outfile=clustal_output,
    verbose=True,
    auto=True,
    force=True,
)
stdout, stderr = clustalomega_cline()

# Filter sequences based on similarity
# Single source of truth for the cutoff (percent): it drives both the filter
# and the output file name, so the name cannot disagree with what was applied.
cutoff_percentage = 95

# Greedy pass in input order: the first member of each group of near-identical
# sequences is kept and later members are discarded.
filtered_sequences = []
for record in sequences:
    is_redundant = any(
        calculate_similarity(str(record.seq), str(kept.seq)) >= cutoff_percentage
        for kept in filtered_sequences
    )
    if not is_redundant:
        filtered_sequences.append(record)

# Write filtered sequences to a new FASTA file
input_base_name = os.path.splitext(os.path.basename(input_file))[0]
filtered_file = f"{input_base_name}_filtered_{cutoff_percentage}.fasta"
SeqIO.write(filtered_sequences, filtered_file, "fasta")


def _print_closest_matches(records):
    """Print, for each record, its highest similarity % to any other record.

    A record is excluded from its own comparison by position rather than by
    ID, so two distinct records that happen to share an ID are still compared
    with each other. A record with no other record to compare to prints 0.00%.
    """
    for index, record in enumerate(records):
        closest_similarity = max(
            (
                calculate_similarity(str(record.seq), str(other.seq))
                for other_index, other in enumerate(records)
                if other_index != index
            ),
            default=0,
        )
        print(f"{record.id}: {closest_similarity:.2f}%")


# Print original files with similarity percentages
print("Original files with similarity % to their closest match:")
_print_closest_matches(sequences)

# Print filtered sequences with similarity percentages
print("\nFiltered sequences with similarity % to their closest match in the filtered list:")
_print_closest_matches(filtered_sequences)

print(f"\nFiltered sequences saved to: {filtered_file}")


In [None]:
%pip install biopython
%apt-get install clustalo
import os
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# Function to calculate sequence similarity
def calculate_similarity(seq1, seq2):
    """Return how similar seq1 is to seq2, as a percentage of seq1's length.

    The score comes from ``pairwise2.align.globalxx`` (a global alignment;
    see the Biopython documentation for its scoring scheme) and is divided
    by ``len(seq1)``. The result is therefore NOT symmetric:
    ``calculate_similarity(a, b)`` and ``calculate_similarity(b, a)`` differ
    whenever the two sequences have different lengths.

    Args:
        seq1: Query sequence (string); its length is the denominator.
        seq2: Sequence to compare against (string).

    Returns:
        The similarity as a number in percent, or 0.0 if either sequence
        is empty.
    """
    # An empty sequence cannot be aligned and would otherwise cause an
    # IndexError (no alignments returned) or a division by zero.
    if not seq1 or not seq2:
        return 0.0
    # Only the best score is needed; score_only=True avoids materialising
    # every equally-scoring alignment, which can be very numerous.
    alignment_score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    return (alignment_score / len(seq1)) * 100

# Ask the user for the input FASTA file via the Colab upload widget
from google.colab import files
uploaded = files.upload()

# Load protein FASTA file: the first uploaded name is used as the path
input_file = list(uploaded)[0]
sequences = list(SeqIO.parse(input_file, "fasta"))

# Run Clustal Omega on the uploaded file.
# NOTE(review): the code in this cell never reads output.aln back.
clustal_output = "output.aln"
clustalomega_cline = ClustalOmegaCommandline(
    infile=input_file,
    outfile=clustal_output,
    verbose=True,
    auto=True,
    force=True,
)
stdout, stderr = clustalomega_cline()

# Remove gaps ("-") from sequences
def remove_gaps(sequence):
    """Return ``sequence`` with every "-" gap character removed."""
    gap_char = "-"
    ungapped = sequence.replace(gap_char, "")
    return ungapped

# Filter sequences based on similarity
# Single source of truth for the cutoff (percent): it drives both the filter
# and the output file name, so the name cannot disagree with what was applied.
cutoff_percentage = 95

# Greedy pass in input order: the first member of each group of near-identical
# sequences is kept and later members are discarded.
filtered_sequences = []
for record in sequences:
    # Gaps are stripped in place, so the records in `sequences` (used by the
    # report below) are gap-free as well.
    record.seq = Seq(remove_gaps(str(record.seq)))  # Remove gaps before processing
    is_redundant = any(
        calculate_similarity(str(record.seq), str(kept.seq)) >= cutoff_percentage
        for kept in filtered_sequences
    )
    if not is_redundant:
        filtered_sequences.append(record)

# Write filtered sequences to a new FASTA file
input_base_name = os.path.splitext(os.path.basename(input_file))[0]
filtered_file = f"{input_base_name}_filtered_{cutoff_percentage}.fasta"
SeqIO.write(filtered_sequences, filtered_file, "fasta")


def _print_closest_matches(records):
    """Print, for each record, its highest similarity % to any other record.

    A record is excluded from its own comparison by position rather than by
    ID, so two distinct records that happen to share an ID are still compared
    with each other. A record with no other record to compare to prints 0.00%.
    """
    for index, record in enumerate(records):
        closest_similarity = max(
            (
                calculate_similarity(str(record.seq), str(other.seq))
                for other_index, other in enumerate(records)
                if other_index != index
            ),
            default=0,
        )
        print(f"{record.id}: {closest_similarity:.2f}%")


# Print original files with similarity percentages
print("Original files with similarity % to their closest match:")
_print_closest_matches(sequences)

# Print filtered sequences with similarity percentages
print("\nFiltered sequences with similarity % to their closest match in the filtered list:")
_print_closest_matches(filtered_sequences)

print(f"\nFiltered sequences saved to: {filtered_file}")


In [None]:
!pip install biopython
!apt-get install clustalo

import os
import numpy as np
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import AlignInfo
from Bio import AlignIO
from google.colab import files

# Ask the user for the input FASTA file via the Colab upload widget
uploaded = files.upload()

# Load protein FASTA file: the first uploaded name is used as the path
input_file = list(uploaded)[0]
sequences = list(SeqIO.parse(input_file, "fasta"))

# Run Clustal Omega on the uploaded file
clustal_output = "output.aln"
clustalomega_cline = ClustalOmegaCommandline(
    infile=input_file,
    outfile=clustal_output,
    verbose=True,
    auto=True,
    force=True,
)
stdout, stderr = clustalomega_cline()

# Parse the aligned sequences.
# NOTE(review): no output format is passed to Clustal Omega above, so reading
# the ".aln" file as "fasta" relies on the tool's default output format.
aligned_sequences = AlignIO.read(clustal_output, "fasta")

# Remove gaps from aligned sequences
def remove_gaps(sequence):
    """Return ``sequence`` with every "-" gap character removed."""
    gap_char = "-"
    ungapped = sequence.replace(gap_char, "")
    return ungapped

# One gap-free copy of every aligned record, in alignment order, keeping the
# original id and description.
cleaned_sequences = [
    SeqRecord(
        Seq(remove_gaps(str(aligned_record.seq))),
        id=aligned_record.id,
        description=aligned_record.description,
    )
    for aligned_record in aligned_sequences
]

# Compute similarity matrix from the MSA
def compute_similarity_matrix(aligned_sequences):
    """Return a symmetric matrix of pairwise fractional identities.

    Sequences are compared column by column. A column in which both sequences
    have a gap ("-") carries no information about the pair and is skipped;
    every other column counts towards the denominator. If one sequence is
    shorter, its missing tail is treated as gaps, so the result does not
    depend on the order in which the two sequences appear.

    Args:
        aligned_sequences: Iterable of records exposing a ``.seq`` attribute.
            Meaningful identities require the sequences to be aligned.

    Returns:
        A square ``numpy`` array ``m`` with ``m[i, j] == m[j, i]`` equal to
        matching columns / compared columns, as a fraction in [0, 1]. A pair
        with no comparable column (e.g. two empty sequences) gets 0.0.
    """
    from itertools import zip_longest  # local import keeps the block self-contained

    gap_char = "-"
    rows = [str(record.seq) for record in aligned_sequences]
    num_sequences = len(rows)
    similarity_matrix = np.zeros((num_sequences, num_sequences))

    for i in range(num_sequences):
        for j in range(i, num_sequences):
            compared = 0
            matches = 0
            for res1, res2 in zip_longest(rows[i], rows[j], fillvalue=gap_char):
                if res1 == gap_char and res2 == gap_char:
                    continue
                compared += 1
                if res1 == res2:
                    matches += 1
            # Guard against pairs with nothing to compare (avoids 0 / 0).
            similarity = matches / compared if compared else 0.0
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity

    return similarity_matrix

# Identity is computed position by position, so it has to be measured on the
# gapped alignment: the gap-stripped copies are no longer in register with one
# another. Row/column k still corresponds to cleaned_sequences[k] because the
# cleaned list was built from the alignment in the same order.
similarity_matrix = compute_similarity_matrix(aligned_sequences)

# Filter sequences based on similarity
cutoff_percentage = 95
cutoff = cutoff_percentage / 100

# Greedy pass in alignment order. A sequence is kept unless it reaches the
# cutoff against a sequence that is ALREADY kept, so exactly one representative
# of each group of near-identical sequences survives (rather than none).
filtered_indices = []
for i in range(len(cleaned_sequences)):
    if all(similarity_matrix[i, kept] < cutoff for kept in filtered_indices):
        filtered_indices.append(i)

filtered_sequences = [cleaned_sequences[i] for i in filtered_indices]

# Write filtered sequences to a new FASTA file
input_base_name = os.path.splitext(os.path.basename(input_file))[0]
filtered_file = f"{input_base_name}_filtered_{cutoff_percentage}.fasta"
SeqIO.write(filtered_sequences, filtered_file, "fasta")


def _print_closest_matches(indices):
    """Print, for each index, its highest similarity % to any other listed index.

    ``indices`` are positions in ``cleaned_sequences`` / ``similarity_matrix``.
    An index with no other index to compare to prints 0.00%.
    """
    for i in indices:
        closest_similarity = max(
            (similarity_matrix[i, j] * 100 for j in indices if j != i),
            default=0,
        )
        print(f"{cleaned_sequences[i].id}: {closest_similarity:.2f}%")


# Print original files with similarity percentages
print("Original files with similarity % to their closest match:")
_print_closest_matches(range(len(cleaned_sequences)))

# Print filtered sequences with similarity percentages
print("\nFiltered sequences with similarity % to their closest match in the filtered list:")
_print_closest_matches(filtered_indices)

print(f"\nFiltered sequences saved to: {filtered_file}")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
clustalo is already the newest version (1.2.4-7).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


Saving combined_filtered.fasta to combined_filtered (3).fasta
Original files with similarity % to their closest match:
EPO:JC181660_JC181660_Sequence_2_from_Patent_WO2012025577.: 9.05%
EPO:JC181662_JC181662_Sequence_4_from_Patent_WO2012025577.: 9.56%
EPO:JC519409_JC519409_Sequence_10_from_Patent_WO2014090327.: 23.18%
EPO:JC519411_JC519411_Sequence_12_from_Patent_WO2014090327.: 23.53%
EPO:JC519413_JC519413_Sequence_14_from_Patent_WO2014090327.: 23.18%
EPO:JC519415_JC519415_Sequence_16_from_Patent_WO2014090327.: 23.53%
EPO:JC519417_JC519417_Sequence_18_from_Patent_WO2014090327.: 10.40%
EPO:JC519419_JC519419_Sequence_20_from_Patent_WO2014090327.: 11.33%
EPO:JC519421_JC519421_Sequence_22_from_Patent_WO2014090327.: 99.51%
EPO:JC519423_JC519423_Sequence_24_from_Patent_WO2014090327.: 99.51%
EPO:JC519431_JC519431_Sequence_32_from_Patent_WO2014090327.: 10.37%
USPTO:AGD90175_AGD90175_Sequence_896_from_patent_US_8354262.: 99.55%
USPTO:AGD89894_AGD89894_Sequence_334_from_patent_US_8354262.: 99.55%

In [None]:
# similarity filter, multiple inputs

import os
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import pairwise2

# Function to calculate sequence similarity
def calculate_similarity(seq1, seq2):
    """Return how similar seq1 is to seq2, as a percentage of seq1's length.

    The score comes from ``pairwise2.align.globalxx`` (a global alignment;
    see the Biopython documentation for its scoring scheme) and is divided
    by ``len(seq1)``. The result is therefore NOT symmetric:
    ``calculate_similarity(a, b)`` and ``calculate_similarity(b, a)`` differ
    whenever the two sequences have different lengths.

    Args:
        seq1: Query sequence (string); its length is the denominator.
        seq2: Sequence to compare against (string).

    Returns:
        The similarity as a number in percent, or 0.0 if either sequence
        is empty.
    """
    # An empty sequence cannot be aligned and would otherwise cause an
    # IndexError (no alignments returned) or a division by zero.
    if not seq1 or not seq2:
        return 0.0
    # Only the best score is needed; score_only=True avoids materialising
    # every equally-scoring alignment, which can be very numerous.
    alignment_score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    return (alignment_score / len(seq1)) * 100

# Function to filter sequences in a single file
def filter_sequences_in_file(input_file, threshold=95):
    """Align one FASTA file with Clustal Omega and drop near-duplicate sequences.

    Sequences are visited in alignment order; one is kept unless its
    column-wise identity to an already-kept sequence reaches ``threshold``.
    The kept records are written to
    ``<input base name>_filtered_<threshold>.fasta`` in the working directory.
    They are the aligned records, written as read from the alignment file.

    Args:
        input_file: Path to the FASTA file to process.
        threshold: Identity cutoff in percent (default 95).

    Returns:
        A tuple ``(filtered_file, sequences, aligned_sequences,
        filtered_sequences)``: the output path, the records as read from
        ``input_file``, the aligned records, and the kept subset of those.

    Side effects:
        Overwrites ``temp.fasta`` and ``aligned.aln`` in the working
        directory whenever an alignment is run.
    """
    sequences = list(SeqIO.parse(input_file, "fasta"))

    if len(sequences) < 2:
        # Clustal Omega has nothing to align with fewer than two sequences
        # and exits with an error, so such files are passed through as-is.
        aligned_sequences = list(sequences)
    else:
        # Write sequences to a temporary file for MSA
        temp_input_file = "temp.fasta"
        SeqIO.write(sequences, temp_input_file, "fasta")

        # Perform Clustal Omega alignment
        clustal_output = "aligned.aln"
        clustalomega_cline = ClustalOmegaCommandline(infile=temp_input_file, outfile=clustal_output, verbose=True, auto=True, force=True)
        stdout, stderr = clustalomega_cline()

        # Read the aligned sequences
        aligned_sequences = list(SeqIO.parse(clustal_output, "fasta"))

    def _column_identity(seq_a, seq_b):
        """Return the fraction of identical positions over the shorter length."""
        compared = min(len(seq_a), len(seq_b))
        if compared == 0:
            # Nothing to compare; avoid dividing by zero.
            return 0.0
        return sum(a == b for a, b in zip(seq_a, seq_b)) / compared

    # Filter sequences based on similarity
    cutoff = threshold / 100
    filtered_sequences = []
    for record in aligned_sequences:
        if all(_column_identity(record.seq, kept.seq) < cutoff for kept in filtered_sequences):
            filtered_sequences.append(record)

    # Write filtered sequences to a new FASTA file
    input_base_name = os.path.splitext(os.path.basename(input_file))[0]
    filtered_file = f"{input_base_name}_filtered_{threshold}.fasta"
    SeqIO.write(filtered_sequences, filtered_file, "fasta")
    return filtered_file, sequences, aligned_sequences, filtered_sequences




# Prepend the /content/ path to each file in new_files
#content_dir = '/content/'
#new_files = [os.path.join(content_dir, f) for f in new_files]

# Process each file independently
for input_file in new_files:
    filtered_file, original_sequences, aligned_sequences, filtered_sequences = filter_sequences_in_file(input_file)

    # The same closest-match report is produced twice: once for every aligned
    # sequence and once for the subset that survived filtering.
    report_sections = (
        (
            f"Original sequences in {input_file} with similarity % to their closest match:",
            aligned_sequences,
        ),
        (
            f"\nFiltered sequences in {filtered_file} with similarity % to their closest match in the filtered list:",
            filtered_sequences,
        ),
    )
    for heading, records in report_sections:
        print(heading)
        for record in records:
            # Best column-wise identity (in percent) against every record
            # with a different id; 0 when there is nothing to compare to.
            closest_similarity = max(
                (
                    sum(a == b for a, b in zip(record.seq, other_record.seq))
                    / min(len(record.seq), len(other_record.seq))
                    * 100
                    for other_record in records
                    if other_record.id != record.id
                ),
                default=0,
            )
            print(f"{record.id}: {closest_similarity:.2f}%")

    print(f"\nFiltered sequences saved to: {filtered_file}")


Original sequences in /content/combined_filtered (2)_filtered_node1_branch1_node2_branch1.fasta with similarity % to their closest match:
USPTO_AGD89889_AGD89889_Sequence_324_from_patent_US_8354262.: 98.65%
USPTO_AGD89907_AGD89907_Sequence_360_from_patent_US_8354262.: 98.21%
USPTO_AGD90029_AGD90029_Sequence_604_from_patent_US_8354262.: 98.21%
USPTO_AGD90058_AGD90058_Sequence_662_from_patent_US_8354262.: 98.65%
USPTO_AGD90056_AGD90056_Sequence_658_from_patent_US_8354262.: 98.65%
USPTO_AGD90059_AGD90059_Sequence_664_from_patent_US_8354262.: 98.65%
USPTO_AGD90063_AGD90063_Sequence_672_from_patent_US_8354262.: 98.65%
USPTO_AGD90048_AGD90048_Sequence_642_from_patent_US_8354262.: 99.10%
USPTO_AGD90046_AGD90046_Sequence_638_from_patent_US_8354262.: 99.10%
USPTO_AGD90033_AGD90033_Sequence_612_from_patent_US_8354262.: 98.21%
USPTO_AGD90032_AGD90032_Sequence_610_from_patent_US_8354262.: 98.21%
USPTO_AGD90035_AGD90035_Sequence_616_from_patent_US_8354262.: 97.76%
USPTO_AGD90050_AGD90050_Sequence_6