
This script filters FASTA sequences by length and specific text criteria, removes duplicates, and outputs the results in FASTA and CSV formats, providing detailed feedback on the filtering process.

In [None]:
import csv
from google.colab import files
!pip install biopython
from Bio import SeqIO
import os

In [None]:
# Define subroutines

def parse_fasta(input_file):
    """Read a FASTA file into a dict mapping header text to sequence.

    The header is everything after the leading ``>`` on a header line
    (surrounding whitespace stripped). All following non-header lines are
    stripped and concatenated to form that record's sequence.

    Text that appears before the first header is ignored. If the same header
    occurs more than once, the later record's sequence replaces the earlier one.

    Args:
        input_file: Path of the FASTA file to read.

    Returns:
        dict: ``{header: sequence}`` in order of first appearance.
    """
    records = {}
    header = None
    chunks = []
    with open(input_file, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                # Sequence data (or a blank line): stash it for the open record.
                chunks.append(text)
                continue
            # A new header closes the record that was being collected, if any.
            if header is not None:
                records[header] = ''.join(chunks)
            header = text[1:]
            chunks = []
    # Flush the final record, which has no following header to close it.
    if header is not None:
        records[header] = ''.join(chunks)
    return records

# Run filters: length, custom text, duplicates
def filter_sequences(sequences, min_length=200, remove_texts=None):
    """Filter sequences by ID text, minimum length and redundancy.

    The filters run in this order:

    1. Records whose ID contains any string in ``remove_texts`` are dropped.
       These are not included in either of the "removed" counters.
    2. Records shorter than ``min_length`` are dropped and counted as removed
       by length.
    3. Among the records that passed steps 1 and 2, a record is dropped as a
       duplicate when its sequence is contained in a strictly longer sequence,
       or is identical to a sequence that appears earlier in ``sequences``.
       The first copy of a set of identical sequences is therefore kept.

    For every kept record whose ID contains ``"patent"``, the text following
    the first occurrence of ``"patent"`` (stripped) is collected.

    Args:
        sequences: Mapping of sequence ID to sequence string.
        min_length: Minimum sequence length to keep (inclusive).
        remove_texts: Optional iterable of substrings; IDs containing any of
            them are excluded. ``None`` or an empty list disables this filter.

    Returns:
        tuple: ``(filtered_sequences, removed_duplicates_count,
        removed_length_count, remaining_count, original_count,
        unique_patent_sequence_names)`` where ``filtered_sequences`` is a dict
        in input order and ``unique_patent_sequence_names`` is a set.
    """
    original_count = len(sequences)

    # Pass 1: apply the ID-text and length filters, remembering input order.
    removed_length = []
    candidates = []
    for seq_id, seq in sequences.items():
        if remove_texts and any(text in seq_id for text in remove_texts):
            continue  # Skip sequences containing any of the specified texts
        if len(seq) < min_length:
            removed_length.append(seq_id)
            continue
        candidates.append((seq_id, seq))

    # Pass 2: drop redundant sequences. Comparing only against candidates
    # avoids discarding a record as a "duplicate" of one that was itself
    # filtered out above.
    filtered_sequences = {}
    removed_duplicates = []
    unique_patent_sequence_names = set()
    for index, (seq_id, seq) in enumerate(candidates):
        # The length/position test runs first because it is cheap. It also
        # makes sure exactly one of several identical sequences survives
        # (only an *earlier* equal-length sequence can mark this one).
        is_duplicate = any(
            (len(other_seq) > len(seq) or other_index < index)
            and seq in other_seq
            for other_index, (_, other_seq) in enumerate(candidates)
        )
        if is_duplicate:
            removed_duplicates.append(seq_id)
            continue
        filtered_sequences[seq_id] = seq
        if "patent" in seq_id:
            _, seq_name = seq_id.split("patent", 1)
            unique_patent_sequence_names.add(seq_name.strip())

    remaining_count = len(filtered_sequences)
    removed_duplicates_count = len(removed_duplicates)
    removed_length_count = len(removed_length)
    return filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names

# Write output fasta and csv files

def write_fasta(output_file, sequences):
    """Write ``sequences`` to ``output_file`` as FASTA records.

    Each record is written as a ``>`` header line holding the ID, followed by
    the whole sequence on a single line. An existing file is overwritten.

    Args:
        output_file: Destination path.
        sequences: Mapping of sequence ID to sequence string.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(
            f'>{name}\n{residues}\n' for name, residues in sequences.items()
        )

def write_csv(output_file, sequences):
    """Write ``sequences`` to ``output_file`` as a two-column CSV.

    The first row is the header ``Sequence ID,Sequence``; each following row
    holds one ID and its sequence. An existing file is overwritten.

    Args:
        output_file: Destination path.
        sequences: Mapping of sequence ID to sequence string.
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='') as handle:
        table = csv.writer(handle)
        table.writerow(["Sequence ID", "Sequence"])
        table.writerows([name, residues] for name, residues in sequences.items())

#def filter_and_save(input_file, output_file, remove_texts=None):
#    sequences = parse_fasta(input_file)
#    filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names = filter_sequences(sequences, remove_texts=remove_texts)

def filter_and_save(input_file, remove_texts=None, min_length=200):
    """Filter a FASTA file and write the survivors as FASTA and CSV.

    The input is parsed with ``parse_fasta`` and filtered with
    ``filter_sequences``. Results are written next to the input as
    ``<input stem>_filtered.fasta`` and ``<input stem>_filtered.csv``.
    Colons in sequence IDs are replaced with underscores before writing.

    Args:
        input_file: Path of the FASTA file to filter.
        remove_texts: Optional list of substrings; sequences whose ID contains
            any of them are excluded.
        min_length: Minimum sequence length to keep, forwarded to
            ``filter_sequences``. Defaults to 200, the value previously
            applied implicitly.

    Returns:
        tuple: ``(original_count, removed_duplicates_count,
        removed_length_count, remaining_count, unique_patent_sequence_names)``
        as reported by ``filter_sequences``.
    """
    sequences = parse_fasta(input_file)
    (
        filtered_sequences,
        removed_duplicates_count,
        removed_length_count,
        remaining_count,
        original_count,
        unique_patent_sequence_names,
    ) = filter_sequences(sequences, min_length=min_length, remove_texts=remove_texts)

    output_file_base = os.path.splitext(input_file)[0]
    fasta_output_file = f"{output_file_base}_filtered.fasta"
    csv_output_file = f"{output_file_base}_filtered.csv"

    # Replace colons in sequence IDs with underscores.
    # NOTE(review): IDs that differ only by ':' vs '_' collapse to one key here,
    # so fewer records than `remaining_count` could be written in that case.
    filtered_sequences = {seq_id.replace(':', '_'): seq for seq_id, seq in filtered_sequences.items()}

    write_fasta(fasta_output_file, filtered_sequences)
    write_csv(csv_output_file, filtered_sequences)

    return original_count, removed_duplicates_count, removed_length_count, remaining_count, unique_patent_sequence_names



In [None]:
# Main code

if __name__ == "__main__":
    # Interactive entry point: upload a FASTA file, ask which ID texts to
    # exclude, run the filter, then report counts and patent search links.
    uploaded = files.upload()
    if not uploaded:
        # Presumably an empty mapping means the upload dialog was cancelled;
        # indexing the first key would raise IndexError, so stop cleanly.
        print("No file was uploaded; nothing to filter.")
    else:
        input_file = next(iter(uploaded))  # Use the first uploaded file
        remove_texts_input = input("Enter comma-separated list of texts to remove from sequence IDs (leave blank if none): ").split(',')
        # Drop empty entries so a blank answer disables the text filter.
        remove_texts = [text.strip() for text in remove_texts_input if text.strip()]

        original_count, removed_duplicates_count, removed_length_count, remaining_count, unique_patent_sequence_names = filter_and_save(input_file, remove_texts)

        # Print results and URLs
        print("Number of original sequences:", original_count)
        print("Number removed based on identical sequences:", removed_duplicates_count)
        print("Number removed based on length:", removed_length_count)
        print("Number remaining:", remaining_count)
        print("List of unique patent numbers:")
        # Sorted so the report is identical from run to run (set order is not).
        for patent_number in sorted(unique_patent_sequence_names):
            compact_number = patent_number.replace(' ', '')
            print(compact_number)
            patent_url = f"https://patents.google.com/?q={compact_number}"
            print(f"Patent: {patent_number} - URL: {patent_url}")


Saving clean_.fasta to clean_.fasta
Enter comma-separated list of texts to remove from sequence IDs (leave blank if none): AGD90029  AGD90056  AGD90059  AGD90058  AGD90034  AGD90043  AGD90063  AGD90048  AGD90046  AGD90026  AGD90033  AGD90032  AGD90035  AGD90050  AGD90021  AGD90042  AGD90038  AGD90015  AGD89883  AGD90037  AGD89889  AGD90020  AGD90039  AGD89900  AGD89875  AGD89901  AGD89891  AGD89864  AGD90027  AGD89894  AGD89893  AGD90030  AGD90051  AGD90012  AGD90017  AGD90016  AGD90024  AGD90025  AGD90022  AGD90014  AGD90040  AGD90019  AGD90036  AGD90013  AGD90049  AGD90065  AGD90023  AGD90028  AGD90011  AGD90010  AGD89980  AGD89994  AGD89995  AGD90164  AGD90181  AGD90230  AGD90232
clean__filtered.fasta
Number of original sequences: 679
Number removed based on identical sequences: 0
Number removed based on length: 0
Number remaining: 679
List of unique patent numbers:
