This script takes a fasta file containing a full (or partial) genome, and searches for ORFs containing keywords and/ or of a specific length. It then prints the seq IDs that pass the filters and lists the number and size of any introns present. Hits are also saved as a fasta.

In [None]:
from Bio import SeqIO
import re

def extract_sequences_and_introns(fasta_file, output_file, min_length, max_length, keyword_filter=None):
    """
    Extract sequences within a size range, filter by keyword, determine presence of introns, and print details.

    Args:
        fasta_file (str): Path to the input FASTA file.
        output_file (str): Path to the output FASTA file for filtered sequences.
        min_length (int): Minimum sequence length (inclusive).
        max_length (int): Maximum sequence length (inclusive).
        keyword_filter (str): Keyword to filter sequences by description (optional).
    """
    count = 0  # Count of sequences in the range

    with open(fasta_file, "r") as input_handle, open(output_file, "w") as output_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            if keyword_filter and keyword_filter.lower() not in record.description.lower():
                # Skip sequences that do not contain the keyword
                continue

            seq_length = len(record.seq)
            # Check if the sequence is within the range
            if min_length <= seq_length <= max_length:
                SeqIO.write(record, output_handle, "fasta")
                count += 1

                # Parse genomic location from the header
                match = re.search(r'\[location=([^\]]+)\]', record.description)
                if match:
                    location_field = match.group(1)

                    if "join" in location_field:
                        # Extract exon coordinates from "join"
                        exon_coords = re.findall(r'(\d+)\.\.(\d+)', location_field)
                        exon_lengths = [int(end) - int(start) + 1 for start, end in exon_coords]

                        # Calculate intron sizes
                        intron_sizes = []
                        for i in range(len(exon_coords) - 1):
                            end_of_prev = int(exon_coords[i][1])
                            start_of_next = int(exon_coords[i + 1][0])
                            intron_sizes.append(start_of_next - end_of_prev - 1)

                        # Print results
                        total_exon_length = sum(exon_lengths)
                        total_intron_length = sum(intron_sizes)
                        print(f"Sequence {record.id}:")
                        print(f"  Exons: {len(exon_coords)}, Total Exon Length: {total_exon_length} bp")
                        print(f"  Introns: {len(intron_sizes)}, Total Intron Length: {total_intron_length} bp")
                        print(f"  Intron Sizes: {intron_sizes}")
                    else:
                        print(f"Sequence {record.id} has no introns.")
                else:
                    print(f"Genomic location not found in description for sequence {record.id}.")

    print(f"\nNumber of sequences in range {min_length}-{max_length} bp: {count}")
    print(f"Filtered sequences saved to {output_file}")

# Parameters
input_fasta = r"C:\Users\henry\Downloads\ncbi_dataset\ncbi_dataset\data\GCA_000721775.1\cds_from_genomic.fna"  # Replace with your input FASTA file
output_fasta = r"C:\Users\henry\Downloads\ncbi_dataset\ncbi_dataset\data\GCA_000721775.1\cds_from_genomic_filter.fna"  # Replace with desired output file
min_bp = 4570  # Minimum sequence length
max_bp = 4580  # Maximum sequence length
keyword_filter = None # Replace with your desired keyword, or set to None for no filtering

# Run the function
extract_sequences_and_introns(input_fasta, output_fasta, min_bp, max_bp, keyword_filter)


Sequence lcl|KL584824.1_cds_KEQ66995.1_57:
  Exons: 5, Total Exon Length: 4572 bp
  Introns: 4, Total Intron Length: 210 bp
  Intron Sizes: [50, 54, 56, 50]
Sequence lcl|KL584837.1_cds_KEQ61572.1_6331:
  Exons: 5, Total Exon Length: 4578 bp
  Introns: 4, Total Intron Length: 198 bp
  Intron Sizes: [47, 52, 52, 47]
Sequence lcl|KL584838.1_cds_KEQ61279.1_6553:
  Exons: 2, Total Exon Length: 4359 bp
  Introns: 1, Total Intron Length: 59 bp
  Intron Sizes: [59]

Number of sequences in range 4570-4580 bp: 3
Filtered sequences saved to C:\Users\henry\Downloads\ncbi_dataset\ncbi_dataset\data\GCA_000721775.1\cds_from_genomic_filter.fna


In [None]:
from Bio import SeqIO
import re

input_file = r"C:\Users\henry\Downloads\ncbi_dataset\ncbi_dataset\data\GCA_000721775.1\cds_from_genomic.fna"  # Replace with your input FASTA file
output_file = r"C:\Users\henry\Downloads\ncbi_dataset\ncbi_dataset\data\GCA_000721775.1\cds_from_genomic_filter2.fna"  # Replace with desired output file

maximum_length = 2000
minimum_length = 1000

keyword = "pullulan"
count = 0
with open(input_file, "r") as input_handle, open (output_file, "w") as output_handle:
    for sequence in SeqIO.parse(input_file, "fasta"):
        seq_len = len(sequence)
            
        if keyword not in sequence.description and sequence.__len__ < maximum_length and sequence.__len__ > minimim_length:
            continue
        else:
            print(sequence.description)
            SeqIO.write(sequence, output_handle, "fasta")
            count = count + 1

print ( f"Search complete, {count} sequence descriptors found containing the keyword \"{keyword}\"")


TypeError: '<' not supported between instances of 'method' and 'int'

In [23]:
input_file = r"C:\Users\henry\Downloads\ncbi_dataset\ncbi_dataset\data\GCA_000721775.1\cds_from_genomic.fna"  # Replace with your input FASTA file
seq_records = SeqIO.parse(input_file, "fasta")
seq_record = next(seq_records)
attributes = dir(seq_record)
print(attributes)

['_AnnotationsDict', '_AnnotationsDictValue', '__add__', '__annotations__', '__bool__', '__bytes__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_per_letter_annotations', '_seq', '_set_per_letter_annotations', '_set_seq', 'annotations', 'count', 'dbxrefs', 'description', 'features', 'format', 'id', 'islower', 'isupper', 'letter_annotations', 'lower', 'name', 'reverse_complement', 'seq', 'translate', 'upper']
