In [25]:
# Imports the necessary packages to allow the code to run

!pip install biopython

from Bio import Entrez
from Bio.Seq import Seq
from Bio.SeqUtils import nt_search

# 1. Function to fetch sequence from NCBI
def fetch_sequence_from_ncbi(accession_number): # Requires input of the accession number of interest
    """Fetch a nucleotide record from NCBI and return its sequence in uppercase.

    The user is prompted for an email address, which is stored in
    ``Entrez.email`` before the request is made. The record is requested from
    the ``nucleotide`` database in FASTA text format; the first line (the
    FASTA header) is dropped and the remaining lines are joined into a single
    string.

    Args:
        accession_number: Accession identifier passed to ``Entrez.efetch``
            as ``id`` (e.g. ``"NM_001301717"``).

    Returns:
        The sequence as an uppercase string with no line breaks (this is an
        empty string if the record has no sequence lines), or ``None`` if any
        exception was raised while fetching or parsing. In the error case a
        message describing the exception is printed.
    """
    Entrez.email = input("What email address can be used for this search? ")  # NCBI requires an email address
    try:
        # Fetch the sequence data from NCBI using Entrez
        handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text")
        try:
            record = handle.read()
        finally:
            # Always release the connection, even if reading the response fails
            handle.close()

        # Drop the FASTA header (first line) and join the remaining lines.
        # splitlines() plus strip() copes with "\r\n" line endings and any
        # stray whitespace that would otherwise end up inside the sequence.
        sequence_lines = record.splitlines()[1:]
        sequence = ''.join(line.strip() for line in sequence_lines)
        return sequence.upper() # Return standardised uppercase sequence

    except Exception as e:
        # Deliberately broad: this is the user-facing boundary, so report the
        # problem and let the caller handle the missing sequence.
        print(f"Error fetching sequence from NCBI: {e}") # Tells the user what error occurred which prevented the code from running
        return None

# 2. Function to search for a primer in the sequence
def find_primer_in_sequence(sequence, primer): # Requires the sequence which was output from the above function and the primer of interest
    """Return the 1-based start positions of every occurrence of a primer.

    The comparison is case-insensitive (both inputs are uppercased) and
    surrounding whitespace on the primer is ignored, so text typed at an
    ``input()`` prompt can be passed in directly. Overlapping occurrences are
    all reported, e.g. ``"AA"`` in ``"AAAA"`` gives ``[1, 2, 3]``.

    Args:
        sequence: The sequence string to search in.
        primer: The primer string to search for.

    Returns:
        A list of 1-based positions in ascending order. The list is empty if
        the primer does not occur, or if the primer is empty/whitespace only.
    """
    primer = primer.strip().upper() # Standardise the primer: drop stray whitespace and use upper case
    if not primer:
        # str.find("") matches at every index, which would report a bogus hit
        # at every position of the sequence; an empty primer matches nothing.
        return []
    sequence = sequence.upper() # Make sure both sides of the comparison use the same case

    # Find all occurrences of primer in sequence
    positions = [] # Collects the 1-based position of each occurrence
    pos = sequence.find(primer)
    while pos != -1: # find() returns -1 once there are no further occurrences
        positions.append(pos + 1) # Add 1 to convert to 1-based position
        # Resume one base after the last hit (not after its end) so that
        # overlapping occurrences are also found
        pos = sequence.find(primer, pos + 1)
    return positions # Returns a list of the positions of all occurrences of the primer in the sequence

# 3. Main function
def main():
    """Interactively look up a primer in an NCBI nucleotide record.

    Prompts for an accession number and a primer sequence, fetches the record
    with fetch_sequence_from_ncbi, and prints a preview of the sequence
    followed by the positions and number of primer matches found by
    find_primer_in_sequence. Prints a message instead if the sequence could
    not be fetched or the primer does not occur.
    """
    # Collect both search terms up front; the example values in the prompts
    # give one single-hit primer (AACCGT) and one multi-hit primer (GGGC)
    accession_number = input("Which accession number would you like to search for? Eg. NM_001301717 :")
    primer_sequence = input("What is the primer sequence you would like to search for? Eg. AACCGT, GGGC :")

    # Function 1: download the sequence (None or empty means nothing usable came back)
    sequence = fetch_sequence_from_ncbi(accession_number)
    if not sequence:
        print("Could not fetch the sequence.")
        return

    # Show only the first 100 bases so the user can sanity-check the record
    preview = sequence[:100]
    print(f"Fetched Sequence: {preview}...")

    # Function 2: locate every occurrence of the primer
    hits = find_primer_in_sequence(sequence, primer_sequence)
    if not hits:
        print("Primer not found in the sequence.")
        return

    # Report where the primer occurs and how many times
    print(f"Primer found at positions: {hits}")
    print("primer found", len(hits), "times in sequence.")

# Script entry point: only start the interactive search when this file is run
# directly. When the file is imported as a module nothing runs automatically,
# which allows the functions above to be reused elsewhere.
if __name__ == "__main__":
    main()  # main() in turn calls fetch_sequence_from_ncbi and find_primer_in_sequence





Which accession number would you like to search for? Eg. NM_001301717 : NM_001301717
What is the primer sequence you would like to search for? Eg. AACCGT, GGGC : GGGC
What email address can be used for this search?  j.anderson@rothamsted.ac.uk


Fetched Sequence: CTCTAGATGAGTCAGTGGAGGGCGGGTGGAGCGTTGAACCGTGAAGAGTGTGGTTGGGCGTAAACGTGGACTTAAACTCAGGAGCTAAGGGGGAAACCAA...
Primer found at positions: [20, 56, 294, 303, 310, 422, 621, 1113, 1418, 1530, 1553, 1913, 1990]
primer found 13 times in sequence.
