# Gene Information

In [None]:
# Import the Entrez module from Biopython for accessing NCBI databases
from Bio import Entrez

# Function to fetch genomic sequence using the provided accession number
def fetch_genomic_sequence(accession):
    """Fetch a record from the NCBI "nuccore" database as FASTA text.

    Args:
        accession: Accession number of the genomic sequence to fetch. It is
            passed unchanged as the ``id`` argument of ``Entrez.efetch``.

    Returns:
        Whatever ``handle.read()`` yields for the response, i.e. the
        FASTA-formatted record requested with ``rettype="fasta"`` and
        ``retmode="text"``.

    Raises:
        ValueError: If ``accession`` is empty; no request is sent in that case.
    """
    if not accession:
        # Fail fast instead of sending a request that cannot identify a record.
        raise ValueError("accession must be a non-empty identifier")

    # Set the email address to allow NCBI to contact you in case of any issues
    Entrez.email = "kjglover4585@gmail.com"  # Replace with your email address

    handle = Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text")
    try:
        return handle.read()  # Return the fetched genomic sequence as a string
    finally:
        # Close the handle even when reading the response raises.
        handle.close()

# Example usage: fetch one record from "nuccore" and print the raw FASTA text.
# NOTE(review): "NC_000001" looks like a whole-chromosome accession; if so the response
# may be very large and printing all of it could overwhelm the notebook — confirm.
genomic_accession = "NC_000001"  # Replace with the accession number of your genomic sequence of interest
# `sequence` is a notebook-level name that other cells in this file also assign.
sequence = fetch_genomic_sequence(genomic_accession)  # Call the function with the accession number to retrieve the sequence
print(sequence)  # Print the retrieved genomic sequence


# Protein Sequence

In [None]:
# Import the Entrez module from Biopython for accessing NCBI databases
from Bio import Entrez

# Function to fetch protein sequence using the provided accession number
def fetch_protein_sequence(accession):
    """Fetch a record from the NCBI "protein" database as FASTA text.

    Args:
        accession: Accession number of the protein to fetch. It is passed
            unchanged as the ``id`` argument of ``Entrez.efetch``.

    Returns:
        Whatever ``handle.read()`` yields for the response, i.e. the
        FASTA-formatted record requested with ``rettype="fasta"`` and
        ``retmode="text"``.

    Raises:
        ValueError: If ``accession`` is empty; no request is sent in that case.
    """
    if not accession:
        # Fail fast instead of sending a request that cannot identify a record.
        raise ValueError("accession must be a non-empty identifier")

    # Set the email address to allow NCBI to contact you in case of any issues
    Entrez.email = "kjglover4585@gmail.com"  # Replace with your email address

    handle = Entrez.efetch(db="protein", id=accession, rettype="fasta", retmode="text")
    try:
        return handle.read()  # Return the fetched protein sequence as a string
    finally:
        # Close the handle even when reading the response raises.
        handle.close()

# Example usage: fetch one record from the "protein" database and print the raw FASTA text.
protein_accession = "P01308"  # Replace with the accession number of your protein of interest
# `sequence` is a notebook-level name that other cells in this file also assign,
# so it holds the result of whichever cell ran last.
sequence = fetch_protein_sequence(protein_accession)  # Call the function with the accession number to retrieve the sequence
print(sequence)  # Print the retrieved protein sequence


# mRNA Sequence

In [None]:
# Import the Entrez module from Biopython for accessing NCBI databases
from Bio import Entrez

# Function to fetch mRNA sequence using the provided accession number
def fetch_mrna_sequence(accession):
    """Fetch a record from the NCBI "nucleotide" database as FASTA text.

    Args:
        accession: Accession number of the mRNA to fetch. It is passed
            unchanged as the ``id`` argument of ``Entrez.efetch``.

    Returns:
        Whatever ``handle.read()`` yields for the response, i.e. the
        FASTA-formatted record requested with ``rettype="fasta"`` and
        ``retmode="text"``.

    Raises:
        ValueError: If ``accession`` is empty; no request is sent in that case.
    """
    if not accession:
        # Fail fast instead of sending a request that cannot identify a record.
        raise ValueError("accession must be a non-empty identifier")

    # Set the email address to allow NCBI to contact you in case of any issues
    Entrez.email = "kjglover4585@gmail.com"  # Replace with your email address

    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        return handle.read()  # Return the fetched mRNA sequence as a string
    finally:
        # Close the handle even when reading the response raises.
        handle.close()

# Example usage: fetch one record from the "nucleotide" database and print the raw FASTA text.
# NOTE(review): "NM_001123456" may be a placeholder rather than a real accession — confirm
# it resolves at NCBI before relying on this example.
mrna_accession = "NM_001123456"  # Replace with the accession number of your mRNA of interest
# `sequence` is a notebook-level name that other cells in this file also assign.
sequence = fetch_mrna_sequence(mrna_accession)  # Call the function with the accession number to retrieve the sequence
print(sequence)  # Print the retrieved mRNA sequence

# Genomic Sequence

In [None]:
# Import the Entrez module from Biopython for accessing NCBI databases
from Bio import Entrez

# Function to fetch genomic sequence using the provided accession number
def fetch_genomic_sequence(accession):
    """Fetch a record from the NCBI "nuccore" database as FASTA text.

    Args:
        accession: Accession number of the genomic sequence to fetch. It is
            passed unchanged as the ``id`` argument of ``Entrez.efetch``.

    Returns:
        Whatever ``handle.read()`` yields for the response, i.e. the
        FASTA-formatted record requested with ``rettype="fasta"`` and
        ``retmode="text"``.

    Raises:
        ValueError: If ``accession`` is empty; no request is sent in that case.
    """
    if not accession:
        # Fail fast instead of sending a request that cannot identify a record.
        raise ValueError("accession must be a non-empty identifier")

    # Set the email address to allow NCBI to contact you in case of any issues
    Entrez.email = "kjglover4585@gmail.com"  # Replace with your email address

    handle = Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text")
    try:
        return handle.read()  # Return the fetched genomic sequence as a string
    finally:
        # Close the handle even when reading the response raises.
        handle.close()

# Example usage: fetch one record from "nuccore" and print the raw FASTA text.
# This cell repeats the example from the "Gene Information" section with the same accession.
# NOTE(review): "NC_000001" looks like a whole-chromosome accession; if so the response
# may be very large and printing all of it could overwhelm the notebook — confirm.
genomic_accession = "NC_000001"  # Replace with the accession number of your genomic sequence of interest
# `sequence` is a notebook-level name that other cells in this file also assign.
sequence = fetch_genomic_sequence(genomic_accession)  # Call the function with the accession number to retrieve the sequence
print(sequence)  # Print the retrieved genomic sequence


# PDB Structure Retrieval

In [None]:
# Import the PDBList class from Biopython for accessing the Protein Data Bank (PDB)
from Bio.PDB import PDBList

# Function to fetch PDB structure using the provided PDB ID
def fetch_pdb_structure(pdb_id):
    """Download a PDB-format structure into the current directory.

    Args:
        pdb_id: PDB ID of the structure to retrieve (e.g. "1CRN"). It is
            lowercased when building the returned filename.

    Returns:
        The filename of the downloaded structure, in the form "pdbXXXX.ent"
        where XXXX is the PDB ID in lowercase, relative to the current
        directory.

    Raises:
        ValueError: If ``pdb_id`` is empty; nothing is downloaded in that case.
        FileNotFoundError: If the expected file is not present after the
            retrieval call, so a failed download is not reported as success.
    """
    import os  # Local import: only needed for the post-download existence check

    if not pdb_id:
        raise ValueError("pdb_id must be a non-empty PDB identifier")

    # Create a PDBList object to interact with the PDB database
    pdbl = PDBList()

    # 'file_format="pdb"' requests PDB format (structure data);
    # 'pdir="./"' saves the downloaded file in the current directory.
    pdbl.retrieve_pdb_file(pdb_id, file_format="pdb", pdir="./")

    # The downloaded PDB file will have a filename in the format "pdbXXXX.ent"
    pdb_filename = f"pdb{pdb_id.lower()}.ent"

    # NOTE(review): retrieve_pdb_file appears to report a failed download by
    # printing a message rather than raising (verify for your Biopython
    # version), so confirm the file really exists before returning its name.
    if not os.path.isfile(pdb_filename):
        raise FileNotFoundError(
            f"PDB structure {pdb_id} was not downloaded; expected file {pdb_filename}"
        )

    return pdb_filename  # Return the filename of the downloaded PDB structure

# Example usage: download one structure and report the filename returned by fetch_pdb_structure.
pdb_id = "1CRN"  # Replace with the PDB ID of the structure you want to retrieve
pdb_filename = fetch_pdb_structure(pdb_id)  # Call the function with the PDB ID to retrieve the structure
# The message below echoes the returned filename; it does not itself check that the file exists.
print(f"PDB structure {pdb_id} downloaded as {pdb_filename}")  # Print the filename of the downloaded PDB structure


# Querying Database

In [None]:
# Import necessary modules from Biopython
from Bio import Entrez, Medline, SeqIO

# Standard-library helper used below so every Entrez handle is closed, even on error
from contextlib import closing

# Set your email address for NCBI Entrez access
Entrez.email = "kjglover4585@gmail.com"

# Use Entrez to retrieve information about available databases
with closing(Entrez.einfo()) as handle:
    db_info = Entrez.read(handle)
print(db_info)

# Search for nucleotide sequences with gene name "CRT" from "Plasmodium falciparum"
search_term = 'CRT[Gene Name] AND "Plasmodium falciparum"[Organism]'
with closing(Entrez.esearch(db="nucleotide", term=search_term)) as handle:
    rec_list = Entrez.read(handle)

# If there are more results than the default limit, repeat the search asking for all of them
if int(rec_list['RetMax']) < int(rec_list['Count']):
    with closing(Entrez.esearch(db="nucleotide", term=search_term,
                                retmax=rec_list['Count'])) as handle:
        rec_list = Entrez.read(handle)

# Retrieve the list of matching sequence IDs
id_list = rec_list['IdList']
if not id_list:
    # Nothing to fetch; stop here rather than sending an efetch request with no IDs.
    raise LookupError(f"No nucleotide records matched: {search_term}")

# Fetch the records corresponding to the retrieved IDs in GenBank format.
# retmode is given explicitly so the response is the plain-text flat file SeqIO's 'gb' parser reads.
with closing(Entrez.efetch(db='nucleotide', id=id_list, rettype='gb', retmode='text',
                           retmax=rec_list['Count'])) as hdl:
    recs = list(SeqIO.parse(hdl, 'gb'))

# Search for a specific sequence with name "KM288867"
target_name = 'KM288867'
for rec in recs:
    if rec.name == target_name:
        break
else:
    # Without this, `rec` would silently be the last record fetched (or undefined
    # when nothing was fetched) and the analysis below would describe the wrong sequence.
    raise LookupError(f"Record {target_name} was not found among {len(recs)} fetched records")
print(rec.name)
print(rec.description)

# Analyze sequence features
for feature in rec.features:
    if feature.type == 'gene':
        # .get() prints None instead of raising if a gene feature lacks a 'gene' qualifier
        print(feature.qualifiers.get('gene'))
    elif feature.type == 'exon':
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
    else:
        print(f'not processed:\n{feature}')

# Print annotations of the retrieved sequence
for name, value in rec.annotations.items():
    print(f'{name}={value}')

# Print the length of the sequence
print(len(rec.seq))

# Retrieve and display the references associated with the sequence
# (an empty list is used if the record carries no 'references' annotation)
refs = rec.annotations.get('references', [])
print(refs)
for ref in refs:
    if ref.pubmed_id != '':
        print(ref.pubmed_id)
        with closing(Entrez.efetch(db="pubmed", id=[ref.pubmed_id],
                                   rettype="medline", retmode="text")) as handle:
            # Iterate inside the `with` block so the handle stays open while
            # Medline.parse reads from it.
            records = Medline.parse(handle)
            for med_rec in records:
                for k, v in med_rec.items():
                    print(f'{k}: {v}')

# Analyzing retrieved data

In [None]:
# Newer Biopython releases dropped Bio.SeqUtils.GC in favour of gc_fraction, which
# returns a fraction (0-1) instead of a percentage. Prefer gc_fraction when it is
# available and fall back to the legacy GC function otherwise, so this cell runs on both.
try:
    from Bio.SeqUtils import gc_fraction
except ImportError:
    from Bio.SeqUtils import GC
else:
    def GC(seq):
        """Return the GC content of *seq* as a percentage (0-100)."""
        return 100 * gc_fraction(seq)

sequence = "ATGCTGACTAGCTAGCTAGCGATATAGATAGTGATGATGATGTGTAGTATAGTAGTGATGTGATTAGTAGTAGTAGTATGATGATGTGATGTAGTATAGTAGTAGTAGTATGATGATGATTACATCTACTACTACTACTAGCTAGCTTAG"
# NOTE(review): the two code paths may differ in the last floating-point digit printed.
gc_content = GC(sequence)
print(f"GC content: {gc_content}%")

# Filtering retrieved data

In [None]:
# Import the SeqIO module from Biopython for parsing sequence data from files
from Bio import SeqIO

# Function to filter sequences based on their length
def filter_sequences_by_length(sequences, min_length):
    """Return the sequences whose length is at least ``min_length``.

    Args:
        sequences: Iterable of objects that support ``len()``.
        min_length: Minimum length (inclusive) an item needs to be kept.

    Returns:
        A new list holding the qualifying items in their original order.
    """
    # One pass over the input; anything shorter than the threshold is dropped.
    return [candidate for candidate in sequences if len(candidate) >= min_length]

# Example usage: keep only the records of a FASTA file that meet a minimum length.
fasta_file = "sequence.fasta"  # Replace with the filename of your FASTA file containing sequences
min_length = 100  # Set the minimum length for filtering sequences
sequences = SeqIO.parse(fasta_file, "fasta")  # Parse the FASTA file to get a sequence iterator
# The filter call iterates over `sequences` and returns a list of the kept records.
filtered_sequences = filter_sequences_by_length(sequences, min_length)  # Call the function to filter sequences
# The loop variable reuses the notebook-level name `sequence`, replacing any value an earlier cell stored there.
for sequence in filtered_sequences:
    print(sequence)  # Print the filtered sequences