We extract the CDS and exon sequences for a given mRNA ID.
We then find the translation start site by locating the CDS within the concatenated exon sequence.
The exon-skipping function then:
- Finds the exon containing the translation start site.
- Skips each exon downstream of that exon, one at a time, and translates the resulting mRNA.


In [None]:
# @title Prerequisite Libraries
!pip install biopython
!pip install termcolor

from Bio import Entrez, SeqIO
from Bio.Seq import Seq, translate
from termcolor import colored

# Register a contact email on Biopython's Entrez module before any efetch call is made.
# NOTE(review): the address below is hard-coded — substitute your own before running or sharing.
Entrez.email = "muzzammilbhaisaheb@gmail.com"  # replace with your email

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [None]:
# @title Function: Retrieve mRNA sequence and extract exons
def get_exons_from_mrna(mrna_id):
    """Fetch a GenBank record from NCBI and return its annotated exon sequences.

    Args:
        mrna_id: Accession passed to ``Entrez.efetch`` as the nucleotide ``id``.

    Returns:
        list: One sequence per feature whose type is ``"exon"``, in the order
        the features appear in the record. An empty list is returned when the
        record has no exon features or when fetching/parsing fails (the error
        is printed, not raised).
    """
    try:
        handle = Entrez.efetch(db="nucleotide", id=mrna_id, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the network handle even when parsing raises.
            handle.close()

        # Keep only exon features and pull each one's sequence out of the record.
        return [
            feature.location.extract(record).seq
            for feature in record.features
            if feature.type == "exon"
        ]

    except Exception as e:
        # Best-effort by design: callers treat an empty list as "no exons".
        print(f"An error occurred: {e}")
        return []

In [None]:
# @title Function to fetch CDS from mRNA
def fetch_cds(mrna_id):
    """Fetch a GenBank record from NCBI and return the sequence of its first CDS feature.

    Args:
        mrna_id: Accession passed to ``Entrez.efetch`` as the nucleotide ``id``.

    Returns:
        The sequence extracted from the first feature of type ``"CDS"``, or
        ``None`` when the record has no CDS feature or when fetching/parsing
        fails. Both failure cases print a red message instead of raising.
    """
    try:
        # Fetch the GenBank record for the given mRNA ID
        handle = Entrez.efetch(db="nucleotide", id=mrna_id, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the network handle even when parsing raises.
            handle.close()

        # Return the sequence of the first CDS feature found in the record
        for feature in record.features:
            if feature.type == "CDS":
                return feature.extract(record.seq)

        # If no CDS is found
        print(colored(f"No CDS found for NCBI mRNA ID: {mrna_id}", "red"))
        return None
    except Exception as e:
        # Best-effort by design: callers treat None as "no CDS available".
        print(colored(f"Error: {str(e)}", "red"))
        return None

In [None]:
# @title Function to find translation start site position
def find_cds_start_position(mrna_id):
    """Locate where the CDS begins inside the concatenated exon sequence.

    Args:
        mrna_id: Accession forwarded to ``get_exons_from_mrna`` and ``fetch_cds``.

    Returns:
        int | None: 1-based offset of the first CDS base within the joined
        exons, or ``None`` (after printing a message) when exons are missing,
        no CDS is available, or the CDS does not occur in the joined exons.
    """
    exon_seqs = get_exons_from_mrna(mrna_id)
    if not exon_seqs:
        print("No exons found.")
        return None

    # Splice the exons end to end to rebuild the transcript as plain text.
    spliced = "".join(map(str, exon_seqs))

    cds = fetch_cds(mrna_id)
    if cds is None:
        print("No CDS sequence found.")
        return None

    # str.find yields a 0-based index, or a negative value when absent.
    zero_based = spliced.find(str(cds))
    if zero_based < 0:
        print("CDS not found within the mRNA sequence.")
        return None

    # Convert to the 1-based coordinate used by the rest of the notebook.
    return zero_based + 1

In [None]:
# @title ESK_CDSv02

# Function to find the exon containing the CDS start position
def find_exon_containing_start(exons, start_position):
    """Return the 0-based index of the exon that covers a 1-based transcript position.

    Args:
        exons: Exon sequences in transcript order; only their lengths are used.
        start_position: 1-based position within the concatenated exons.

    Returns:
        int | None: Index of the first exon whose end reaches ``start_position``,
        or ``None`` when the position lies beyond the last exon.
    """
    # Count down the bases still to be crossed; the exon that brings the
    # counter to zero or below is the one holding the position.
    bases_left = start_position
    for exon_index, exon in enumerate(exons):
        bases_left -= len(exon)
        if bases_left <= 0:
            return exon_index
    return None

# Function to simulate exon skipping and translate modified mRNA starting from the CDS start exon
def simulate_exon_skipping_from_start_and_translate(mrna_id):
    """Print the translation obtained after skipping each exon downstream of the start-codon exon.

    For every exon that follows the exon containing the CDS start, a modified
    transcript is built with that single exon removed. The transcript is then
    translated from the original CDS start offset and the result is printed.

    Args:
        mrna_id: Accession forwarded to ``find_cds_start_position`` and
            ``get_exons_from_mrna``.

    Returns:
        None. All results and diagnostics are printed.
    """
    # Find the start position of the CDS (1-based)
    start_position = find_cds_start_position(mrna_id)
    if start_position is None:
        print("Unable to find start position for CDS.")
        return None

    # Fetch the exons that make up the transcript
    exons = get_exons_from_mrna(mrna_id)
    if not exons:
        print("No exons found.")
        return None

    # Find which exon contains the start position
    start_exon_index = find_exon_containing_start(exons, start_position)
    if start_exon_index is None:
        print("Unable to find the exon containing the start position.")
        return None

    print(f"Start codon (ATG) is located in exon {start_exon_index + 1}.")

    # Loop-invariant work: 0-based CDS offset and plain-string exons.
    cds_offset = start_position - 1
    exon_strings = [str(exon) for exon in exons]

    # Skip exactly one exon per iteration, starting with the exon after the
    # one that holds the start codon (the start exon itself is never skipped).
    for i in range(start_exon_index + 1, len(exon_strings)):
        modified_mrna_seq = "".join(exon_strings[:i] + exon_strings[i + 1:])

        # Ensure that we only translate if the start position is within bounds
        if cds_offset < len(modified_mrna_seq):
            reading_frame = modified_mrna_seq[cds_offset:]
            # Drop a trailing partial codon: translate() ignores it anyway but
            # emits a BiopythonWarning that may become an error in the future.
            reading_frame = reading_frame[:len(reading_frame) - len(reading_frame) % 3]
            protein_sequence = translate(reading_frame)
            print(f"\nSkipping exon {i + 1}:")
            print(f"Modified mRNA length: {len(modified_mrna_seq)}")
            print(f"Modified mRNA Sequence: {modified_mrna_seq}")
            print(f"Translated Protein Sequence: {protein_sequence}\n")
        else:
            print(f"Start position {start_position} is out of bounds after skipping exon {i + 1}.")

In [None]:
# @title Input mRNA ID
#mrna_id = "NM_002046.7"  #mRNA ID = GAPDH
#mrna_id = "NM_000492.4"  #mRNA ID = CFTR
#mrna_id = "NM_170707.4"  #mRNA ID = LMNA
#mrna_id = "NM_001110556.2"  #mRNA ID = FLNA
#mrna_id = "NM_001165963.4" #mRNA ID = SCN1A
# Strip stray whitespace (e.g. a pasted trailing space) so the accession
# reaches the Entrez lookups exactly as intended; input() already returns str.
mrna_id = input("Enter the mRNA ID: ").strip()

# Report where the CDS begins in the concatenated exon sequence, if it can be located.
start_position = find_cds_start_position(mrna_id)
if start_position is not None:
    print(f"The CDS starts at position {start_position} in the mRNA sequence.")

# Print the translation obtained after skipping each exon downstream of the start-codon exon.
simulate_exon_skipping_from_start_and_translate(mrna_id)


Enter the mRNA ID: NM_000492.4
The CDS starts at position 71 in the mRNA sequence.
Start codon (ATG) is located in exon 1.

Skipping exon 2:
Modified mRNA length: 5959
Modified mRNA Sequence: GTAGTAGGTCTTTGGCATTAGGAGCTTGAGCCCAGACGGCCCTAGCAGGGACCCCAGCGCCCGAGAGACCATGCAGAGGTCGCCTCTGGAAAAGGCCAGCGTTGTCTCCAAACTTTTTTTCAGAGAATGGGATAGAGAGCTGGCTTCAAAGAAAAATCCTAAACTCATTAATGCCCTTCGGCGATGTTTTTTCTGGAGATTTATGTTCTATGGAATCTTTTTATATTTAGGGGAAGTCACCAAAGCAGTACAGCCTCTCTTACTGGGAAGAATCATAGCTTCCTATGACCCGGATAACAAGGAGGAACGCTCTATCGCGATTTATCTAGGCATAGGCTTATGCCTTCTCTTTATTGTGAGGACACTGCTCCTACACCCAGCCATTTTTGGCCTTCATCACATTGGAATGCAGATGAGAATAGCTATGTTTAGTTTGATTTATAAGAAGACTTTAAAGCTGTCAAGCCGTGTTCTAGATAAAATAAGTATTGGACAACTTGTTAGTCTCCTTTCCAACAACCTGAACAAATTTGATGAAGGACTTGCATTGGCACATTTCGTGTGGATCGCTCCTTTGCAAGTGGCACTCCTCATGGGGCTAATCTGGGAGTTGTTACAGGCGTCTGCCTTCTGTGGACTTGGTTTCCTGATAGTCCTTGCCCTTTTTCAGGCTGGGCTAGGGAGAATGATGATGAAGTACAGAGATCAGAGAGCTGGGAAGATCAGTGAAAGACTTGTGATTACCTCAGAAATGATTGAAAATATCCAATCTGTTAAGGCATACTGCTGGGAAGAAGCAATGGAAAAA