# 🧬 Bioinformatics Lab Exercises: DNA/RNA/Protein Sequences

This notebook contains 5 exercises to practice sequence operations, translation, reverse translation, and simple data analysis using Biotite and Python.

## Imports and Setup

In [None]:
import biotite.sequence.io.fasta as fasta
from biotite.sequence import NucleotideSequence, ProteinSequence
import matplotlib.pyplot as plt
import numpy as np
import random

## Exercise 1: Basic Sequence Operations
**Objectives:**
- Load a FASTA file
- Compute nucleotide frequencies and GC content
- Translate DNA to protein

In [None]:
# Two demo records in FASTA format: a ">" header line followed by the sequence.
fasta_content = (
    ">Seq1\nATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG\n"
    ">Seq2\nGGGCCCAAATTTGGGCCCAGATATATATATGGGCCCTTT\n"
)

# Persist the records so the exercise has a real file to load.
with open("exercise1.fasta", "w") as f:
    f.write(fasta_content)

# TODO: read "exercise1.fasta" back in
# TODO: compute nucleotide frequencies and GC content

# Placeholders for the results; every count starts out as None.
nucleotide_counts = dict.fromkeys("ACGT")
GC_content = None

# Report the results at the end of the cell.
print(f"Nucleotide counts: {nucleotide_counts}")
print(f"GC content: {GC_content}")


## Exercise 2: Reverse Translation (Protein -> DNA)
**Objectives:**
- Implement a function to generate a DNA sequence from a protein sequence
- Use a codon table

In [None]:
from typing import Optional

# Codon table: one-letter amino-acid code -> list of DNA codons encoding it.
# The '*' entry holds the three stop codons; all entries together list 64 codons.
codon_table = {
    residue: codons.split()
    for residue, codons in (
        ('A', 'GCT GCC GCA GCG'),
        ('C', 'TGT TGC'),
        ('D', 'GAT GAC'),
        ('E', 'GAA GAG'),
        ('F', 'TTT TTC'),
        ('G', 'GGT GGC GGA GGG'),
        ('H', 'CAT CAC'),
        ('I', 'ATT ATC ATA'),
        ('K', 'AAA AAG'),
        ('L', 'TTA TTG CTT CTC CTA CTG'),
        ('M', 'ATG'),
        ('N', 'AAT AAC'),
        ('P', 'CCT CCC CCA CCG'),
        ('Q', 'CAA CAG'),
        ('R', 'CGT CGC CGA CGG AGA AGG'),
        ('S', 'TCT TCC TCA TCG AGT AGC'),
        ('T', 'ACT ACC ACA ACG'),
        ('V', 'GTT GTC GTA GTG'),
        ('W', 'TGG'),
        ('Y', 'TAT TAC'),
        ('*', 'TAA TAG TGA'),  # Stop codons
    )
}

# Implement reverse translation function
# Optionally, specify the number of sequences to generate
# What is the maximal number of sequences that can be generated from a protein sequence?
def reverse_translate(protein_seq: str, codon_table: dict = codon_table, sequence_num: Optional[int] = None) -> NucleotideSequence:
    """Exercise stub: reverse-translate a protein sequence into DNA.

    Not implemented yet: the body is ``pass``, so every call currently
    returns ``None`` instead of the annotated ``NucleotideSequence``.

    Args:
        protein_seq: Protein sequence to reverse-translate. Annotated as
            ``str``, but the notebook calls this with ``ProteinSequence``
            objects, so an implementation should accept both.
        codon_table: Mapping from one-letter amino-acid code (``'*'`` for
            stop) to the list of codons encoding it. Defaults to the
            module-level ``codon_table``.
        sequence_num: Optional number of sequences to generate.
            NOTE(review): the return annotation names a single sequence;
            decide how several sequences are returned and adjust the
            annotation to match.

    Returns:
        Intended: a DNA sequence that encodes ``protein_seq``.
    """
    pass

# Example protein sequence
# The trailing '*' is the key under which codon_table stores the stop codons.
prot = ProteinSequence('MGR*')
# NOTE: reverse_translate is still a stub, so dna_candidate is None (and is
# printed as "None") until the function is implemented.
dna_candidate = reverse_translate(prot)
print(f"Protein: {prot}")
print(f"Reverse-translated DNA: {dna_candidate}")

## Exercise 3: mRNA Exon Sequence Retrieval (Mock Data)
**Objectives:**
- Simulate mRNA exons
- Concatenate exons and translate

In [None]:
# Mock sequence and exons
full_sequence = 'ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG'
# Each entry is a [start, end] index pair into full_sequence.
# NOTE(review): presumably 0-based with an exclusive end (Python slice
# convention), which gives exon lengths 15, 6 and 6 -- confirm before slicing.
exon_indices = [[0,15], [21,27], [31,37]]


# Concatenate exons and translate to protein

# Placeholders: mrna starts as an empty sequence and protein_seq as None,
# so the prints below show empty/None values until the TODOs are done.
mrna = NucleotideSequence('') # TODO: concatenate exons
protein_seq = None # TODO: translate mrna
print(f"mRNA sequence: {mrna}")
print(f"Translated protein: {protein_seq}")

In [None]:
# Visualization: show nucleotide composition of mRNA as barplot with color=['green','blue','orange','red']
counts = {} # TODO: count nucleotides
# NOTE: plt.bar() requires at least the bar positions (x) and heights; the
# bare call below raises TypeError until those arguments are supplied.
plt.bar() # implement barplot
plt.title('mRNA Nucleotide Composition')
plt.show()

## Optional Exercise 3b: Fetch Real Gene Exons from NCBI
**Objectives:**
- Use NCBI Entrez API to retrieve gene sequence
- Extract exons/CDS and translate to protein

In [None]:
# Optional: requires Biopython
from Bio import Entrez, SeqIO


Entrez.email = "your_email@example.com"  # required by NCBI

def fetch_gene_protein(gene_name: str, organism: str = "Homo sapiens") -> Optional[str]:
    """Fetch the coding sequence (CDS) of a gene from the NCBI nucleotide database.

    Despite the function name, the returned string is a *nucleotide*
    sequence; translating it to protein is left to the caller.

    The function searches the ``nucleotide`` database for
    ``"<gene_name>[Gene] AND <organism>[Organism]"`` and uses only the first
    hit (``retmax=1``). The hit is downloaded as a GenBank record, and the
    sequence of every ``CDS`` feature is extracted and concatenated in
    feature order.

    Args:
        gene_name: Gene symbol used in the ``[Gene]`` search field.
        organism: Organism name used in the ``[Organism]`` search field.

    Returns:
        The concatenated CDS sequence as a string, or ``None`` (after
        printing a message) when the search has no hit or the record
        contains no ``CDS`` feature.
    """

    # search for gene
    handle = Entrez.esearch(db="nucleotide", term=f"{gene_name}[Gene] AND {organism}[Organism]", retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even if parsing the search result fails.
        handle.close()
    if not record["IdList"]:
        print("Gene not found.")
        return None

    # fetch gene
    gene_id = record["IdList"][0]
    handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="gb", retmode="text")
    try:
        gb_record = SeqIO.read(handle, "genbank")
    finally:
        # Close the handle even if the GenBank record cannot be parsed.
        handle.close()

    # extract the sequence of every CDS feature
    exons = [
        str(feature.location.extract(gb_record).seq)
        for feature in gb_record.features
        if feature.type == "CDS"
    ]
    if not exons:
        print("No CDS/exons found for this gene.")
        return None
    return "".join(exons)

# Obtain mRNA sequence for BRCA1 gene and translate to protein sequence
# TODO: call fetch_gene_protein(gene_name); it returns None when the search
# has no hit or the record has no CDS feature, so check before translating.
gene_name = "BRCA1"


## Exercise 4: Translation Efficiency Simulation
**Objectives:**
- Calculate translation time based on the tRNA abundance of each codon
- Determine mean translation time for a protein

In [None]:
# Example tRNA abundance table (arbitrary values)
from numpy import ndarray


# tRNA abundance per codon (arbitrary demo values).
# NOTE(review): stop codons carry abundance 0, so a naive 1/abundance divides
# by zero; codons missing from this table (e.g. all arginine codons, although
# the example protein 'MGR*' contains R) need explicit handling as well.
tRNA_table = {
    # Alanine codons
    'GCT': 5,
    'GCC': 10,
    'GCA': 4,
    'GCG': 8,
    # Methionine codon
    'ATG': 15,
    # Glycine codons
    'GGT': 8,
    'GGC': 12,
    'GGA': 6,
    'GGG': 9,
    # stop codons
    'TAA': 0,
    'TAG': 0,
    'TGA': 0,
}

# Translation time per codon = 1/tRNA abundance (higher abundance -> shorter time)
def calculate_translation_time(nucleotide_seq: str, tRNA_table: dict = tRNA_table) -> float:
    """Exercise stub: calculate translation time for a nucleotide sequence.

    Not implemented yet: the body is ``pass``, so the call returns ``None``.

    Args:
        nucleotide_seq: Nucleotide sequence whose codons are looked up.
        tRNA_table: Mapping from codon to tRNA abundance. Defaults to the
            module-level ``tRNA_table``.

    Returns:
        Intended: the translation time of the sequence. With time defined
        as 1/abundance per codon, the result is a float.

    NOTE(review): the default table assigns abundance 0 to the stop codons
    and does not list every codon, so an implementation has to decide how
    to treat zero and missing abundances.
    """
    pass

def calculate_per_residue_mean_translation_time(protein_seq: str, codon_table: dict = codon_table, tRNA_table: dict = tRNA_table) -> ndarray:
    """Exercise stub: per-residue mean translation time for a protein sequence.

    Not implemented yet: the body is ``pass``, so the call returns ``None``
    instead of the annotated ``ndarray``.

    Args:
        protein_seq: Protein sequence. Annotated as ``str``, but the notebook
            passes a ``ProteinSequence``.
        codon_table: Mapping from one-letter amino-acid code to the list of
            codons encoding it. Defaults to the module-level ``codon_table``.
        tRNA_table: Mapping from codon to tRNA abundance. Defaults to the
            module-level ``tRNA_table``.

    Returns:
        Intended: one value per residue, the mean translation time over all
        source nucleotide sequences.
        NOTE(review): presumably this is the mean over the residue's
        synonymous codons -- confirm the intended definition.
    """
    pass


# NOTE: calculate_per_residue_mean_translation_time is still a stub returning
# None, so the .mean() call below raises AttributeError until it is implemented.
translation_times = calculate_per_residue_mean_translation_time(prot)
print(f"Per-residue translation times: {translation_times}")
print(f"Mean translation time: {translation_times.mean():.2f}")     # Is mean of means actually the mean translation time for the protein?

# Implement barplot showing translation time per amino acid
# NOTE: plt.bar() requires at least the bar positions (x) and heights; the
# bare call below raises TypeError until those arguments are supplied.
plt.bar() # TODO: implement barplot
plt.xlabel('Residue position')
plt.ylabel('Translation time')
plt.title('Translation Time per Amino Acid')
plt.show()

## Exercise 5: Sequence Analysis Pipeline
**Objectives:**
- Integrate previous steps
- Visualize composition and translation efficiency

In [None]:
# Input: protein sequence
protein_input = ProteinSequence('MGRMGR')

# Reverse translate
# NOTE: reverse_translate is an unimplemented stub that returns None, so
# dna_seq is None here and dna_seq.translate(...) below raises AttributeError
# until the stub is filled in.
dna_seq = reverse_translate(protein_input)
print(f"DNA candidate: {dna_seq}")

# Translate back to protein (frame 0)
protein_check = dna_seq.translate(complete=True)        # what happens, if complete=False?
print(f"Translated back: {protein_check}")

# Propose a sequence with the fastest, and the slowest translation time
# (both start out as empty placeholder sequences)
fastest_translation_sequence = NucleotideSequence('')
slowest_translation_sequence = NucleotideSequence('')

# Implement the function calculating the per-residue translation time starting from nucleotide sequence
def per_res_translation_time(nucleotide_seq: str, tRNA_table: dict = tRNA_table) -> ndarray:
    """Exercise stub: per-residue translation time for a nucleotide sequence.

    Not implemented yet: the body is ``pass``, so the call returns ``None``
    instead of the annotated ``ndarray``.

    Args:
        nucleotide_seq: Nucleotide sequence to evaluate. Annotated as
            ``str``, but the notebook passes ``NucleotideSequence`` objects.
        tRNA_table: Mapping from codon to tRNA abundance. Defaults to the
            module-level ``tRNA_table``.

    Returns:
        Intended: one translation time per residue.
        NOTE(review): presumably one value per codon of ``nucleotide_seq``
        -- confirm.
    """
    pass

# Calculate the translation time for the proposed sequences
# NOTE: per_res_translation_time is still a stub, so both results are None
# until it is implemented and real sequences are proposed above.
fastest_translation_time = per_res_translation_time(fastest_translation_sequence)
slowest_translation_time = per_res_translation_time(slowest_translation_sequence)

# Visualization
# Create a plot with the following subplots:
# 1. Barplot of nucleotide composition of the fastest-translating DNA sequence
# 2. Barplot of nucleotide composition of the slowest-translating DNA sequence below the first one
# 3. Line plot of total translation time as a function of the number of residues
#    for the fastest- and slowest-translating DNA sequences

# Create figure with subplots: 2 plots in left column, 1 larger plot in right panel
fig = plt.figure(figsize=(12, 8))

# Left column: two subplots stacked vertically
ax1 = plt.subplot2grid((2, 2), (0, 0))  # Top left
ax2 = plt.subplot2grid((2, 2), (1, 0))  # Bottom left

# Right panel: one larger subplot spanning both rows
ax3 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)  # Right side, spans both rows

# ax1 - barplot of nucleotide composition of the fastest-translating DNA sequence
# TODO: draw the bars on ax1
ax1.set_xlabel('Nucleotide')
ax1.set_ylabel('Count')
ax1.set_title('Nucleotide Composition - Fastest Translation Sequence')

# ax2 - barplot of nucleotide composition of the slowest-translating DNA sequence
# TODO: draw the bars on ax2
ax2.set_xlabel('Nucleotide')
ax2.set_ylabel('Count')
ax2.set_title('Nucleotide Composition - Slowest Translation Sequence')

# ax3 - line plot of total translation time as a function of the number of
# residues for the fastest- and slowest-translating DNA sequences
# TODO: draw both lines on ax3
ax3.set_xlabel('Number of Residues')
# Label the y-axis and title as well, so ax3 matches ax1 and ax2.
ax3.set_ylabel('Total Translation Time')
ax3.set_title('Total Translation Time - Fastest vs. Slowest Sequence')

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()
