# Find ORF and Translate to Protein

**Authors:** [Tony Kabilan Okeke](mailto:tko35@drexel.edu), [Ifeanyi Osuchukwu](mailto:imo27@drexel.edu)  
**Template Author:** [Ahmet Sacan](mailto:ahmetmsacan@gmail.com)  
**Date:** 01.08.2022

In [1]:
# Import packages and functions (standard library, third-party, then local)
from pathlib import Path
from urllib.request import urlretrieve

from Bio.SeqIO import parse

from dnatools import seq_transcribe, seq_findgene, pprint

# Download data from NCBI for testing
url = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&report=fasta&sort=&id=568815587&from=5225466&to=5227071&strand=on&maxplex=1"
fasta_path = Path("NC_000011.fasta")

# Only hit the network when there is no usable local copy, so that
# Restart & Run All stays fast and keeps working offline after the first run.
# An empty file is treated as a failed earlier download and fetched again.
if not fasta_path.is_file() or fasta_path.stat().st_size == 0:
    urlretrieve(url, str(fasta_path))

## Test Cases for `seq_transcribe`

In [2]:
# Input without an ATG; the recorded output translates it from the first base (CUU ACC UCA -> LTS)
pprint(seq_transcribe("CTTACCTCAT"))

{
  "noncode": "ATGAGGTAAG",
  "mrna": "CUUACCUCAU",
  "ptn": "LTS"
}


In [3]:
# Start codon, one codon, stop codon, plus one leftover base (AUG AGG UAA G -> MR*)
pprint(seq_transcribe("ATGAGGTAAG"))

{
  "noncode": "CTTACCTCAT",
  "mrna": "AUGAGGUAAG",
  "ptn": "MR*"
}


In [4]:
# Stop codon in the middle of the reading frame; the recorded translation continues past it (T*IDN)
pprint(seq_transcribe("ACGTGAATCGATAATA"))

{
  "noncode": "TATTATCGATTCACGT",
  "mrna": "ACGUGAAUCGAUAAUA",
  "ptn": "T*IDN"
}


In [5]:
# Smallest case: a single stop codon
pprint(seq_transcribe("TGA"))

{
  "noncode": "TCA",
  "mrna": "UGA",
  "ptn": "*"
}


In [6]:
# Real-sequence check using the FASTA file saved by the download step:
# NC_000011.10:c5227071-5225466 Homo sapiens chromosome 11, GRCh38.p13 Primary Assembly

# Concatenate the sequence of every FASTA record into one plain string
dna = ''.join(
    str(record.seq)
    for record in parse("NC_000011.fasta", "fasta")
)

pprint(seq_transcribe(dna))

{
  "noncode": "GCAATGAAAATAAATGTTTTTTATTAGGCAGAATCCAGATGCTCAAGGCCCTTCATAATATCCCCCAGTTTAGTAGTTGGACTTAGGGAACAAAGGAACCTTTAATAGAAATTGGACAGCAAGAAAGCGAGCTTAGTGATACTTGTGGGCCAGGGCATTAGCCACACCAGCCACCACTTTCTGATAGGCAGCCTGCACTGGTGGGGTGAATTCTTTGCCAAAGTGATGGGCCAGCACACAGACCAGCACGTTGCCCAGGAGCTGTGGGAGGAAGATAAGAGGTATGAACATGATTAGCAAAAGGGCCTAGCTTGGACTCAGAATAATCCAGCCTTATCCCAACCATAAAATAAAAGCAGAATGGTAGCTGGATTGTAGCTGCTATTAGCAATATGAAACCTCTTACATCAGTTACAATTTATATGCAGAAATATTTATATGCAGAGATATTGCTATTGCCTTAACCCAGAAATTATCACTGTTATTCTTTAGAATGGTGCAAAGAGGCATGATACATTGTATCATTATTGCCCTGAAAGAAAGAGATTAGGGAAAGTATTAGAAATAAGATAAACAAAAAAGTATATTAAAAGAAGAAAGCATTTTTTAAAATTACAAATGCAAAATTACCCTGATTTGGTCAATATGTGTACACATATTAAAACATTACACTTTAACCCATAAATATGTATAATGATTATGTATCAATTAAAAATAAAAGAAAATAAAGTAGGGAGATTATGAATATGCAAATAAGCACACATATATTCCAAATAGTAATGTACTAGGCAGACTGTGTAAAGTTTTTTTTTAAGTTACTTAATGTATCTCAGAGATATTTCCTTTTGTTATACACAATGTTAAGGCATTAAGTATAATAGTAAAAATTGCGGAGAAGAAAAAAAAAGAAAGCAAGAATTAAACAAAAGAAAACAATTGTTATGAACAGCAAATAAAAGAAACTAAAACGATCCTGAGACTTCC

In [7]:
# Custom test case: 4 Proline codons (CCC), 4 Tyrosine codons (TAT),
# then a stop codon (TAG)
pprint(seq_transcribe("CCCCCCCCCCCCTATTATTATTATTAG"))

{
  "noncode": "CTAATAATAATAATAGGGGGGGGGGGG",
  "mrna": "CCCCCCCCCCCCUAUUAUUAUUAUUAG",
  "ptn": "PPPPYYYY*"
}


## Test Cases for `seq_findgene`

In [8]:
# One start and one stop codon, both on the reverse-complement strand
# (the input itself contains no ATG)
seq_findgene("CTTACCTCAT")

'MR*'

In [9]:
# With a single start and a single stop codon on the given strand.
# This input is the reverse complement of the previous sequence (not just its
# complement), so the same gene is expected.
seq_findgene('ATGAGGTAAG')

'MR*'

In [10]:
# With no start codon on either strand. Stop codons (TGA, TAA) do occur in the
# sequence, but with no ATG there is no open reading frame to report.
seq_findgene('ACGTGAATCGATAATA')

''

In [11]:
# Several start and stop codons: the input holds two ORFs,
# ATG GGC AAC TAG and ATG CCG TGA; the recorded result is the longer one
seq_findgene("CCCATGGGCAACTAGTATGCCGTGA")

'MGN*'

In [12]:
# Test it on a real DNA sequence
dna = ''.join([str(seq.seq) for seq in parse("NC_000011.fasta", "fasta")])
seq_findgene(dna)

'MKLVVRPWAGWYQGYKTGLRRPIETGHVETEKTLGFLIGTDSLCLLVYFPTLRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRVSLWDA*'

In [13]:
# Custom test case encoding 1 Methionine, 4 Proline and 4 Tyrosine residues followed by a
# stop codon. One extra nucleotide (C) sits on each side of the open reading frame; the
# flanking bases were chosen so that they do not create a longer ORF.
seq_findgene("CATGCCCCCCCCCCCCTACTACTACTACTGAC")

'MPPPPYYYY*'