In [1]:
# Please put all your import statements here, according to the example. Include a comment with the name of 
# the library and its version. Do not include import statements in the rest of the code

# Example:
# Biopython v 1.78
import Bio # this would obviously be uncommented if you are importing Biopython
from Bio import SeqIO
from Bio.Seq import Seq
import re

# Question 1: Primers and melting temperature

### Question 1.a: Reading a DNA sequence from a file

Write a python function called ```readDNAsequence``` that takes as its argument the name of a file.  When passed the name of a FASTA file, the function should read the file, discard the header and return the sequence as a string. Your code should raise ```BadSequenceException``` (defined below) if the sequence part of the file contains characters that are not one of the letters A, C, T, G, U. All U nucleotides should be replaced by T in the returned string (for simplicity, we will be working with T only throughout the rest of this assignment).

In [2]:
# Run this cell to define the exception
class BadSequenceException(Exception):
    
    pass

In [3]:
# Your code here
def readDNAsequence(filename):
    try:
        for record in SeqIO.parse(filename, "fasta"):
            # replace all 'U' with 'T' if any
            sequence = str(record.seq).replace("U", "T")
            # sequence must contain A, C, G, T
            nucleotides = {'A', 'C', 'G', 'T'}
            # create set with unique bases of the sequence
            unique_bases = set(sequence)
            # to find out if other bases without A, C, G, T
            different = unique_bases.difference(nucleotides)
            # if base without A, C, G, T, raise exception
            if different:
                raise BadSequenceException
            else:
                return sequence
    except BadSequenceException:
        print(f"Sequence contains {different}, which are not one of the letters A, C, G, T, U.")
        return "Invalid Sequence!"

In [4]:
seq = readDNAsequence('BADexample.fasta')

Sequence contains {'N', 'X', 'Q'}, which are not one of the letters A, C, G, T, U.


In [5]:
print(seq)

Invalid Sequence!


In [6]:
s10 = "GAAGATCTCAGGCAGTGACCCTCTAGATGGAAGCACTGTCTGTTGTCTAAGAAAAGATCGTGCATCCTTTTAGAGTGTTACTGTTTGAGAAAATC"

In [7]:
# Test code
import os
with open("test9876345.fas", "wt") as _OUTF:
    _OUTF.write("> test\n")
    _OUTF.write("ACTG\n")
_seq=readDNAsequence("test9876345.fas")
assert type(_seq) is type(""), "Return value is not a string: %r" % _seq
os.remove("test9876345.fas")
print("OK")

OK


### Question 1.b: Computing the complement of a sequence

Write a function called ```complement``` that takes a string containing a DNA sequence as its only  parameter and returns the complement of the sequence in a string. The function should raise ```BadSequenceException``` if the argument sequence contains anything else than the four characters A, C, T, G. Do not reverse the string; for the avoidance of doubt, if the input string starts with A then the complement string should start with T.


In [8]:
# Your code here
def complement(dna):
    # create dictionary with base : complement pair
    complement_pair = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    try:
        nucleotides = {'A', 'C', 'G', 'T'}
        different = set(dna).difference(nucleotides)
        if different:
            raise BadSequenceException
        else:
            # return complement sequence
            return "".join(complement_pair.get(base) for base in dna)
    
    except BadSequenceException:
        print(f"Sequence contains {different}, which are not one of the letters A, C, G, T.")
        return "Invalid Sequence!"

In [9]:
# Test code
_seq=complement("ACGTTTCGN")
assert type(_seq) is type(""), "Return value is not a string: %r" % _seq
print("OK")

Sequence contains {'N'}, which are not one of the letters A, C, G, T.
OK


### Question 1.c: Extracting primers

Write a function called ```primer``` that takes three parameters: a DNA sequence called ```sequence```, an integer ```length``` that is 20 by default and a Boolean value ```forward``` that is ```True``` by default. When ```forward``` is ```True``` (or is not passed), the function should return a Forward primer for the sequence passed as ```sequence```; when it is ```False```, it should return a  Reverse primer. The length of the primer is specified by ```length```; if this is not passed, a primer of length 20 should be returned.  Refer to the Background document for a definition of primers and how to compute them (for the avoidance of doubt, if the sequence string ends with a C, then the reverse primer string should start with a G). If the sequence is shorter than ```length``` nucleotides, your code should raise a ```BadSequenceException```.


In [10]:
# Your code here
def primer(sequence, length=20, forward=True):
    try:
        # if the sequence size is shorter than given length, raise exception
        if len(sequence) < length:
            raise BadSequenceException
        else:
            # if forward = True, execute this
            if forward:
                # return first length nucleotides
                return sequence[0:length]
            else:
                # return first length nucleotides of reverse strand
                return str(Seq(sequence).reverse_complement()[0:length])
            
    except BadSequenceException:
        print(f"Sequence length {len(sequence)} is shorter than given length.")
        return "Invalid Sequence!"
    


In [11]:
# Test code
_seq=primer(sequence="AAAAATTTTTCCCCCGGGGGAAAAA", length= 20, forward=False)
assert type(_seq) is type(""), "Return value is not a string: %r" % _seq
print("OK")

OK


### Question 1.d: Computing the melting temperature

Write a function called ```meltingTemp``` that takes a string representing a primer as its argument. The function should return the melting temperature of the primer in degrees Celsius according to the equation given in the Background document. If the sequence contains characters other than A, C, T, G, the function should raise a ```BadSequenceException```.

In [12]:
# Your code here
def meltingTemp(primer_seq):
    try:
        nucleotides = {'A', 'C', 'G', 'T'}
        different = set(primer_seq).difference(nucleotides)
        if different:
            raise BadSequenceException
        else:
            A, T = primer_seq.count("A"), primer_seq.count("T")
            G, C = primer_seq.count("G"), primer_seq.count("C")
            Tm = 4*(G + C) + 2*(A + T)
            return Tm
    
    except BadSequenceException:
        print(f"Sequence contains {different}, which are not one of the letters A, C, G, T.")
        return 0

In [13]:
# Test code
_temp=meltingTemp("AAAAATTTTTCCCCCGGGGG")
assert ((type(_temp) is type(0.0)) or
        (type(_temp) is type(0))), "Return value is not a number: %r" % _temp
print("OK")

OK


### Question 1.e: Putting it all together

Write a function called ```sequencePCRtemp``` that takes a string containing the name of a FASTA file as its argument. The function should return the average melting temperature of the two primers of the sequence as a ```float```.

In [14]:
# Your code here
def sequencePCRtemp(filename):
    # read fasta file with readDNAsequence function
    seq = readDNAsequence(filename)
    # complement the seq
    complement_seq = complement(seq)
    # find out primer for seq as well as complement_seq
    primer_seq = primer(seq)
    primer_complement_seq = primer(complement_seq)
    # calculate melting temperature
    Tm_seq = meltingTemp(primer_seq)
    Tm_complement_seq = meltingTemp(primer_complement_seq)
    # calculate average melting temperature
    avg_Tm = (Tm_seq + Tm_complement_seq)/2.0
    return avg_Tm

In [15]:
sequencePCRtemp("BADexample.fasta")

Sequence contains {'N', 'X', 'Q'}, which are not one of the letters A, C, G, T, U.
Sequence contains {'n', 'v', 'q', 'u', 'a', '!', 'e', 'S', 'l', ' ', 'c', 'i', 'I', 'd'}, which are not one of the letters A, C, G, T.
Sequence length 17 is shorter than given length.
Sequence length 17 is shorter than given length.
Sequence contains {'n', 'v', 'q', 'u', 'a', '!', 'e', 'S', 'l', ' ', 'c', 'i', 'I', 'd'}, which are not one of the letters A, C, G, T.
Sequence contains {'n', 'v', 'q', 'u', 'a', '!', 'e', 'S', 'l', ' ', 'c', 'i', 'I', 'd'}, which are not one of the letters A, C, G, T.


0.0

In [16]:
# Test code
import os
with open("test9876346.fas", "wt") as _OUTF:
    _OUTF.write("> test\n")
    _OUTF.write("AAAAACCCCCTTTTTGGGGGAAAAA\n")
_temp=sequencePCRtemp("test9876346.fas")
assert type(_temp) is type(0.0), "Return value is not a float: %r" % _temp
os.remove("test9876346.fas")
print("OK")

OK


# Question 2: Translation and reading frames

### Question 2.a: Reading frames
Write a function ```translate``` that takes a string containing a DNA sequence as its input and outputs a Python dictionary containing the translation of the sequence in all possible reading frames. The keys of the dictionary should be ```f1```, ```f2```, ```f3``` for the three forward frames and ```r1```, ```r2``` and ```r3``` for the reverse reading frames; the value of each key should be the translation of the sequence in the corresponding frame.
For simplicity and ease of debugging, **do not complement the sequence** when computing the reverse reading frames; just reverse it. Use an asterisk (```*```) to represent stop codons. Always translate the entire sequence.

In [17]:
# Your code here
def translate(sequence):
    # dictionary of codon : amino acid pair
    table = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                 
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
        'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W' }
    
    # create a dictionary to store proteins
    proteins = {}
    
    # for iterating over forward sequence, frame: f1, f2, f3
    for frame in range(3):
        # make the sequence length as multiple of 3
        # if sequence length = 25 and frame = 0 then, 25-0=25, 25//3=8 and finally length=3*8
        length = 3 * ((len(sequence)-frame) // 3)
        # take the part of sequence to perform translation
        seq = sequence[frame:frame+length]
        #proteins["f"+str(frame+1)] = "".join([table[seq[i:i + 3]] for i in range(0, len(seq), 3)])
        
        
        # for your understanding
        amino_acid = []
        # take a codon each time and convert it into amino acid
        for i in range(0, len(seq), 3):
            # take amino acid of corresponding codon from the above codon table
            amino_acid.append(table[seq[i:i + 3]])
        
        # join each amino acid to make string
        protein = "".join(amino_acid)
        # add this protein to proteins dictionary
        proteins["f"+str(frame+1)] = protein    
        # end here
        
        
    
    # reverse the sequence
    rev_seq = sequence[::-1]
    
    # for iterating over reverse sequence
    for frame in range(3):
        length = 3 * ((len(rev_seq)-frame) // 3) # Multiple of three
        rev_seq_framed = rev_seq[frame:frame+length]
        proteins["r"+str(frame+1)] = "".join([table[rev_seq_framed[j:j + 3]] \
                                              for j in range(0, len(rev_seq_framed), 3)])
        
    return proteins       

In [18]:
translate(s10)

{'f1': 'EDLRQ*PSRWKHCLLSKKRSCILLECYCLRK',
 'f2': 'KISGSDPLDGSTVCCLRKDRASF*SVTV*EN',
 'f3': 'RSQAVTL*MEALSVV*EKIVHPFRVLLFEKI',
 'r1': 'LKEFVIVRFSYVLEKNLLSVTKVDLPVTDSR',
 'r2': '*KSLSL*DFPTC*KRICCLSRR*ISQ*RTLE',
 'r3': 'KRVCHCEIFLRARKESVVCHEGRSPSDGL*K'}

In [19]:
# Test code
_seqdic=translate("ACTGACTGACTGACTGACTGACTG")
assert type(_seqdic)==type(dict()), "Return value is not a dictionary: %r" % _seqdic
assert set(_seqdic.keys())==set(['f1', 'f2', 'f3', 'r1', 'r2', 'r3']), \
    "Output dictionary has incorrect/missing keys: %r"  % _seqdic.keys()
assert type(_seqdic['f1'])==type(""), \
    "Output dictionary values should be strings, not %r" % type(_seqdic['f1'])
print("OK")

OK


### Question 2.b: Locating an ORF

Write a function called ```openReadingFrame``` that takes a string containing an aminoacid sequence as its argument and returns a string containing the aminoacids between the first Methionine (included) and the first STOP codon that follows it (excluded). Assume the stop codon is represented by an asterisk (```*```) as would be returned by ```translate``` above. If either the Methionine or the STOP codon are missing, your function should return an empty string.

In [20]:
# Your code here
def openReadingFrame(sequence):
    # to find out orf between M and *
    pattern = "(M.*?)(?:\*)"
    orf = re.findall(pattern, sequence)
    # if orf is not empty return orf
    if orf:
        return orf[0]
    # if orf is empty return empty string
    else:
        return ""


In [21]:
s11 = 'EDLRQ*PSRWKHCLLSKKRSCILLECYCLRK'

In [22]:
# Test code
_seq=openReadingFrame("AMCAPP*L")
assert type(_seq) is type(""), "Return value is not a string: %r" % _seq
print("OK")

OK


### Question 2.c: Translating a sequence

Write a function called ```candidateProtein``` that takes a string containing a DNA sequence as its input and outputs the string of aminoacids corresponding to the longest ORF, as extracted by ```openReadingFrame``` above.

In [23]:
def candidateProtein(dna):
    # find out protein in all reading frames
    proteins = translate(dna)
    #print(proteins)
    # extract orf from all proteins
    ORFs = [openReadingFrame(protein) for frame, protein in proteins.items()]
    #print(ORFs)
    # find out the longest orf
    longest_orf = max(ORFs, key=len)
    #print(longest_orf)
    # find out the protein corresponding to the longest orf
    for prot in list(proteins.values()):
        if longest_orf in prot:
            return prot
            break   

In [24]:
candidateProtein(s10)

'RSQAVTL*MEALSVV*EKIVHPFRVLLFEKI'

In [25]:
# Test code
_seq=candidateProtein("ATGACTGCTGGGTAG")
assert type(_seq) is type(""), "Return value is not a string: %r" % _seq
print("OK")

OK


### Question 2.d: Writing a FASTA file

Write a function called ```writeFASTA``` that takes three string arguments called, in the order, ```sequence```, ```description``` and ```filename```. Argument ```sequence``` should contain an aminoacid sequence. Argument ```description``` should contain a description (eg name of protein, organism, etc). Argument ```filename``` should contain a file name. Your code should create the file with the name requested, write to  it the description as a FASTA header (i.e. starting with the character ```>```) and write the sequence to the file. Long sequences should be formatted over several lines. The function should not return any value.

In [26]:
# Your code here
def writeFASTA(sequence, description, filename):
    # sequence length
    L = len(sequence)
    # open a file with given filename as write mode
    with open(filename, "w") as f:
        # write description with '>' before it
        f.write(f">{description}\n")
        
        # format sequence as 80 bases per line
        for i in range(0, L, 80):
            if (i+80) < L:
                f.write(f"{sequence[i:i+80]}\n")
            else:
                f.write(f"{sequence[i:]}\n")


In [27]:
writeFASTA(s10, "dna seq", "seq.fasta")

In [28]:
# Test code
import os
import os.path
_rv=writeFASTA(sequence="TESTTESTTESTTESTTEST",
              description="test sequence",
              filename="test9876347.fas")
assert type(_rv) is type(None), "Function should not return anything; it returns %r" % _rv
_fe=os.path.isfile("test9876347.fas")
assert _fe, "Cannot find output file - has it been created?"
os.remove("test9876347.fas")
print("OK")

OK


### Question 2.e: Putting it all together


Write a function called ```maximalORF``` that takes as its argument string ```inputfile``` containing the name of an input file, string ```outputfile``` with the name of an output file and string  ```proteinname``` with a description of a candidate protein. The function should read a DNA sequence from the input file and write the candidate protein corresponding to the longest ORF to the output file, in FASTA format. The string supplied in ```proteinname``` should provide the header of the FASTA file. The function should not return any value.

In [29]:
# Your code here
def maximalORF(inputfile, outputfile, proteinname):
    # read a sequence from input filename
    seq = readDNAsequence(inputfile)
    # find out the protein corresponding to the longest orf
    protein = candidateProtein(seq)
    # write the protein sequence into an output file
    writeFASTA(protein, proteinname, outputfile)

In [30]:
maximalORF("example.fasta", "protein.fasta", "protein name")

In [31]:
# Test code
import os
import os.path
with open("test9876348.fas", "wt") as _OUTF:
    _OUTF.write("> test\n")
    _OUTF.write("ATGACTGCTGGGTAG\n")
_rv=maximalORF(inputfile="test9876348.fas", outputfile="test9876349.fas",
               proteinname="test protein")
assert type(_rv) is type(None), "Function should not return anything; it returns %r" % _rv
_fe=os.path.isfile("test9876349.fas")
assert _fe, "Cannot find output file - has it been created?"
os.remove("test9876348.fas")
os.remove("test9876349.fas")
print("OK")

OK
