In [3]:
from collections import Counter
import random

In [4]:
#Gene stuff
DNA_Nucleotides = ['A', 'C', 'G', 'T']
DNA_ReverseComplement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
DNA_Codons = {
    # 'M' - START, '_' - STOP
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "TGT": "C", "TGC": "C",
    "GAT": "D", "GAC": "D",
    "GAA": "E", "GAG": "E",
    "TTT": "F", "TTC": "F",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    "CAT": "H", "CAC": "H",
    "ATA": "I", "ATT": "I", "ATC": "I",
    "AAA": "K", "AAG": "K",
    "TTA": "L", "TTG": "L", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATG": "M",
    "AAT": "N", "AAC": "N",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R", "AGG": "R",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S", "AGC": "S",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "TGG": "W",
    "TAT": "Y", "TAC": "Y",
    "TAA": "_", "TAG": "_", "TGA": "_"
}


In [5]:
# Creating a random DNA sequence for testing:
randDNAStr = ''.join([random.choice(DNA_Nucleotides)
                      for nuc in range(50)])
print(randDNAStr)

CAGTCCCCTGCTAAAGGAAACGGGTATAGTCACTCTGGAGTATCCGTTGC


In [6]:
def reverse_complement(dnastr):
    """
    Swapping adenine with thymine and guanine with cytosine.
    Reversing newly generated string
    """
    return ''.join([DNA_ReverseComplement[nuc] for nuc in dnastr])[::-1]


def nucleotide_frequency(dnastr):
    """Count nucleotides in a given sequence. Return a dictionary"""
    return dict(Counter(dnastr))

def transcription(dnastr):
    """Generating RNA with given DNA sequence by replacing T's with U's"""
    return dnastr.replace('T','U')

def gccontent(dnastr):
    """GC Content in a DNA/RNA sequence"""
    nf = nucleotide_frequency(dnastr)
    gcc = (nf['C']+nf['G'])/len(dnastr) 
    return gcc

def gccontent_subseq(dnastr, k=20):
    """GC Content in a DNA/RNA sub-sequence length k. k=20 by default"""
    f = []
    for i in range(0,len(dnastr),k):
        d = dnastr[i:i+k]
        print(d)
        f.append(gccontent(d))
    
    return f 

def translate_seq(dnastr, init_pos=0):
    """Translates a DNA sequence into an aminoacid sequence using the Codons table"""
    #-2 because 50 is not divisible by 3 but 48 is
    cod = []
    for i in range(init_pos,len(dnastr)-2,3):
       cod.append(DNA_Codons[dnastr[i:i+3]])

    return cod 


def codon_usage(dnastr, aminoacid):
    """Provides the frequency of each codon encoding a given aminoacid in a DNA sequence"""
    tmpList = []
    for i in range(0, len(dnastr) - 2, 3):
        if DNA_Codons[dnastr[i:i + 3]] == aminoacid:
            tmpList.append(dnastr[i:i + 3])

    freqDict = dict(Counter(tmpList))
    totalWight = sum(freqDict.values())

    for seq in freqDict:
        freqDict[seq] = round(freqDict[seq] / totalWight, 2)

    return freqDict

def gen_reading_frames(dnastr):
    """Generate the six reading frames of a DNA sequence, including reverse complement"""
    frames = []
    frames.append(translate_seq(dnastr,0))
    frames.append(translate_seq(dnastr,1))
    frames.append(translate_seq(dnastr,2))
    #reverse DNA for other part 
    frames.append(translate_seq(reverse_complement(dnastr), 0))
    frames.append(translate_seq(reverse_complement(dnastr), 1))
    frames.append(translate_seq(reverse_complement(dnastr), 2))
    
    return frames

for i in gen_reading_frames(randDNAStr):
    print(i)

print("ammi",translate_seq(randDNAStr))

['Q', 'S', 'P', 'A', 'K', 'G', 'N', 'G', 'Y', 'S', 'H', 'S', 'G', 'V', 'S', 'V']
['S', 'P', 'L', 'L', 'K', 'E', 'T', 'G', 'I', 'V', 'T', 'L', 'E', 'Y', 'P', 'L']
['V', 'P', 'C', '_', 'R', 'K', 'R', 'V', '_', 'S', 'L', 'W', 'S', 'I', 'R', 'C']
['A', 'T', 'D', 'T', 'P', 'E', '_', 'L', 'Y', 'P', 'F', 'P', 'L', 'A', 'G', 'D']
['Q', 'R', 'I', 'L', 'Q', 'S', 'D', 'Y', 'T', 'R', 'F', 'L', '_', 'Q', 'G', 'T']
['N', 'G', 'Y', 'S', 'R', 'V', 'T', 'I', 'P', 'V', 'S', 'F', 'S', 'R', 'G', 'L']
ammi ['Q', 'S', 'P', 'A', 'K', 'G', 'N', 'G', 'Y', 'S', 'H', 'S', 'G', 'V', 'S', 'V']


In [5]:
def proteins_from_rf(dnastr):
    """Compute all possible proteins in an aminoacid seq and return a list of possible proteins"""
    ff = []
    for i in dnastr:
        print(i)
        try:
            sta = i.index('M')
            print("sta",sta)
        except:
            print("No m")
        try:
            sto = i.index('_')
            print("sto",sto)
        except:
            print('no _')
        print(''.join(i[sta:sto]))
        ff.append(''.join(i[sta:sto]))
    return(ff)


test = [
    ['L', 'M', 'T', 'A', 'L', 'V', 'V','L', 'L', 'R', 'R', 'G', 'S', '_', 'G', 'H'],
    ['L', 'M', 'T', 'A', 'L', 'V', 'V','L', 'L', 'R', 'R', 'G', 'S', '_', 'G', 'H'],
    ['L', 'M', 'T', 'A', 'L', 'V', 'V','L', 'L', 'R', 'R', 'G', 'S', '_', 'G', 'H']
                 ]

print(proteins_from_rf(test))
# test_rf_frame = [['L', 'M', 'T', 'A', 'L', 'V', 'V',
#                  'L', 'L', 'R', 'R', 'G', 'S', '_', 'G', 'H']]

# print(proteins_from_rf(test_rf_frame))

['L', 'M', 'T', 'A', 'L', 'V', 'V', 'L', 'L', 'R', 'R', 'G', 'S', '_', 'G', 'H']
sta 1
sto 13
MTALVVLLRRGS
['L', 'M', 'T', 'A', 'L', 'V', 'V', 'L', 'L', 'R', 'R', 'G', 'S', '_', 'G', 'H']
sta 1
sto 13
MTALVVLLRRGS
['L', 'M', 'T', 'A', 'L', 'V', 'V', 'L', 'L', 'R', 'R', 'G', 'S', '_', 'G', 'H']
sta 1
sto 13
MTALVVLLRRGS
['MTALVVLLRRGS', 'MTALVVLLRRGS', 'MTALVVLLRRGS']
