In [None]:
import random
import collections

In [None]:
# Structures
# Alphabets for DNA and RNA, followed by the DNA base-pairing table
# (each base maps to its complement).
Nucleotides = list("ACGT")
RNA_Nucleotides = list("ACGU")
DNA_ReverseComplement = dict(zip("ATGC", "TACG"))

In [None]:
# Check the sequence to see if it is a DNA string
def ValidateSeq(dna_seq):
    """Upper-case ``dna_seq`` and confirm every symbol is a DNA nucleotide.

    Returns the upper-cased sequence when all symbols appear in
    ``Nucleotides``; returns ``False`` as soon as an unknown symbol is found.
    """
    upper_seq = dna_seq.upper()
    if any(base not in Nucleotides for base in upper_seq):
        return False
    return upper_seq

In [None]:
# DNAStr = ''.join([random.choice(Nucleotides) for nuc in range(50)])
# print(ValidateSeq(DNAStr))

# Counting Nucleotides

In [None]:
def countNucFrequency(seq):
    """Count how often each of A, C, G and T occurs in ``seq``.

    Returns a dict with all four bases as keys (zero when absent), or
    ``False`` if ``seq`` contains any other symbol.
    """
    counts = dict.fromkeys("ACGT", 0)
    for base in seq:
        if base not in counts:
            # Unknown symbol: signal failure instead of a partial count.
            return False
        counts[base] += 1
    return counts
#     return(dict(collections.Counter(seq)))

In [None]:
# print(countNucFrequency(DNAStr))

# Transcription

In [None]:
def transcription(seq):
    """Transcribe DNA to RNA by swapping every 'T' for 'U'."""
    rna_seq = seq.replace("T", "U")
    return rna_seq

In [None]:
# print(transcription(DNAStr))

# Reverse Complement

In [None]:
def Reverse_Complement(seq):
    """Return the reverse complement of a DNA sequence.

    Each of A/T/C/G is swapped for its pairing base and the result is
    reversed. Any other symbol is passed through unchanged.
    """
    # Code-point -> replacement table understood by str.translate.
    complement_table = {
        ord("A"): "T",
        ord("T"): "A",
        ord("C"): "G",
        ord("G"): "C",
    }
    complemented = seq.translate(complement_table)
    return "".join(reversed(complemented))

In [None]:
# print(f"\nDNA String + Complement + Reverse Complement:\n\n5' {DNAStr} 3' DNA String")
# print(f"   {''.join(['|' for c in range(len(DNAStr))])}")
# print(f"3' {Reverse_Complement(DNAStr)[::-1]} 5' Complement")
# print(f"5' {Reverse_Complement(DNAStr)} 3' Reverse Complement\n")

In [None]:
# Utilities!!

# Coloring!
def colored(seq):
    """Wrap each nucleotide of ``seq`` in an ANSI colour escape code.

    A, C, G and T/U each get their own colour code; any other symbol is
    preceded by the reset code. The returned string always ends with the
    reset code so following terminal output is not coloured.
    """
    reset_code = '\033[0;0m'
    palette = {
        "A": '\033[92m',
        "C": '\033[94m',
        "G": '\033[93m',
        "T": '\033[91m',
        "U": '\033[91m',
        "reset": reset_code,
    }

    # Prefix every symbol with its colour, falling back to the reset code.
    pieces = [palette.get(symbol, reset_code) + symbol for symbol in seq]
    return "".join(pieces) + reset_code

In [None]:
# print(colored(DNAStr))
# print(f"\nDNA String + Complement + Reverse Complement:\n\n5' {colored(DNAStr)} 3' DNA String")
# print(f"   {''.join(['|' for c in range(len(DNAStr))])}")
# print(f"3' {colored(Reverse_Complement(DNAStr)[::-1])} 5' Complement")
# print(f"5' {colored(Reverse_Complement(DNAStr))} 3' Reverse Complement\n")

# GC-content Calculations

In [None]:
def gc_content(seq):
    """Return the GC percentage of a DNA/RNA sequence, rounded to an integer.

    Args:
        seq: Sequence string. Only upper-case 'G' and 'C' are counted.

    Returns:
        The share of G and C symbols as a whole-number percentage.
        An empty sequence yields 0 instead of raising ZeroDivisionError.
    """
    if len(seq) == 0:
        # No bases at all: define the GC content as 0 rather than dividing by zero.
        return 0
    gc_count = seq.count('C') + seq.count('G')
    return round(gc_count / len(seq) * 100)

In [None]:
# print(f'GC Content: {gc_content(DNAStr)}%')

In [None]:
def gc_content_subsec(seq, k=20):
    """GC content of consecutive, non-overlapping windows of length ``k``.

    Windows start at 0, k, 2k, ... and only full-length windows are
    evaluated; a trailing remainder shorter than ``k`` is skipped.
    Returns a list with one ``gc_content`` value per window.
    """
    window_starts = range(0, len(seq) - k + 1, k)
    return [gc_content(seq[start:start + k]) for start in window_starts]

In [None]:
# print(f'GC Content: {gc_content_subsec(DNAStr, k = 10)}%')

# DNA codons

In [None]:
# DNA codon -> one-letter amino acid code; '_' marks a stop codon.
DNA_Codons = {
    # Serine
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'AGC': 'S', 'AGT': 'S',
    # Phenylalanine
    'TTC': 'F', 'TTT': 'F',
    # Tyrosine
    'TAC': 'Y', 'TAT': 'Y',
    # Stop
    'TAA': '_', 'TAG': '_', 'TGA': '_',
    # Cysteine
    'TGC': 'C', 'TGT': 'C',
    # Tryptophan
    'TGG': 'W',
    # Leucine
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'TTA': 'L', 'TTG': 'L',
    # Proline
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    # Histidine
    'CAC': 'H', 'CAT': 'H',
    # Glutamine
    'CAA': 'Q', 'CAG': 'Q',
    # Arginine
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'AGA': 'R', 'AGG': 'R',
    # Isoleucine
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I',
    # Methionine
    'ATG': 'M',
    # Threonine
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    # Asparagine
    'AAC': 'N', 'AAT': 'N',
    # Lysine
    'AAA': 'K', 'AAG': 'K',
    # Valine
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    # Alanine
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    # Aspartic acid
    'GAC': 'D', 'GAT': 'D',
    # Glutamic acid
    'GAA': 'E', 'GAG': 'E',
    # Glycine
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
}

In [None]:
# RNA codon -> one-letter amino acid code; '_' marks a stop codon
# (same convention as DNA_Codons). Covers all 64 codons.
RNA_Codons = {
'GCU':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
'UGU':'C', 'UGC':'C',
'GAU':'D', 'GAC':'D',
'GAA':'E', 'GAG':'E',  # glutamic acid (was missing from the table)
'UUU':'F', 'UUC':'F',
'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
'CAU':'H', 'CAC':'H',
'AUU':'I', 'AUC':'I', 'AUA':'I',
'AAA':'K', 'AAG':'K',
'UUA':'L', 'UUG':'L', 'CUU':'L', 'CUC':'L', 'CUA':'L', 'CUG':'L',
'AUG':'M',
'AAU':'N', 'AAC':'N',
'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
'CAA':'Q', 'CAG':'Q',
'CGU':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
'UCU':'S', 'UCC':'S', 'UCA':'S', 'UCG':'S', 'AGU':'S', 'AGC':'S',
'ACU':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
'GUU':'V', 'GUC':'V', 'GUA':'V', 'GUG':'V',
'UGG':'W',
'UAU':'Y', 'UAC':'Y',
'UAA':'_', 'UAG':'_', 'UGA':'_'}  # stop codons (were missing from the table)

# Translate DNA into amino acids

In [None]:
def translate_seq(seq, init_pos = 0):
    """Translate a DNA sequence into a list of one-letter amino acid codes.

    Reading starts at ``init_pos`` and advances one full codon (3 bases)
    at a time; leftover bases that do not form a full codon are ignored.
    Each codon is looked up in ``DNA_Codons``.
    """
    amino_acids = []
    for start in range(init_pos, len(seq) - 2, 3):
        codon = seq[start:start + 3]
        amino_acids.append(DNA_Codons[codon])
    return amino_acids

In [None]:
# print(f'Aminoacid sequence from DNA: {translate_seq(DNAStr)}')

In [None]:
def codon_usage(seq, aminoacid):
    """Relative frequency of each codon that encodes ``aminoacid`` in ``seq``.

    The sequence is read in frame 0, codon by codon. Returns a dict that
    maps every codon translating to ``aminoacid`` to its share among those
    codons, rounded to two decimals. The dict is empty if none occur.
    """
    codons = (seq[start:start + 3] for start in range(0, len(seq) - 2, 3))
    matches = collections.Counter(
        codon for codon in codons if DNA_Codons[codon] == aminoacid
    )
    total = sum(matches.values())
    return {codon: round(count / total, 2) for codon, count in matches.items()}

In [None]:
# print(f' Codon frequency (T): {codon_usage(DNAStr, "T")} ')


# Ribosomes entering!


In [None]:
def open_reading_frames(seq):
    """Generate the six reading frames of a DNA sequence.

    Returns a list of six amino acid lists: the three frame offsets
    (0, 1, 2) of ``seq``, followed by the same three offsets of its
    reverse complement.
    """
    reverse_strand = Reverse_Complement(seq)
    return [
        translate_seq(strand, offset)
        for strand in (seq, reverse_strand)
        for offset in (0, 1, 2)
    ]

In [None]:
# print('Reading frames:')
# frames = open_reading_frames(DNAStr)
# print(frames)

In [None]:
# Read the amino acids between a start codon (M) and a stop codon (_):
def proteins_rf(aa_seq):
    """Collect every candidate protein in an amino acid sequence.

    A candidate starts at each 'M' and runs up to the next stop symbol
    '_'. Overlapping candidates (an 'M' inside an open candidate) are all
    reported. Candidates still open at the end of the sequence are dropped.
    Returns the list of protein strings.
    """
    finished = []
    open_chains = []
    for residue in aa_seq:
        if residue == "_":
            # Stop: every open chain is complete; start over.
            finished.extend(open_chains)
            open_chains = []
            continue
        if residue == "M":
            # Start: open a new chain alongside any already open.
            open_chains.append("")
        open_chains = [chain + residue for chain in open_chains]
    return finished

In [None]:
# for p in frames:
# print(proteins_rf(['D', 'S', 'R', 'M', 'I', 'R', 'S', 'D', 'M', 'P', 'Q', 'S', 'Q', 'W', '_', 'V']))

In [None]:
# def all_proteins_from_orfs(seq, startReadPos = 0, endReadPos = 0, ordered = False):
#     """Compute all possible proteins from all open reading frames"""
#     if endReadPos > startReadPos:
#         rfs = open_reading_frames(seq[startReadPos:endReadPos])
#     else:
#         rfs = open_reading_frames(seq)
    
#     res = []
#     for rf in rfs:
#         prots = proteins_rf(rf)
#         for p in prots:
#             res.append(p)
    
#     if ordered:
#         return sorted(seq, key = len, reverse=True)
#     return res

# Finding proteins in DNA Sequence

In [None]:
# Protein search database (to extract insulin): https://www.ncbi.nlm.nih.gov/nuccore/NM_001185097.2
def all_proteins_from_orfs(seq, startReadPos=0, endReadPos=0, ordered=False):
    """Compute all possible proteins for all open reading frames.

    Protein search DB (reference record):
    https://www.ncbi.nlm.nih.gov/nuccore/NM_001185097.2
    An API can be used to pull protein info.

    Args:
        seq: DNA sequence string.
        startReadPos: Start index of the region to scan.
        endReadPos: End index (exclusive) of the region to scan. The slice
            ``seq[startReadPos:endReadPos]`` is used only when
            ``endReadPos > startReadPos``; otherwise the whole sequence
            is scanned.
        ordered: When True, return the proteins sorted longest first.

    Returns:
        A list of protein strings gathered by running ``proteins_rf`` on
        every frame from ``open_reading_frames``, in frame order unless
        ``ordered`` is True.
    """
    if endReadPos > startReadPos:
        region = seq[startReadPos:endReadPos]
    else:
        region = seq

    proteins = [
        protein
        for frame in open_reading_frames(region)
        for protein in proteins_rf(frame)
    ]

    if ordered:
        return sorted(proteins, key=len, reverse=True)
    return proteins


In [None]:
# print('All proteins in 6 open reading frames:')
# for prot in all_proteins_from_orfs(DNAStr, 0, 0, True):
#     print(prot)