# Setup

In [1]:
import os
import random
from collections import defaultdict

# Directory holding the input data files. Set the ORF_DATA_DIR environment variable to point the
# notebook at another location without editing it; the original path remains the default.
DIR = os.environ.get('ORF_DATA_DIR', r'c://downloads')

# Seed the global random number generator once, so that the cells relying on the `random` module
# produce the same numbers on every "Restart & Run All".
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

In [2]:
# Standard genetic code, keyed by RNA codon; '*' marks a stop codon.
# The table is spelled compactly: codons are enumerated with every position running over U, C, A, G
# (first position slowest, third position fastest), and _CODON_AMINO_ACIDS lists the matching
# one-letter amino-acid codes in that same order. Each group of four letters below covers the four
# codons sharing their first two bases.
_CODON_BASES = 'UCAG'
_CODON_AMINO_ACIDS = (
    'FFLL' 'SSSS' 'YY**' 'CC*W'  # first base U
    'LLLL' 'PPPP' 'HHQQ' 'RRRR'  # first base C
    'IIIM' 'TTTT' 'NNKK' 'SSRR'  # first base A
    'VVVV' 'AAAA' 'DDEE' 'GGGG'  # first base G
)

CODON_TABLE = {
    first + second + third: _CODON_AMINO_ACIDS[16 * i + 4 * j + k]
    for i, first in enumerate(_CODON_BASES)
    for j, second in enumerate(_CODON_BASES)
    for k, third in enumerate(_CODON_BASES)
}

# Q1

In [3]:
# Let us parse the file, like in the lab exercise.
# Lines starting with 'ORF Exon #' carry an exon sequence (the text after the first ': ') belonging to
# the most recently seen gene; every other non-blank line is taken as a gene header (its ':' characters
# are removed to obtain the gene symbol).

last_gene = None
orf_exons_per_gene = {}

# The context manager closes the file even if parsing fails half-way through.
with open(os.path.join(DIR, 'orf_exons_chr17.txt'), 'r') as f:

    for line in f:

        line = line.strip()

        if not line:
            # Blank lines (e.g. a trailing empty line at the end of the file) carry no data; without this
            # check they would be registered as a gene with an empty name.
            continue

        if line.startswith('ORF Exon #'):
            orf_exon_seq = line[(line.find(': ') + 2):]
            orf_exons_per_gene[last_gene].append(orf_exon_seq)
        else:
            last_gene = line.replace(':', '')
            orf_exons_per_gene[last_gene] = []

In [4]:
# We will use the data we parsed to reconstruct the coding DNA sequence (CDS) of each gene (i.e. the part of the gene's
# sequence that codes for protein, which corresponds to the mRNA transcript without the UTRs), and use that CDS to
# reconstruct the protein sequence.

def translate_seq(rna_seq, codon_table=None):
    """
    Translate an RNA sequence into a protein sequence, one codon at a time.

    The sequence is read in-frame from its first nucleotide, in consecutive non-overlapping triplets.
    Stop codons are translated like any other codon (to whatever letter the table assigns them, '*' in
    CODON_TABLE) and do not end the translation.

    Parameters:
        rna_seq: the RNA sequence (string), spelled with the same letters as the keys of the codon table.
        codon_table: optional mapping from codon to one-letter amino-acid code. When omitted, the
            module-level CODON_TABLE is used, so existing calls behave exactly as before.

    Returns:
        A string with one letter per codon of rna_seq (empty for an empty sequence).

    Raises:
        KeyError: if a triplet is missing from the table. This includes the shorter leftover at the end
            of a sequence whose length is not a multiple of 3.
    """
    if codon_table is None:
        codon_table = CODON_TABLE

    return ''.join(codon_table[rna_seq[i:(i + 3)]] for i in range(0, len(rna_seq), 3))

# Coding DNA sequence of each gene: its ORF exons concatenated in the order they were parsed.
dna_seq_per_gene = {
    gene: ''.join(exon_seqs)
    for gene, exon_seqs in orf_exons_per_gene.items()
}

# Protein sequence of each gene: the CDS rewritten as RNA (T -> U) and then translated.
protein_seq_per_gene = {
    gene: translate_seq(cds.replace('T', 'U'))
    for gene, cds in dna_seq_per_gene.items()
}

In [5]:
# List the chr17 genes whose coding DNA sequence contains the GCGCGCGCGC motif.

genes_with_dna_motif = [gene for gene, cds in dna_seq_per_gene.items() if 'GCGCGCGCGC' in cds]

for gene in genes_with_dna_motif:
    print(gene)

FZD2
GNA13
GRIN2C
ANKRD13B
SOST
UTS2R


In [6]:
# List the protein-coding chr17 genes whose protein sequence contains the RKRKRK motif.

genes_with_protein_motif = [gene for gene, protein in protein_seq_per_gene.items() if 'RKRKRK' in protein]

for gene in genes_with_protein_motif:
    print(gene)

EFCAB3


# Q2A

In [7]:
# Extract the three stop codons from the codon table (of course a quick Google search would also work).
# Stop codons are the entries translated to '*'.

stop_codons = {codon for codon, aa in CODON_TABLE.items() if aa == '*'}

# A set of strings has no stable iteration order between kernel runs, so sort it to keep the printed
# line reproducible.
print('Stop codons: %s' % ', '.join(sorted(stop_codons)))

Stop codons: UGA, UAG, UAA


In [8]:
# The four RNA nucleotide letters that random sequences are drawn from.
RNA_LETTERS = ['A', 'C', 'G', 'U']

def get_random_rna_nt():
    """Return a single RNA nucleotide letter picked at random from RNA_LETTERS."""
    nt = random.choice(RNA_LETTERS)
    return nt

def generate_random_codon():
    """Return a random codon: three independently drawn RNA nucleotides joined into one string."""
    return ''.join(get_random_rna_nt() for _ in range(3))

def generate_random_transcript():
    """
    Return a random RNA transcript as a string.

    Random codons are drawn one after the other until a stop codon (a member of the module-level
    `stop_codons`) comes up; the stop codon is the last codon of the returned sequence.
    """
    codons = [generate_random_codon()]

    # Keep extending the transcript for as long as its latest codon is not a stop codon.
    while codons[-1] not in stop_codons:
        codons.append(generate_random_codon())

    return ''.join(codons)

# Show one example transcript.
generate_random_transcript()

'CACUAUAGCCGGUCGUACCCUAAGGCAGAAGCAUAUCGGCGACGCUGGAUAGCCCAUGCGAAAGUACCCCUUCUAACUGUAAGGCUACUCGAAGAGUAA'

# Q2B

In [9]:
# Number of random transcripts to simulate. It is shared by the loop and by the printed summary, so
# the reported sample size cannot drift away from the one actually used.
N_RANDOM_TRANSCRIPTS = 1000

generated_transcript_lengths = [len(generate_random_transcript()) for _ in range(N_RANDOM_TRANSCRIPTS)]
avg_generated_transcript_length = sum(generated_transcript_lengths) / len(generated_transcript_lengths)

# format(..., ',') renders the count with a thousands separator (1000 -> '1,000').
print('Average transcript length of %s randomly generated RNA transcripts: %.2f nt.' % \
        (format(N_RANDOM_TRANSCRIPTS, ','), avg_generated_transcript_length))

Average trasncript length of 1,000 randomly generated RNA transcripts: 64.59 nt.


# Q2C

In [10]:
# Lengths of randomly generated transcripts, grouped by GC-content bin (0.1, 0.2, ..., 0.9).
transcript_lengths_per_gc_content = defaultdict(list)

for _ in range(100000):

    seq = generate_random_transcript()
    n_nt = len(seq)
    n_gc = sum(seq.count(nt) for nt in 'GC')

    # Rounding the GC fraction to 1 digit gives the bin it belongs to (e.g. ~30%).
    gc_bin = round(n_gc / n_nt, 1)

    # Transcripts falling outside the 0.1-0.9 bins are not collected.
    if gc_bin < 0.1 or gc_bin > 0.9:
        continue

    transcript_lengths_per_gc_content[gc_bin].append(n_nt)

# Report the number of transcripts and their average length per bin, in increasing GC-content order.
for gc_bin in sorted(transcript_lengths_per_gc_content):
    lengths = transcript_lengths_per_gc_content[gc_bin]
    print('Transcripts with a GC content of approximately %d%% [%d transcripts]: average length is %.2f nt.' % \
            (100 * gc_bin, len(lengths), sum(lengths) / len(lengths)))

Transcripts with a GC content of approximately 10% [225 transcripts]: average length is 10.47 nt.
Transcripts with a GC content of approximately 20% [2312 transcripts]: average length is 10.16 nt.
Transcripts with a GC content of approximately 30% [9423 transcripts]: average length is 11.94 nt.
Transcripts with a GC content of approximately 40% [18347 transcripts]: average length is 47.19 nt.
Transcripts with a GC content of approximately 50% [48973 transcripts]: average length is 88.14 nt.
Transcripts with a GC content of approximately 60% [16525 transcripts]: average length is 63.57 nt.
Transcripts with a GC content of approximately 70% [2250 transcripts]: average length is 20.49 nt.
Transcripts with a GC content of approximately 80% [173 transcripts]: average length is 14.45 nt.


We can see that average transcript length indeed varies, quite substantially, between GC-content groups.

The more obvious reason for a (random) transcript's length to depend on GC content is that stop codons are enriched with A/U compared to G/C (in the three stop codons, A/U occur 7 times, while G/C occur only 2 times - that is quite an imbalance). In short transcripts, the stop codon comprises a larger portion of all the transcript's nucleotides, so the transcript will end up more enriched with A/U (as the transcript's stop codon is enriched with A/U). In longer transcripts, the imbalance of the stop codon would be less substantial in the overall transcript's GC content, so this effect will not be so noticeable. To summarize this effect: the imbalance of GC content in stop codons is expected to cause transcripts with a higher GC content to be, on average, longer.

What we get in reality, however, is more complicated than that. Transcripts with low GC content are, as expected, very short (e.g. look at the transcripts with a GC content of ~10%). But transcripts with a very high GC content (e.g. ~90%) are also short. What's going on? Well, there's another force at play here. In the process of collecting transcripts with a given GC content, we generated transcripts in an unbiased way (letting each of the 4 nucleotides occur with 25% probability), and collected them into GC-content groups only post-hoc. In other words, the 90% group ended up containing whatever transcripts happened to have a GC content of ~90% (in probability theory this is called conditioning). In the case of GC contents that are very extreme (e.g. 10% or 90%), this creates a bias towards very short transcripts, because long transcripts are very unlikely to end up with an imbalance of 10% or 90% (for short transcripts this is much more plausible, due to the "Law of small numbers").

To summarize, we have two forces at play here. First, high GC content naturally leads to longer transcripts (due to the low GC content of stop codons). But more importantly, very extreme GC contents (e.g. 10% or 90%) bias the results towards shorter transcripts. We get that transcripts with very high GC content also end up quite short (but not as short as those with a very low GC content; e.g. note that although the 90% transcripts are short, the 10% are even shorter). Since the second phenomenon is more substantial, we end up with the longest transcripts being those with a balanced GC content (~50%).