In [1]:
import os
import gzip
import csv
from collections import defaultdict, Counter

from Bio import SeqIO
from Bio.SeqUtils import GC

DIR = r'c://downloads'

# Question 1

In [2]:
'''
We download the entire repertoire of human transcriptome from NCBI in the genebank format (the human.#.rna.gbff.gz files at
ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/). We iterate over these transcripts and use the "CDS" feature in order
to determine that the trasncript is truly protein coding, and to find its 5' UTR, CDS and 3' UTR regions. We further validate
that the obtained CDS sequence truly translates to the amino-acid sequence that appears in the record as "translation".
Additionally, we make sure that the Comment of the record starts with the word "REVIEWED", to make sure we only consider
reviewed records.
Running this code on the entire human transcriptome is expected to take a few hours...
'''

def get_features_of_type(features, type_name):
    return [feature for feature in features if feature.type == type_name]
    
# Mapping region names ('5_utr', 'cds' and '3_utr') to the observed relative lengths and GC contents of the transcripts.
relative_lengths = defaultdict(list)
gc_contents = defaultdict(list)

transcript_count = 0

for file_index in range(8):

    # From: ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/
    f = gzip.open(os.path.join(DIR, 'human.%d.rna.gbff.gz' % (file_index + 1)), 'rt')

    for record in SeqIO.parse(f, 'genbank'):
    
        transcript_count += 1
        
        if transcript_count % 100 == 0:
            print('Processed %d transcripts.' % transcript_count, end = '\r')

        cds_features = get_features_of_type(record.features, 'CDS')

        if len(cds_features) > 0 and record.annotations['comment'].lower().startswith('reviewed'):

            cds_feature, = cds_features
            cds_start, cds_end = int(cds_feature.location.start), int(cds_feature.location.end)
            translation, = cds_feature.qualifiers['translation']

            seqs = {
                '5_utr': record.seq[:cds_start],
                'cds': record.seq[cds_start:cds_end],
                '3_utr': record.seq[cds_end:],
            }

            cds_translation = seqs['cds'].translate()

            if not cds_translation.endswith('*') or translation != cds_translation[:-1]:
                print('Translations do not match for: ' + record.id)
                continue

            for region, seq in seqs.items():
                relative_lengths[region].append(len(seq) / len(record.seq))
                gc_contents[region].append(GC(seq))
                
    f.close()
            
print('*' * 50)

for region in relative_lengths:
    print(region)
    print('\t' + 'Avg. relative length: %d%%' % (100 * sum(relative_lengths[region]) / len(relative_lengths[region])))
    print('\t' + 'Avg. GC content: %d%%' % (sum(gc_contents[region]) / len(gc_contents[region])))

Translations do not match for: NM_001350809.1
Translations do not match for: NM_001317231.1
Translations do not match for: NM_001362.3
Translations do not match for: NM_182704.1
Translations do not match for: NM_021237.5
Translations do not match for: NM_001319071.2
Translations do not match for: NM_001316374.2
Translations do not match for: NM_206926.2
Translations do not match for: NM_001318069.2
Translations do not match for: NM_001166304.2
Translations do not match for: NM_001173513.2
Translations do not match for: NM_001017371.5
Translations do not match for: NM_182701.1
Translations do not match for: NM_016332.3
Translations do not match for: NM_001352301.1
Translations do not match for: NM_001352302.1
Translations do not match for: NM_006440.5
Translations do not match for: NM_001099456.3
Translations do not match for: NM_014293.4
Translations do not match for: NM_052883.2
Translations do not match for: NM_001352300.2
Translations do not match for: NM_199072.4
Translations do no



Translations do not match for: NM_001301020.1
Translations do not match for: NM_004152.3
Translations do not match for: NM_001301302.1
Translations do not match for: NM_001100163.2
Translations do not match for: NM_002537.3
Translations do not match for: NM_001357016.1
Translations do not match for: NM_003214.4
Translations do not match for: NM_001042559.2
Translations do not match for: NM_002083.4
Translations do not match for: NM_003009.4
Translations do not match for: NM_001172705.1
Translations do not match for: NM_001418.4
Translations do not match for: NM_080430.4
Translations do not match for: NM_175886.3
Translations do not match for: NM_182742.3
Translations do not match for: NM_182729.3
Translations do not match for: NM_003330.4
Translations do not match for: NM_182743.3
Translations do not match for: NM_001261446.2
Translations do not match for: NM_001261445.2
Translations do not match for: NM_017861.4
Translations do not match for: NM_031895.6
Translations do not match for:

# Question 2

In [3]:
'''
We download and extract the entire seqeunce of chromosome 11 from the FASTA file found at:
ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/chromosomes/chr11.fa.gz.
Then, we use GENCODE annotations to find all exon annotations on chromosome 11, and group them by transcript. The grouping by
transcript is very important, because we want only intronic regions (not intergenic).
For each transcript we then use the coordinates of its exons to find the coordinates of the introns, and we extract the relevant
sequence from the chr11 sequence to get the intron seqeunces. We then use these sequences to extract only the pair of letter
at the beginning and end of each intron.
Note that it is crucial to use the same version of the human reference genome (GRCh38) for both files.
'''

# ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/chromosomes/chr11.fa.gz
f = gzip.open(os.path.join(DIR, 'chr11.fa.gz'), 'rt')
chr11_record, = SeqIO.parse(f, 'fasta')
chr11_seq = chr11_record.seq
f.close()

# ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz
f = gzip.open(os.path.join(DIR, 'gencode.v29.annotation.gtf.gz'), 'rt')
csv_reader = csv.reader(f, delimiter = '\t')

for _ in range(5):
    next(csv_reader)

def parse_extra_fields(raw_extra_fields):

    extra_fields = {}

    for raw_extra_field in raw_extra_fields[:-1].split(';'):
        key, raw_value = raw_extra_field.strip().split(' ')
        value = raw_value.strip('"')
        extra_fields[key] = value
        
    return extra_fields
    
exons_per_transcript = defaultdict(list)
    
for a_chr, _, a_type, a_start, a_end, _, strand, _, raw_extra_fields in csv_reader:
    if a_chr == 'chr11' and a_type == 'exon':
        extra_fields = parse_extra_fields(raw_extra_fields)
        exons_per_transcript[extra_fields['transcript_id']].append((strand, int(a_start), int(a_end)))
    
f.close()

start_pairs = Counter()
end_pairs = Counter()

for transcript_exons in exons_per_transcript.values():
    
    strand, = {strand for strand, start, end in transcript_exons}
    
    if strand == '+':
        # On the positive strand, introns are between the end of the last exon (transcript_exons[i][2]) to the start of the next
        # one (transcript_exons[i + 1][1])
        introns = [(transcript_exons[i][2] + 1, transcript_exons[i + 1][1] - 1) for i in range(len(transcript_exons) - 1)]
    else:
        # On the negative strand, the exons appear in reverse order with respect to the chromosome coordinates (they are in
        # the correct order with respect to the gene itself). Therefore, each intron is found between the end of the next
        # exon (transcript_exons[i + 1][2]) to the start of the last one (transcript_exons[i][1]).
        introns = [(transcript_exons[i + 1][2] + 1, transcript_exons[i][1] - 1) for i in range(len(transcript_exons) - 1)]
        
    for start, end in introns:
        
        intron_seq = chr11_seq[(start - 1):end]
        
        if strand == '-':
            intron_seq = intron_seq.reverse_complement()
            
        start_pairs[str(intron_seq[:2]).upper()] += 1
        end_pairs[str(intron_seq[-2:]).upper()] += 1
        
print('Intron beginning pairs:')

for pair, count in start_pairs.most_common():
    print('\t' + '%s: %.2f%%' % (pair, 100 * count / sum(start_pairs.values())))

print('Intron end pairs:')

for pair, count in end_pairs.most_common():
    print('\t' + '%s: %.2f%%' % (pair, 100 * count / sum(end_pairs.values())))

Intron beginning pairs:
	GT: 98.16%
	GC: 1.39%
	AT: 0.11%
	TG: 0.05%
	AG: 0.04%
	GG: 0.03%
	CT: 0.03%
	GA: 0.03%
	AA: 0.03%
	TA: 0.03%
	TT: 0.02%
	CA: 0.02%
	CC: 0.02%
	AC: 0.02%
	T: 0.01%
	TC: 0.01%
	C: 0.01%
	A: 0.01%
	CG: 0.00%
	G: 0.00%
Intron end pairs:
	AG: 99.46%
	AC: 0.10%
	AA: 0.06%
	GG: 0.05%
	CC: 0.05%
	TG: 0.05%
	CT: 0.03%
	AT: 0.03%
	TT: 0.03%
	GA: 0.02%
	TC: 0.02%
	GC: 0.02%
	TA: 0.02%
	GT: 0.02%
	CA: 0.02%
	CG: 0.01%
	T: 0.01%
	C: 0.01%
	A: 0.01%
	G: 0.00%


The major spliceosome (constituting ~99% of splicing) splices introns containing GU at the 5' splice site and AG at the 3' splice site (these are known as the "canonical splice sites"; see [RNA splicing](https://en.wikipedia.org/wiki/RNA_splicing)). Our results, then, appear to be in line with lieterature.