In [1]:
import os

# Directory that holds every downloaded input file used by this notebook.
# Set the BIO_DATA_DIR environment variable to point somewhere else; when it
# is unset, the original hard-coded location is used unchanged.
DIR = os.environ.get('BIO_DATA_DIR', r'c://downloads')

In [2]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# From: https://www.ncbi.nlm.nih.gov/nuccore/NM_000518 (download as FASTA)
# The trailing-comma unpacking fails loudly unless the file holds exactly one record.
hbb_record, = SeqIO.parse(os.path.join(DIR, 'sequence.fasta'), 'fasta')
hbb_seq = hbb_record.seq

# 0-based, half-open bounds of the region of the transcript that is translated.
# NOTE(review): presumably the CDS annotated for NM_000518 - confirm against the GenBank record.
HBB_CDS_START = 50
HBB_CDS_END = 494

# Translate the region and drop the final residue of the translation.
# NOTE(review): the dropped residue is presumably the '*' emitted for the stop codon - confirm.
hbb_coding_seq = hbb_seq[HBB_CDS_START:HBB_CDS_END]
hbb_translated_seq = hbb_coding_seq.translate()[:-1]

# Re-use the id and description of the nucleotide record for the protein record.
hbb_protein_record = SeqRecord(
    hbb_translated_seq,
    id = hbb_record.id,
    description = hbb_record.description,
)

# Kept as the last expression of the cell so its return value is displayed.
SeqIO.write(hbb_protein_record, os.path.join(DIR, 'translation.fasta'), 'fasta')

1

In [3]:
import gzip

# Scan every RefSeq RNA file and remember the single longest record seen.
longest_record = None

# Downloaded the 8 files from: ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/
# They are named human.1.rna.fna.gz ... human.8.rna.fna.gz.
for file_number in range(1, 9):

    rna_path = os.path.join(DIR, 'human.%d.rna.fna.gz' % file_number)

    # The context manager closes the gzip stream even if parsing raises part-way
    # through a file; a trailing f.close() would be skipped in that case.
    with gzip.open(rna_path, 'rt') as f:
        for record in SeqIO.parse(f, 'fasta'):
            # Strict '>' keeps the first record encountered when lengths tie.
            if longest_record is None or len(record.seq) > len(longest_record.seq):
                longest_record = record

print('The longest record is %s [length: %d nt]' % (longest_record.id, len(longest_record.seq)))

The longest record is NM_001267550.2 [length: 109224 nt]


In [4]:
from Bio.SeqUtils import GC
# GC content of the longest record's sequence, as computed by Bio.SeqUtils.GC.
longest_record_gc = GC(longest_record.seq)
print(longest_record_gc)

44.07364681754925


In [5]:
# From: https://www.ncbi.nlm.nih.gov/nuccore/NM_001267550
# The trailing-comma unpacking fails loudly unless the GenBank file holds exactly one record.
record, = SeqIO.parse(os.path.join(DIR, 'NM_001267550.gb'), 'genbank')

# Keep only the features annotated as exons and reduce each to integer (start, end) bounds.
exon_features = [feature for feature in record.features if feature.type == 'exon']
exons = [(int(exon.location.start), int(exon.location.end)) for exon in exon_features]
print('There are %d exons.' % len(exons))

# Slice each exon out of the record's sequence and report the highest GC value among them.
exon_seqs = [record.seq[exon_start:exon_end] for exon_start, exon_end in exons]
max_exon_gc = max(GC(exon_seq) for exon_seq in exon_seqs)
print('Max GC content: %.2f' % max_exon_gc)

There are 363 exons.
Max GC content: 59.52


In [6]:
from collections import Counter

# The unpacking below requires the record to carry exactly one CDS feature,
# and that feature to carry exactly one 'translation' qualifier value.
cds_features = [feature for feature in record.features if feature.type == 'CDS']
cds_feature, = cds_features
aa_seq, = cds_feature.qualifiers['translation']

# Count every amino-acid letter, then normalise by the protein length.
aa_count = Counter(aa_seq)
protein_length = len(aa_seq)
aa_freq = {aa: aa_count[aa] / protein_length for aa in aa_count}
print(aa_freq)

{'M': 0.011336167375177128, 'T': 0.07257369898030062, 'Q': 0.027367953099385958, 'A': 0.06143202467283487, 'P': 0.0797699424856214, 'F': 0.025756439109777443, 'L': 0.06029285099052541, 'S': 0.07015642799588787, 'V': 0.09538495735044872, 'E': 0.09546831152232503, 'G': 0.05798671890194771, 'H': 0.013670084187713595, 'I': 0.05907032313633964, 'W': 0.013031035536661944, 'R': 0.04623378066738907, 'D': 0.04828985024033786, 'K': 0.08852213053263316, 'N': 0.031118890833819565, 'Y': 0.028034786474396375, 'C': 0.014503625906476619}


In [7]:
# URL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/chr11.fa.gz
# The context manager closes the gzip stream even if parsing or the
# single-record unpacking raises; a trailing f.close() would be skipped then.
with gzip.open(os.path.join(DIR, 'chr11.fa.gz'), 'rt') as f:
    # The trailing-comma unpacking consumes the parser inside the with-block and
    # fails loudly unless the file holds exactly one record.
    chr11_record, = SeqIO.parse(f, 'fasta')

chr11_seq = chr11_record.seq

In [8]:
# (start, end) positions of the HBB exons on chr11_seq. The slicing below treats them as
# 1-based and inclusive; each piece is reverse-complemented and the pieces are joined in
# the order listed here.
HBB_EXONS = [
    (5248160, 5248301),
    (5247807, 5248029),
    (5246694, 5246956),
]

hbb_exon_seqs = []
for exon_start, exon_end in HBB_EXONS:
    # Shift the 1-based start to a 0-based slice start; the inclusive end needs no shift.
    exon_on_chromosome = chr11_seq[(exon_start - 1):exon_end]
    hbb_exon_seqs.append(str(exon_on_chromosome.reverse_complement()))
hbb_recovered_seq = ''.join(hbb_exon_seqs)

# Show the reference transcript and the sequence rebuilt from the chromosome, then compare them.
separator = '*' * 50
print(hbb_seq)
print(separator)
print(hbb_recovered_seq)
print(separator)
print(hbb_seq == hbb_recovered_seq)

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA
**************************************************
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGT

In [9]:
'''
Repeating the same analysis, but without loading the whole chromosome into memory.
The downside of this is that we can no longer store the sequence as a compressed file; we will have to uncompress
chr11.fa.gz into chr11.fa.
'''

from Bio.Seq import Seq
from Bio.Alphabet import Alphabet

# Open the uncompressed FASTA file and measure its layout from the first two lines.
chr11_fasta_path = os.path.join(DIR, 'chr11.fa')
f = open(chr11_fasta_path, 'r')

# First line: the FASTA header; its full length (line terminator included) is kept.
header_line = f.readline()
# Second line: the first line of sequence; one character is subtracted for its terminator.
first_sequence_line = f.readline()

header_len = len(header_line)
line_len = len(first_sequence_line) - 1
# NOTE(review): the offsets later derived from these lengths assume one character per byte
# and that every sequence line has the same width as the first - confirm for this file.

def convert_to_absolute_coordinate(position):
    """
    Map a 1-based sequence position to its character offset inside the FASTA file.

    Relies on the module-level `header_len` (length of the header line, terminator
    included) and `line_len` (number of sequence characters per line).
    """
    offset_in_sequence = position - 1
    # Every completed line of 'line_len' characters contributes one extra \n to skip.
    newlines_before = offset_in_sequence // line_len
    return header_len + offset_in_sequence + newlines_before

def read_seq(start, end):
    """
    Read part of the sequence straight from the open FASTA file `f`.

    start, end: 1-based positions in the sequence; both ends are included.
    Returns a Seq holding the characters from `start` through `end`, with the
    line breaks that fall inside the range removed.

    Relies on the module-level file object `f` and on
    convert_to_absolute_coordinate() to translate positions into file offsets.
    """
    # `f` is only read here, never rebound, so no `global` declaration is needed.
    absolute_start = convert_to_absolute_coordinate(start)
    # +1 because the end position itself is part of the requested range.
    absolute_length = convert_to_absolute_coordinate(end) - absolute_start + 1

    f.seek(absolute_start)
    str_seq = f.read(absolute_length).replace('\n', '')

    # No alphabet argument: the generic alphabet was already Seq's default, and the
    # Bio.Alphabet module was removed in Biopython 1.78, so passing one would tie
    # this function to older releases.
    return Seq(str_seq)
    
# Rebuild the HBB transcript by reading only the exon ranges from the FASTA file on disk.
# try/finally guarantees the file handle is released even if a read or a print raises;
# a plain trailing f.close() would be skipped in that case.
try:
    hbb_exon_seqs = [str(read_seq(start, end).reverse_complement()) for start, end in HBB_EXONS]
    hbb_recovered_seq = ''.join(hbb_exon_seqs)

    # Show the reference transcript and the rebuilt sequence, then compare them.
    print(hbb_seq)
    print('*' * 50)
    print(hbb_recovered_seq)
    print('*' * 50)
    print(hbb_seq == hbb_recovered_seq)
finally:
    f.close()

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA
**************************************************
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGT

In [10]:
# Each intron fills the gap between two consecutive entries of HBB_EXONS. The exons are
# listed with descending coordinates, so an intron runs from one past the end of the
# later-listed exon to one before the start of the earlier-listed exon.
hbb_introns = [
    (next_exon_end + 1, prev_exon_start - 1)
    for (prev_exon_start, _), (_, next_exon_end) in zip(HBB_EXONS, HBB_EXONS[1:])
]

# Lengths use +1 because both coordinates of every range are inclusive.
exon_lengths = [end - start + 1 for start, end in HBB_EXONS]
intron_lengths = [end - start + 1 for start, end in hbb_introns]
print('Exon lengths: ' + str(exon_lengths))
print('Intron lengths: ' + str(intron_lengths))

# Same extraction as for the exons: 1-based inclusive range, then reverse complement.
hbb_intron_seqs = [str(chr11_seq[(start - 1):end].reverse_complement()) for start, end in hbb_introns]

# The exon figure is computed on hbb_seq; the intron figure on all introns joined together.
exon_gc = GC(hbb_seq)
intron_gc = GC(''.join(hbb_intron_seqs))
print('Exon GC content: %.2f' % exon_gc)
print('Intron GC content: %.2f' % intron_gc)

Exon lengths: [142, 223, 263]
Intron lengths: [130, 850]
Exon GC content: 51.27
Intron GC content: 32.96
