In [1]:
import os

# Directory that holds the input file read below and receives the output file
# written later in the notebook. Hardcoded local path: adjust for your machine.
DIR = r'c://downloads'

In [2]:
# Parse the ORF-exon listing into a mapping: gene symbol -> list of ORF exon sequences.
# The loop treats every line starting with 'ORF Exon #' as an exon of the most
# recently seen gene, and every other non-blank line as a new gene header.
last_gene = None
orf_exons_per_gene = {}

# The context manager closes the file even if a line fails to parse.
with open(os.path.join(DIR, 'orf_exons_chr17.txt'), 'r') as f:
    for line in f:

        line = line.strip()

        if not line:
            # Blank lines (e.g. a trailing empty line) would otherwise be
            # registered as a gene named '' and skew the gene statistics.
            continue

        if line.startswith('ORF Exon #'):
            # Everything after the first ': ' is the exon sequence.
            # An exon line that precedes any gene header raises KeyError,
            # because last_gene is still None at that point.
            orf_exon_seq = line[(line.find(': ') + 2):]
            orf_exons_per_gene[last_gene].append(orf_exon_seq)
        else:
            # Gene header: drop ':' characters to obtain the gene symbol.
            # A repeated header resets that gene's exon list.
            last_gene = line.replace(':', '')
            orf_exons_per_gene[last_gene] = []

In [3]:
# Summary statistics: gene count, total ORF exons, and mean ORF exons per gene.
print('There are %d genes.' % len(orf_exons_per_gene))

# Gene symbol -> number of ORF exons recorded for that gene.
n_orf_exons_per_gene = {
    symbol: len(exons) for symbol, exons in orf_exons_per_gene.items()
}

total_orf_exons = sum(n_orf_exons_per_gene.values())

print('There are %d ORF exons in total.' % total_orf_exons)
print('There are %.2f ORF exons per gene on average.' % (total_orf_exons / len(n_orf_exons_per_gene)))

There are 1026 genes.
There are 10412 ORF exons in total.
There are 10.15 ORF exons per gene on average.


In [4]:
from collections import Counter

# Histogram: for each ORF-exon count, how many genes have exactly that many exons.
print('Number of genes per number of ORF exons:')

genes_per_exon_count = Counter(n_orf_exons_per_gene.values())

# Iterate in ascending order of exon count.
for number in sorted(genes_per_exon_count):
    print('%d: %d' % (number, genes_per_exon_count[number]))

Number of genes per number of ORF exons:
1: 97
2: 64
3: 57
4: 85
5: 79
6: 71
7: 58
8: 67
9: 46
10: 33
11: 44
12: 46
13: 28
14: 29
15: 27
16: 11
17: 19
18: 17
19: 23
20: 15
21: 7
22: 13
23: 9
24: 8
25: 9
26: 5
27: 5
28: 2
29: 3
30: 7
31: 4
32: 5
34: 2
35: 1
36: 1
37: 2
38: 8
39: 5
40: 2
41: 1
42: 2
45: 2
51: 1
55: 1
58: 1
65: 1
67: 1
69: 1
85: 1


In [5]:
def get_value(pair):
    """Return the second element of ``pair`` (the value of a (key, value) item).

    Used as a sort key so that dictionary items are ordered by their value.
    """
    value = pair[1]
    return value

# Show the five genes with the most ORF exons, in ascending order of exon count.
print('Genes with the highest number of ORF exons:')

genes_by_exon_count = list(n_orf_exons_per_gene.items())
genes_by_exon_count.sort(key = get_value)  # stable sort, ascending by count
print(genes_by_exon_count[-5:])

Genes with the highest number of ORF exons:
[('NF1', 58), ('MYO15A', 65), ('RNF213', 67), ('DNAH9', 69), ('DNAH2', 85)]


In [6]:
# Collect the genes that have exactly one ORF exon and save them, one symbol per line.
genes_with_single_orf_exon = [
    gene_symbol
    for gene_symbol, n_orf_exons in n_orf_exons_per_gene.items()
    if n_orf_exons == 1
]

# The context manager flushes and closes the file even if the write raises.
with open(os.path.join(DIR, 'genes_with_single_orf_exon.txt'), 'w') as f:
    # Symbols are newline-separated; no newline is written after the last one.
    f.write('\n'.join(genes_with_single_orf_exon))

In [7]:
# Flatten the per-gene exon lists into one list of all ORF exon sequences.
all_orf_exons = [
    exon_seq
    for gene_exons in orf_exons_per_gene.values()
    for exon_seq in gene_exons
]

# Combined length (in characters) of every ORF exon sequence.
total_length = sum(len(exon_seq) for exon_seq in all_orf_exons)

print('Average length: %.2f' % (total_length / len(all_orf_exons)))

Average length: 165.57


In [8]:
# Show the five shortest ORF exon sequences (ties keep their original order).
print('Shortest ORF exons:')

exons_by_length = sorted(all_orf_exons, key = len)
print(exons_by_length[:5])

Shortest ORF exons:
['G', 'A', 'A', 'AT', 'CC']
