-
Notifications
You must be signed in to change notification settings - Fork 0
/
genome.py
496 lines (392 loc) · 22.2 KB
/
genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
"""
:Author: Ashwin Srinivasan <ashwins@mit.edu>
:Author: Bilal Shaikh <bilal.shaikh@columbia.edu>
:Author: Balazs Szigeti <balazs.szigeti@mssm.edu>
:Date: 2018-06-06
:Copyright: 2018, Karr Lab
:License: MIT
"""
import Bio.SeqIO
import Bio.SeqRecord
import math
import numpy
import random
import scipy.constants
import wc_kb
import wc_kb_gen
from Bio.Data import CodonTable
from Bio.Seq import Seq, Alphabet
from numpy import random
from scipy import stats
from wc_utils.util.units import unit_registry
from wc_onto import onto as wcOntology
from wc_utils.util.ontology import are_terms_equivalent
class GenomeGenerator(wc_kb_gen.KbComponentGenerator):
"""
Generate synthetic chromosome with randomized genes/intergenic
regions. Creates RNA and protein objects corresponding to the genes
this chromosome. Associates the chromosome, RNAs, proteins
with a knowledge base object (and its Cell attribute).
Options:
* num_chromosomes (:obj:`int`): number of chromosomes
* mean_gc_frac (:obj:`float`): fraction of nucleotides which are G or C
* num_genes (:obj:`float`): mean number of genes
* mean_gene_len (:obj:`float`): mean codon length of a gene
* mean_coding_frac (:obj:`float`): mean coding fraction of the genome
* translation_table (:obj:`int`): The NCBI standard genetic code used
* num_ncRNA (:obj:`float`): The proportion of non coding RNAs
* num_rRNA (:obj:`float`): The proportion of ribosomal RNAs
* tRNA_prop (:obj:`float`): The proportion of transfer RNAs
* five_prime_len (:obj:`int`): Average 5' UTR length for transcription units
* three_prime_len (:obj:`int`): Average 3' UTR length for transcription units
* operon_prop (:obj:`float`): Proportion of genes that should be in an operon (polycistronic mRNA)
* operon_gen_num (:obj:`int`): Average number of genes in an operon
* mean_copy_number (:obj:`float`): mean copy number of each RNA
* mean_half_life (:obj:`float`): mean half-life of RNAs
* genetic_code (:obj:`str`): 'normal' / 'reduced', if reduced only 'I': ['ATC'], 'L': ['CTG'],
'M': ['ATG'], 'T': ['ACG'] codons in genome
* seq_path (:obj:`str`): path to save genome sequence
"""
def clean_and_validate_options(self):
""" Apply default options and validate options """
# Default options are loosely based on Escherichia coli K-12
# Nucleic Acids Research 41:D605-12 2013
options = self.options
genetic_code = options.get('genetic_code', 'normal')
assert(genetic_code in ['normal', 'reduced'])
options['genetic_code'] = genetic_code
num_chromosomes = options.get('num_chromosomes', 1)
assert(num_chromosomes >= 1 and int(num_chromosomes) == num_chromosomes)
options['num_chromosomes'] = num_chromosomes
chromosome_topology = options.get('chromosome_topology', 'circular')
assert(chromosome_topology in ['circular', 'linear'])
options['chromosome_topology'] = chromosome_topology
num_genes = options.get('num_genes', 4500)
assert(num_genes >= 1)
options['num_genes'] = int(num_genes)
num_ncRNA = options.get('num_ncRNA', 10) # not sure
assert(isinstance(num_ncRNA, int))
options['num_ncRNA'] = num_ncRNA
# http://book.bionumbers.org/how-many-ribosomal-rna-gene-copies-are-in-the-genome/
num_rRNA = options.get('num_rRNA', 7)
assert(isinstance(num_rRNA, int))
options['num_rRNA'] = num_rRNA
num_tRNA = options.get('num_tRNA', 20)
assert(isinstance(num_tRNA, int))
options['num_tRNA'] = num_tRNA
min_prots = options.get('min_prots', 8)
assert(isinstance(min_prots, int))
options['min_prots'] = min_prots
assert((num_ncRNA + num_rRNA + num_tRNA + min_prots) <= num_genes)
mean_gc_frac = options.get('mean_gc_frac', 0.58)
assert(mean_gc_frac >= 0 and mean_gc_frac <= 1)
options['mean_gc_frac'] = mean_gc_frac
# DOI: 10.1093/molbev/msk019
mean_gene_len = options.get('mean_gene_len', 308) # codon length (924 bp)
assert(mean_gene_len >= 1)
options['mean_gene_len'] = mean_gene_len
# DOI: 10.1007/s10142-015-0433-4
mean_coding_frac = options.get('mean_coding_frac', 0.88)
assert(mean_coding_frac > 0 and mean_coding_frac < 1)
options['mean_coding_frac'] = mean_coding_frac
translation_table = int(options.get('translation_table', 1))
assert(translation_table in range(1, 32))
options['translation_table'] = translation_table
five_prime_len = int(options.get('five_prime_len', 7))
assert(five_prime_len >= 0)
options['five_prime_len'] = five_prime_len
three_prime_len = int(options.get('three_prime_len', 5)) # guess
assert(three_prime_len >= 0)
options['three_prime_len'] = three_prime_len
operon_prop = (options.get('operon_prop', 0.2)) # guess
assert(operon_prop >= 0 and operon_prop <= 1)
options['operon_prop'] = operon_prop
operon_gen_num = int(options.get('operon_gen_num', 3))
assert(operon_gen_num >= 2)
options['operon_gen_num'] = operon_gen_num
mean_rna_half_life = options.get('mean_rna_half_life', 8 * 60)
assert(mean_rna_half_life > 0)
options['mean_rna_half_life'] = mean_rna_half_life
# DOI: 10.1073/pnas.0308747101
mean_protein_half_life = options.get('mean_protein_half_life', 750 * 60)
assert(mean_protein_half_life > 0)
options['mean_protein_half_life'] = mean_protein_half_life
# DOI: 10.1038/ismej.2012.94
mean_rna_copy_number = options.get('mean_rna_copy_number', 0.4)
assert(mean_rna_copy_number > 0)
options['mean_rna_copy_number'] = mean_rna_copy_number
# DOI: 10.1038/ismej.2012.94
mean_protein_copy_number = options.get('mean_protein_copy_number', 75)
assert(mean_protein_copy_number > 0)
options['mean_protein_copy_number'] = mean_protein_copy_number
seq_path = options.get('seq_path', 'rand_seq.fna')
options['seq_path'] = seq_path
def gen_components(self):
self.gen_genome()
self.gen_tus()
self.gen_rnas_proteins()
self.gen_concentrations()
self.reduce_model()
def gen_genome(self):
'''Construct knowledge base components and generate the DNA sequence'''
# get options
options = self.options
genetic_code = options.get('genetic_code')
num_chromosomes = options.get('num_chromosomes')
mean_gene_len = options.get('mean_gene_len')
translation_table = options.get('translation_table')
num_genes = options.get('num_genes')
mean_coding_frac = options.get('mean_coding_frac')
mean_gc_frac = options.get('mean_gc_frac')
chromosome_topology = options.get('chromosome_topology')
num_ncRNA = options.get('num_ncRNA')
num_rRNA = options.get('num_rRNA')
num_tRNA = options.get('num_tRNA')
min_prots = options.get('min_prots')
seq_path = options.get('seq_path')
cell = self.knowledge_base.cell
self.knowledge_base.translation_table = translation_table
codon_table = CodonTable.unambiguous_dna_by_id[translation_table]
BASES = ['A', 'C', 'G', 'T']
PROB_BASES = [(1 - mean_gc_frac) / 2, mean_gc_frac /2, mean_gc_frac/2, (1-mean_gc_frac)/2]
if genetic_code == 'normal':
START_CODONS = codon_table.start_codons
STOP_CODONS = codon_table.stop_codons
elif genetic_code == 'reduced':
START_CODONS = ['TTA']
STOP_CODONS = ['TAA']
num_genes_all = num_genes
assignList = num_tRNA*[wcOntology['WC:tRNA']] + \
num_rRNA*[wcOntology['WC:rRNA']] + \
num_ncRNA*[wcOntology['WC:ncRNA']] + \
(num_genes_all-(num_ncRNA + num_tRNA + num_rRNA))*[wcOntology['WC:mRNA']]
random.shuffle(assignList)
# Create a chromosome n times
dna_seqs = []
for i_chr in range(num_chromosomes):
num_genes = math.ceil(num_genes_all / num_chromosomes)
gene_lens = 3 * self.rand(mean_gene_len, count=num_genes, min=2)
intergene_lens = 3 * self.rand(mean_gene_len / mean_coding_frac * (1 - mean_coding_frac), count=num_genes)
seq_len = numpy.sum(gene_lens) + numpy.sum(intergene_lens)
# Generate seq based on random codons (NOT start/stop codons)
seq_str = []
if genetic_code=='normal':
for i in range(0, seq_len, 3):
codon_i = random.choice(STOP_CODONS)
codon_i = "".join(random.choice(BASES, p=PROB_BASES, size=(3,)))
seq_str.append(codon_i)
elif genetic_code=='reduced':
for i in range(0, seq_len, 3):
codon_i = STOP_CODONS[0]
codon_i = "".join(random.choice(['ATC', 'CTG', 'ATG', 'ACG']))
seq_str.append(codon_i)
seq_str = "".join(seq_str)
seq = Seq(seq_str, Alphabet.DNAAlphabet())
chro = cell.species_types.get_or_create(id='chr_{}'.format(i_chr + 1), __type=wc_kb.core.DnaSpeciesType)
chro.name = 'Chromosome {}'.format(i_chr + 1)
chro.circular = chromosome_topology == 'circular'
chro.double_stranded = True
chro.sequence_path = seq_path
gene_starts = numpy.int64(numpy.cumsum(numpy.concatenate(([0], gene_lens[0:-1])) +
numpy.concatenate((numpy.round(intergene_lens[0:1] / 2), intergene_lens[1:]))))
# creates GeneLocus objects for the genes and labels their GeneType (which type of RNA they transcribe)
for i_gene, gene_start in enumerate(gene_starts):
gene = self.knowledge_base.cell.loci.get_or_create(
id='gene_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.GeneLocus)
gene.start = gene_start + 1 # 1-indexed
gene.polymer = chro
gene.end = gene.start + gene_lens[i_gene] - 1 # 1-indexed
gene.name = 'gene {} {}'.format(i_chr+1, i_gene+1)
if len(assignList) > 0:
gene.type = assignList.pop()
else:
gene.type = wcOntology['WC:mRNA']
if gene.type == wcOntology['WC:mRNA']: # if mRNA, then set up start/stop codons in the gene
start_codon = random.choice(START_CODONS)
stop_codon = random.choice(STOP_CODONS)
seq_str = str(seq)
seq_str = seq_str[:gene.start-1] + \
start_codon + \
seq_str[gene.start+2: gene.end-3] + \
stop_codon + seq_str[gene.end:]
for i in range(gene.start+2, gene.end-3, 3):
while seq_str[i:i+3] in START_CODONS or seq_str[i:i+3] in STOP_CODONS:
if genetic_code == 'normal':
codon_i = "".join(random.choice(BASES, p=PROB_BASES, size=(3,)))
elif genetic_code == 'reduced':
codon_i = "".join(random.choice(['ATC', 'CTG', 'ATG', 'ACG']))
seq_str = seq_str[:i]+codon_i+seq_str[i+3:]
seq = Seq(seq_str, Alphabet.DNAAlphabet())
dna_seqs.append(Bio.SeqRecord.SeqRecord(seq, chro.id))
with open(seq_path, 'w') as file:
writer = Bio.SeqIO.FastaIO.FastaWriter(
file, wrap=70, record2title=lambda record: record.id)
writer.write_file(dna_seqs)
def gen_tus(self):
""" Creates transcription units with 5'/3' UTRs, polycistronic mRNAs, and other types of RNA (tRNA, rRNA, sRNA) """
options = self.options
five_prime_len = options.get('five_prime_len') # 7 bp default (E. coli, wikipedia)
three_prime_len = options.get('three_prime_len') # 5 bp default guess
operon_prop = options.get('operon_prop') # 0.2 default guess
operon_gen_num = options.get('operon_gen_num') # 3 genes default (https://academic.oup.com/gbe/article/5/11/2242/653613)
for i_chr, chromosome in enumerate(self.knowledge_base.cell.species_types.get(__type=wc_kb.core.DnaSpeciesType)):
seq = chromosome.get_seq()
i_gene = 0
transcription_loci = []
# Todo make this into a proper for loop that deals with repeats/additional loci
while i_gene < len(chromosome.loci):
gene = chromosome.loci[i_gene]
if gene.type == wcOntology['WC:mRNA']:
# polycistronic mRNA (multiple GeneLocus objects per TranscriptionUnitLocus)
five_prime = self.rand(five_prime_len)[0]
three_prime = self.rand(three_prime_len)[0]
operon_prob = random.random()
# make an operon (polycistronic mRNA, put multiple genes in one TransUnitLocus)
if operon_prob <= operon_prop:
operon_genes = self.rand(operon_gen_num, min=2)[0]
# add 3', 5' UTRs to the ends of the transcription unit (upstream of first gene, downstream of last gene)
tu = self.knowledge_base.cell.loci.get_or_create(
id='tu_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.TranscriptionUnitLocus)
tu.name = 'tu {} {}'.format(i_chr+1, i_gene+1)
five_prime_start = gene.start - five_prime
if five_prime_start < 0:
five_prime_start = 0
tu.genes.append(gene)
tu.start = five_prime_start
for k in range(operon_genes-1):
i_gene += 1
if i_gene >= len(chromosome.loci):
break
if (chromosome.loci[i_gene]).type == wcOntology['WC:mRNA']:
gene = chromosome.loci[i_gene]
tu.genes.append(gene)
else:
break
three_prime_end = gene.end + three_prime
if three_prime_end >= len(seq):
three_prime_end = len(seq) - 1
tu.end = three_prime_end
transcription_loci.append(tu)
else: # make an individual transcription unit for the gene
five_prime_start = gene.start - five_prime
three_prime_end = gene.end + three_prime
if five_prime_start < 0:
five_prime_start = 0
if three_prime_end >= len(seq):
three_prime_end = len(seq) - 1
tu = self.knowledge_base.cell.loci.get_or_create(
id='tu_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.TranscriptionUnitLocus)
tu.start = five_prime_start
tu.end = three_prime_end
tu.name = 'tu {} {}'.format(i_chr+1, i_gene+1)
tu.genes.append(gene)
transcription_loci.append(tu)
i_gene += 1
# make a transcription unit that transcribes other types of RNA (tRNA, rRNA, sRNA)
else:
tu = self.knowledge_base.cell.loci.get_or_create(
id='tu_{}_{}'.format(i_chr + 1, i_gene + 1), __type=wc_kb.prokaryote.TranscriptionUnitLocus)
tu.name = 'tu {} {}'.format(i_chr+1, i_gene+1)
tu.start = gene.start
tu.end = gene.end
tu.genes.append(gene)
transcription_loci.append(tu)
i_gene += 1
for locus in transcription_loci:
locus.polymer = chromosome
def gen_rnas_proteins(self):
""" Creates RNA and protein objects corresponding to genes on chromosome. """
cell = self.knowledge_base.cell
options = self.options
mean_rna_half_life = options.get('mean_rna_half_life')
mean_protein_half_life = options.get('mean_protein_half_life')
for chromosome in self.knowledge_base.cell.species_types.get(__type=wc_kb.core.DnaSpeciesType):
for i in range(len(chromosome.loci)):
locus = chromosome.loci[i]
if type(locus) == wc_kb.prokaryote.TranscriptionUnitLocus:
tu = locus
# creates RnaSpeciesType for RNA sequence corresponding to gene
rna = self.knowledge_base.cell.species_types.get_or_create(id='rna_{}'.format(tu.id), __type=wc_kb.prokaryote.RnaSpeciesType)
rna.name = 'rna {}'.format(tu.id)
# GeneLocus object for gene sequence, attribute of ProteinSpeciesType object
if are_terms_equivalent(tu.genes[0].type, wcOntology['WC:mRNA']):
rna.type = wcOntology['WC:mRNA']
elif are_terms_equivalent(tu.genes[0].type, wcOntology['WC:rRNA']):
rna.type = wcOntology['WC:rRNA']
elif are_terms_equivalent(tu.genes[0].type, wcOntology['WC:tRNA']):
rna.type = wcOntology['WC:tRNA']
elif are_terms_equivalent(tu.genes[0].type, wcOntology['WC:ncRNA']):
rna.type = wcOntology['WC:ncRNA']
rna.half_life = random.normal(mean_rna_half_life, numpy.sqrt(mean_rna_half_life))
rna.transcription_units.append(tu)
if are_terms_equivalent(rna.type, wcOntology['WC:mRNA']):
for gene in tu.genes:
# creates ProteinSpecipe object for corresponding protein sequence(s)
prot = self.knowledge_base.cell.species_types.get_or_create(
id='prot_{}'.format(gene.id),
__type=wc_kb.prokaryote.ProteinSpeciesType)
prot.name = 'prot_{}'.format(gene.id)
prot.cell = cell
prot.cell.knowledge_base = self.knowledge_base
prot.gene = gene
prot.rna = rna
prot.half_life = random.normal(mean_protein_half_life, numpy.sqrt(mean_protein_half_life))
def gen_concentrations(self):
""" Creates the concentration objects of RNA and protein objects """
options = self.options
cell = self.knowledge_base.cell
cytosol = cell.compartments.get_one(id='c')
mean_rna_copy_number = options.get('mean_rna_copy_number')
mean_protein_copy_number = options.get('mean_protein_copy_number')
if self.knowledge_base.cell.parameters.get_one(id='mean_volume') is not None:
mean_volume = self.knowledge_base.cell.parameters.get_one(id='mean_volume').value
else:
mean_volume = 0.000000000000000067
print('"mean_volume" parameter is missing, using Mycoplasma pneumoniae value (6.7E-17L).')
for rna in cell.species_types.get(__type=wc_kb.prokaryote.RnaSpeciesType):
rna_specie = rna.species.get_or_create(compartment=cytosol)
conc = round(abs(random.normal(loc=mean_rna_copy_number,scale=15))) / scipy.constants.Avogadro / mean_volume
cell.concentrations.get_or_create(id='CONC({})'.format(rna_specie.id), species=rna_specie, value=conc, units=unit_registry.parse_units('M'))
for prot in cell.species_types.get(__type=wc_kb.prokaryote.ProteinSpeciesType):
prot_specie = prot.species.get_or_create(compartment=cytosol)
conc = round(abs(random.normal(loc=mean_protein_copy_number,scale=15))) / scipy.constants.Avogadro / mean_volume
cell.concentrations.get_or_create(id='CONC({})'.format(prot_specie.id), species=prot_specie, value=conc, units=unit_registry.parse_units('M'))
def reduce_model(self):
options = self.options
genetic_code = options.get('genetic_code')
seq_path = options.get('seq_path')
if genetic_code == 'normal':
pass
elif genetic_code == 'reduced':
cell = self.knowledge_base.cell
kb = self.knowledge_base
bases = "TCAG"
codons = [a + b + c for a in bases for b in bases for c in bases]
dna = kb.cell.species_types.get(__type = wc_kb.core.DnaSpeciesType)[0]
seq_str = str(dna.get_seq())
seq_list = list(seq_str)
for prot in kb.cell.species_types.get(__type = wc_kb.prokaryote.ProteinSpeciesType):
for base_num in range(prot.gene.start+2,prot.gene.end-3,3):
new_codon = random.choice(['ATC', 'CTG', 'ATG', 'ACG'])
seq_list[base_num]=new_codon[0]
seq_list[base_num+1]=new_codon[1]
seq_list[base_num+2]=new_codon[2]
seq_str_new = ''.join(seq_list)
seq=Seq(seq_str_new)
dna_seqs = [Bio.SeqRecord.SeqRecord(seq, dna.id)]
with open(seq_path, 'w') as file:
writer = Bio.SeqIO.FastaIO.FastaWriter(
file, wrap=70, record2title=lambda record: record.id)
writer.write_file(dna_seqs)
def rand(self, mean, count=1, min=0, max=numpy.inf):
""" Generated 1 or more random normally distributed integer(s) with standard deviation equal
to the square root of the mean value.
Args:
mean (:obj:`float`): mean value
count (:obj:`int`): number of random numbers to generate
Returns:
:obj:`int` or :obj:`numpy.ndarray` of :obj:`int`: random normally distributed integer(s)
"""
a = (min-mean)/numpy.sqrt(mean)
b = (max - mean)/numpy.sqrt(mean)
return numpy.int64(numpy.round(stats.truncnorm.rvs(a, b, loc=mean, scale=numpy.sqrt(mean), size=count)))