In [92]:
import json, gumpy, numpy, copy, pandas

import gpas_covid_synthetic_reads as gcsr

from collections import defaultdict
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load the table of lineages and the constellation definitions for each of them (including Omicron).

In [64]:
# Table of the lineages to process; the preview shows a pango_lineage and a
# who_label column.
lineages_reference = gcsr.load_lineages_dataframe()
lineages_reference.head(3)

Unnamed: 0,pango_lineage,who_label
0,cB.1.1.7,Alpha
1,cB.1.351,Beta
2,cP.1,Gamma


In [65]:
# Build the lookup of constellation definitions from the files under
# ../constellations/ (presumably a local checkout of the constellations
# repository — confirm). Entries are looked up by constellation name.
pango_definitions=gcsr.load_pango_definitions('../constellations/', lineages_reference)
# Show the Alpha entry as an example of a definition's structure.
pango_definitions['cB.1.1.7']

{'label': 'Alpha (B.1.1.7-like)',
 'description': 'B.1.1.7 lineage defining mutations',
 'sources': ['https://virological.org/t/563'],
 'type': 'variant',
 'variant': {'Pango_lineages': ['B.1.1.7'],
  'mrca_lineage': 'B.1.1.7',
  'PHE_label': 'VOC-20DEC-01',
  'WHO_label': 'Alpha',
  'representative_genome': ''},
 'tags': ['B.1.1.7', 'VOC 202012/01'],
 'sites': ['nuc:C913T',
  '1ab:T1001I',
  '1ab:A1708D',
  'nuc:C5986T',
  '1ab:I2230T',
  '1ab:SGF3675-',
  'nuc:C14676T',
  'nuc:C15279T',
  'nuc:T16176C',
  's:HV69-',
  's:Y144-',
  's:N501Y',
  's:A570D',
  's:P681H',
  's:T716I',
  's:S982A',
  's:D1118H',
  'nuc:C26801T',
  '8:Q27*',
  '8:R52I',
  '8:Y73C',
  'N:D3L',
  'N:S235F'],
 'rules': {'min_alt': 15, 'max_ref': 3}}

Load the constellations definition of the SARS-CoV-2 genome (gene and protein coordinates).

In [95]:
# JSON is UTF-8 by specification, so state the encoding explicitly instead of
# relying on the platform's locale default.
with open('../constellations/constellations/data/SARS-CoV-2.json', encoding='utf-8') as handle:
    constellation_genome = json.load(handle)

# Display the gene records and the protein records side by side.
constellation_genome['genes'].values(), constellation_genome['proteins'].values()

(dict_values([{'name': 'ORF1a', 'coordinates': {'from': 266, 'to': 13468}}, {'name': 'ORF1b', 'coordinates': {'from': 13468, 'to': 21555}}, {'name': 'spike', 'coordinates': {'from': 21563, 'to': 25384}}, {'name': 'ORF3a', 'coordinates': {'from': 25393, 'to': 26220}}, {'name': 'E', 'coordinates': {'from': 26245, 'to': 26472}}, {'name': 'M', 'coordinates': {'from': 26523, 'to': 27191}}, {'name': 'ORF6', 'coordinates': {'from': 27202, 'to': 27387}}, {'name': 'ORF7a', 'coordinates': {'from': 27394, 'to': 27759}}, {'name': 'ORF8', 'coordinates': {'from': 27894, 'to': 28259}}, {'name': 'N', 'coordinates': {'from': 28274, 'to': 29533}}, {'name': 'ORF10', 'coordinates': {'from': 29558, 'to': 29674}}]),
 dict_values([{'name': 'NSP1', 'gene': '1ab', 'description': 'leader protein', 'coordinates': {'from': 1, 'to': 180}}, {'name': 'NSP2', 'gene': '1ab', 'coordinates': {'from': 181, 'to': 818}}, {'name': 'NSP3', 'gene': '1ab', 'coordinates': {'from': 819, 'to': 2763}}, {'name': 'NSP4', 'gene': '1a

In [68]:
# Invert the genes table: map each record's descriptive 'name' (e.g. 'spike')
# back to the key it is stored under (e.g. 'S').
name_to_gene_lookup = {
    record['name']: gene_key
    for gene_key, record in constellation_genome['genes'].items()
}
name_to_gene_lookup

{'ORF1a': 'ORF1a',
 'ORF1b': 'ORF1b',
 'spike': 'S',
 'ORF3a': 'ORF3a',
 'E': 'E',
 'M': 'M',
 'ORF6': 'ORF6',
 'ORF7a': 'ORF7a',
 'ORF8': 'ORF8',
 'N': 'N',
 'ORF10': 'ORF10'}

Load the SARS-CoV-2 reference genome (NC_045512.2, Wuhan-Hu-1).

In [69]:
# Build a gumpy Genome from the NC_045512.2 GenBank (.gbk) file shipped in the
# package's data folder. The path is relative to the current working directory.
reference=gumpy.Genome('gpas_covid_synthetic_reads/data/NC_045512.2.gbk')
# Display the genome summary.
reference

NC_045512
NC_045512.2
Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
29903 bases
attaaa...aaaaaa
metadata for all genes/loci have been included

In [73]:
# Build the amino acid -> codons lookup from the reference genome; the lookup
# in the following cell shows a one-letter amino acid mapping to a list of codons.
amino_acid_to_codon=gcsr.create_amino_acid_to_codon(reference)

In [75]:
# Codons recorded for tyrosine (Y).
amino_acid_to_codon['Y']

['tat', 'tac']

In [76]:
# NOTE(review): determine_closet_codon is neither defined nor imported in the
# visible part of this notebook, so unless it is defined elsewhere this cell
# raises NameError on a fresh kernel. Presumably it should be
# gcsr.determine_closet_codon — confirm and qualify the call.
# Judging by the recorded result ('tat' for 'taa'), it appears to pick the
# candidate codon closest to the given one — TODO confirm the distance measure.
determine_closet_codon('taa', amino_acid_to_codon['Y'])

'tat'

In [25]:
# List the keys of the reference genome's genes mapping.
reference.genes.keys()

dict_keys(['ORF1ab', 'ORF1ab_2', 'S', 'ORF3a', 'E', 'M', 'ORF6', 'ORF7a', 'ORF7b', 'ORF8', 'N', 'ORF10'])

In [17]:
# Write the reference genome to reference.fasta; the relative path resolves
# against the current working directory.
reference.save_fasta('reference.fasta')

In [99]:
# One genome per lineage, each starting as an independent deep copy of the
# reference so the reference itself is never shared between lineages.
sample = {}

for lineage in tqdm(lineages_reference['pango_lineage']):
    lineage_genome = copy.deepcopy(reference)
    sample[lineage] = lineage_genome
    # NOTE(review): pg is rebound on every pass, so only the last PangoGenome
    # survives the loop. Presumably the constructor applies the lineage's
    # mutations to the genome it is handed — confirm.
    pg = gcsr.PangoGenome(lineage_genome, pango_definitions[lineage], lineage)

  0%|                                                                                                                                  | 0/17 [00:00<?, ?it/s]

cB.1.1.7


 12%|██████████████▎                                                                                                           | 2/17 [00:02<00:21,  1.43s/it]

cP.1


 18%|█████████████████████▌                                                                                                    | 3/17 [00:04<00:19,  1.36s/it]

cB.1.617.2


 35%|███████████████████████████████████████████                                                                               | 6/17 [00:07<00:12,  1.17s/it]

cP.2


 76%|████████████████████████████████████████████████████████████████████████████████████████████▌                            | 13/17 [00:17<00:05,  1.46s/it]

cBA.1


 82%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 14/17 [00:19<00:04,  1.58s/it]

cBA.2


 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 15/17 [00:22<00:03,  2.00s/it]

cBA.3


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 16/17 [00:24<00:01,  1.89s/it]

cB.1.1.529


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:26<00:00,  1.54s/it]


In [96]:
# Inspect the gene keys stored on the BA.2 sample genome.
sample['cBA.2'].genes.keys()

dict_keys(['ORF1ab', 'ORF1ab_2', 'S', 'ORF3a', 'E', 'M', 'ORF6', 'ORF7a', 'ORF7b', 'ORF8', 'N', 'ORF10'])

In [102]:
# Show the constellation definition stored for BA.3.
pango_definitions['cBA.3']

{'label': 'Omicron (BA.3-like)',
 'description': 'BA.3 lineage defining mutations',
 'sources': [],
 'type': 'variant',
 'variant': {'Pango_lineages': ['BA.3'],
  'WHO_label': 'Omicron',
  'mrca_lineage': 'BA.3',
  'lineage_name': 'BA.3',
  'parent_lineage': 'B.1.1.529',
  'representative_genome': ''},
 'tags': ['BA.3'],
 'sites': ['orf1ab:S135R',
  'nuc:C832T',
  'orf1ab:G1307S',
  'orf1ab:T3090I',
  'nuc:G10447A',
  'nuc:C11235T',
  'nuc:C12880T',
  'nuc:C15714T',
  'orf1ab:I5967V',
  'spike:A67V',
  'del:21765:6',
  'del:21987:9',
  'del:22194:3',
  'spike:S371F',
  'spike:D405N',
  'spike:G446S',
  'orf3a:T223V',
  'nuc:C26858T',
  'n:S413R'],
 'note': 'Unique mutations for sublineage',
 'rules': {'min_alt': 11,
  'max_ref': 3,
  'nuc:C832T': 'not ref',
  'nuc:C11235T': 'not ref',
  'orf1ab:I5967V': 'not ref'}}