From 49eb56aa46ff8529b97831dcbd723e528eec125f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 4 Jan 2019 13:51:01 +0100 Subject: [PATCH 001/170] Autocomplete working with pgsql --- backend/modules/browser/browser_handlers.py | 3 ++- backend/modules/browser/pgsql.py | 25 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 backend/modules/browser/pgsql.py diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 1d57f3988..7dff60210 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -2,6 +2,7 @@ from . import lookups from . import mongodb +from . import pgsql from .utils import get_xpos, add_consequence_to_variant, remove_extraneous_vep_annotations, \ order_vep_by_csq, get_proper_hgvs @@ -264,7 +265,7 @@ class Autocomplete(handlers.UnsafeHandler): def get(self, dataset, query): ret = {} - results = mongodb.get_autocomplete(dataset, query) + results = pgsql.get_autocomplete(dataset, query) ret = {'values': sorted(list(set(results)))[:20]} self.finish( ret ) diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py new file mode 100644 index 000000000..90b757784 --- /dev/null +++ b/backend/modules/browser/pgsql.py @@ -0,0 +1,25 @@ +""" +Replaces mongodb.py +""" + +import logging + +from . import db +from . import lookups +from .utils import get_xpos + + +def get_autocomplete(dataset, query): + """ + Provide autocomplete suggestions based on the query + NOTE: dataset is not used for sql + Args: + dataset (str): name of the dataset + query (str): the query to compare to the available gene names + Returns: + list: A list of genes names whose beginning matches the query + """ + genes = db.Gene.select(db.Gene.name).where(db.Gene.name.startswith(query)) + gene_names = [str(gene.name) for gene in genes] + logging.error('Autocomplete: {}'.format(gene_names)) + return gene_names From 010b761b6535d09fce55fe5a9abcbd61b8eb503b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 4 Jan 2019 14:39:51 +0100 Subject: [PATCH 002/170] skeleton for get_variant_list --- backend/modules/browser/pgsql.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index 90b757784..a8d761377 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -23,3 +23,7 @@ def get_autocomplete(dataset, query): gene_names = [str(gene.name) for gene in genes] logging.error('Autocomplete: {}'.format(gene_names)) return gene_names + + +def get_variant_list: + pass From e8c5e7e3f61924b4e93d0911700a1f9bbfaa7767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 4 Jan 2019 15:18:55 +0100 Subject: [PATCH 003/170] first function passing test --- backend/modules/browser/lookups.py | 20 +++++++++++++++---- backend/modules/browser/test_lookups.py | 26 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 backend/modules/browser/test_lookups.py diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 829dff050..430f461e6 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -1,15 +1,27 @@ import re -from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant +import db + +#from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant SEARCH_LIMIT = 10000 -def get_gene(sdb, gene_id): - return sdb.genes.find_one({'gene_id': gene_id}, projection={'_id': False}) +def get_gene(gene_id): + """ + Retrieve gene by gene_id + Args: + gene_id: the id of the gene + + """ + try: + return db.Gene.select().where(db.Gene.gene_id==gene_id).dicts().get() + except db.Gene.DoesNotExist: + return {} -def get_gene_by_name(sdb, gene_name): +def get_gene_by_name(gene_name): # try gene_name field first + gene = db.Gene.select().where(db.Gene.gene_id==gene_id).dicts().get() gene = sdb.genes.find_one({'gene_name': gene_name}, projection={'_id': False}) if gene: return gene diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py new file mode 100644 index 000000000..fe1fa0a59 --- /dev/null +++ b/backend/modules/browser/test_lookups.py @@ -0,0 +1,26 @@ +import lookups + +def test_get_gene(): + expected = {'id': 1, + 'reference_set': 1, + 'gene_id': 'ENSG00000223972', + 'gene_name': 'DDX11L1', + 'full_name': 'DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1', + 'canonical_transcript': 'ENST00000456328', + 'chrom': '1', + 'start_pos': 11870, + 'strand': '+'} + result = lookups.get_gene('ENSG00000223972') + print(result) + assert result['id'] == expected['id'] + assert result['reference_set'] == expected['reference_set'] + assert result['gene_id'] == expected['gene_id'] + assert result['name'] == expected['gene_name'] + assert result['full_name'] == expected['full_name'] + assert result['canonical_transcript'] == expected['canonical_transcript'] + assert result['chrom'] == expected['chrom'] + assert result['start'] == expected['start_pos'] + assert result['strand'] == expected['strand'] + + result = lookups.get_gene('NOT_A_GENE') + assert not result From b85793e1a427ff48c96b73712948318cb087f8e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 4 Jan 2019 16:53:01 +0100 Subject: [PATCH 004/170] transcript retrieval working --- backend/modules/browser/lookups.py | 60 ++++++++++++----- backend/modules/browser/test_lookups.py | 88 +++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 16 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 430f461e6..f2754409b 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -8,10 +8,11 @@ def get_gene(gene_id): """ - Retrieve gene by gene_id + Retrieve gene by gene id Args: gene_id: the id of the gene - + Returns: + dict: values for the gene; empty if not found """ try: return db.Gene.select().where(db.Gene.gene_id==gene_id).dicts().get() @@ -20,16 +21,41 @@ def get_gene(gene_id): def get_gene_by_name(gene_name): + """ + Retrieve gene by gene_name. + First checks gene_name, then other_names. + Args: + gene_name: the id of the gene + Returns: + dict: values for the gene; empty if not found + """ # try gene_name field first - gene = db.Gene.select().where(db.Gene.gene_id==gene_id).dicts().get() - gene = sdb.genes.find_one({'gene_name': gene_name}, projection={'_id': False}) - if gene: - return gene - # if not, try gene['other_names'] - return sdb.genes.find_one({'other_names': gene_name}, projection={'_id': False}) + try: + return db.Gene.select().where(db.Gene.name==gene_name).dicts().get() + except db.Gene.DoesNotExist: + try: + # troubles with KeyError + return db.Gene.select().where(db.Gene.other_names.contains(gene_name)).dicts().get() + except db.Gene.DoesNotExist: + return {} -def get_transcript(sdb, transcript_id): +def get_transcript(transcript_id): + """ + Retrieve transcript by transcript id + Also includes exons as ['exons'] + Args: + transcript_id: the id of the transcript + Returns: + dict: values for the transcript, including exons; empty if not found + """ + try: + transcript = db.Transcript.select().where(db.Transcript.transcript_id==transcript_id).dicts().get() + transcript['exons'] = get_exons_in_transcript(transcript['id']) + return transcript + except db.Transcript.DoesNotExist: + return {} + transcript = sdb.transcripts.find_one({'transcript_id': transcript_id}, projection={'_id': False}) if not transcript: return None @@ -317,10 +343,12 @@ def get_variants_in_transcript(db, sdb, transcript_id): return variants -def get_exons_in_transcript(sdb, transcript_id): - # return sorted( - # [x for x in - # db.exons.find({'transcript_id': transcript_id}, projection={'_id': False}) - # if x['feature_type'] != 'exon'], - # key=lambda k: k['start']) - return sorted(list(sdb.exons.find({'transcript_id': transcript_id, 'feature_type': { "$in": ['CDS', 'UTR', 'exon'] }}, projection={'_id': False})), key=lambda k: k['start']) +def get_exons_in_transcript(transcript_dbid): + """ + Retrieve exons associated with the given transcript id + Args: + transcript_dbid: the id of the transcript in the database (Transcript.id; not transcript_id) + Returns: + list: dicts with values for each exon sorted by start position + """ + return sorted(list(db.Feature.select().where(db.Feature.transcript==transcript_dbid).dicts()), key=lambda k: k['start']) diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index fe1fa0a59..b4484cb6b 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -24,3 +24,91 @@ def test_get_gene(): result = lookups.get_gene('NOT_A_GENE') assert not result + + +def test_get_gene_by_name(): + expected = {'id': 1, + 'reference_set': 1, + 'gene_id': 'ENSG00000223972', + 'gene_name': 'DDX11L1', + 'full_name': 'DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1', + 'canonical_transcript': 'ENST00000456328', + 'chrom': '1', + 'start_pos': 11870, + 'strand': '+'} + result = lookups.get_gene_by_name('DDX11L1') + assert result['id'] == expected['id'] + assert result['reference_set'] == expected['reference_set'] + assert result['gene_id'] == expected['gene_id'] + assert result['name'] == expected['gene_name'] + assert result['full_name'] == expected['full_name'] + assert result['canonical_transcript'] == expected['canonical_transcript'] + assert result['chrom'] == expected['chrom'] + assert result['start'] == expected['start_pos'] + assert result['strand'] == expected['strand'] + + # crashing with other_names.contains() +# result = lookups.get_gene_by_name('NOT_A_GENE') +# assert not result + # NOC2L +# result = lookups.get_gene_by_name('NOC2L') +# result = lookups.get_gene_by_name('NIR') +# print(result) +# assert False +# result = lookups.get_gene_by_name('Z') + + +def test_get_transcript(): + expected = {'id': 5, + 'transcript_id': 'ENST00000438504', + 'gene': '2', + 'mim_annotation': 'Was protein family homolog 1; wash1', + 'chrom': '1', + 'mim_gene_accession': 613632, + 'start_pos': 14364, + 'stop_pos': 29371, + 'strand': '-'} + exp_exon = [{'id': 28, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14364, 'stop': 14830, 'strand': '-', 'feature_type': 'exon'}, + {'id': 27, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14971, 'stop': 15039, 'strand': '-', 'feature_type': 'exon'}, + {'id': 26, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15797, 'stop': 15902, 'strand': '-', 'feature_type': 'exon'}, + {'id': 25, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15905, 'stop': 15948, 'strand': '-', 'feature_type': 'exon'}, + {'id': 24, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16608, 'stop': 16766, 'strand': '-', 'feature_type': 'exon'}, + {'id': 23, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16855, 'stop': 17056, 'strand': '-', 'feature_type': 'exon'}, + {'id': 22, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17234, 'stop': 17365, 'strand': '-', 'feature_type': 'exon'}, + {'id': 21, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17603, 'stop': 17743, 'strand': '-', 'feature_type': 'exon'}, + {'id': 20, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17916, 'stop': 18062, 'strand': '-', 'feature_type': 'exon'}, + {'id': 19, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 18269, 'stop': 18380, 'strand': '-', 'feature_type': 'exon'}, + {'id': 18, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 24739, 'stop': 24892, 'strand': '-', 'feature_type': 'exon'}, + {'id': 17, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 29322, 'stop': 29371, 'strand': '-', 'feature_type': 'exon'}] + + result = lookups.get_transcript('ENST00000438504') + assert result['id'] == expected['id'] + assert result['mim_annotation'] == expected['mim_annotation'] + assert result['transcript_id'] == expected['transcript_id'] + assert result['mim_gene_accession'] == expected['mim_gene_accession'] + assert result['chrom'] == expected['chrom'] + assert result['start'] == expected['start_pos'] + assert result['stop'] == expected['stop_pos'] + assert result['strand'] == expected['strand'] + assert result['exons'] == exp_exon + + assert not lookups.get_transcript('INCORRECT') + + +def test_get_exons_in_transcript(): + result = lookups.get_exons_in_transcript(5) + expected = [{'id': 28, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14364, 'stop': 14830, 'strand': '-', 'feature_type': 'exon'}, + {'id': 27, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14971, 'stop': 15039, 'strand': '-', 'feature_type': 'exon'}, + {'id': 26, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15797, 'stop': 15902, 'strand': '-', 'feature_type': 'exon'}, + {'id': 25, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15905, 'stop': 15948, 'strand': '-', 'feature_type': 'exon'}, + {'id': 24, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16608, 'stop': 16766, 'strand': '-', 'feature_type': 'exon'}, + {'id': 23, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16855, 'stop': 17056, 'strand': '-', 'feature_type': 'exon'}, + {'id': 22, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17234, 'stop': 17365, 'strand': '-', 'feature_type': 'exon'}, + {'id': 21, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17603, 'stop': 17743, 'strand': '-', 'feature_type': 'exon'}, + {'id': 20, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17916, 'stop': 18062, 'strand': '-', 'feature_type': 'exon'}, + {'id': 19, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 18269, 'stop': 18380, 'strand': '-', 'feature_type': 'exon'}, + {'id': 18, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 24739, 'stop': 24892, 'strand': '-', 'feature_type': 'exon'}, + {'id': 17, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 29322, 'stop': 29371, 'strand': '-', 'feature_type': 'exon'}] + print(result) + assert result == expected + From 370066ce48b9926bc8ea4dd20655d091057b3e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 7 Jan 2019 13:10:40 +0100 Subject: [PATCH 005/170] some get_variant functions fixed --- backend/modules/browser/lookups.py | 93 ++++++++++++------------- backend/modules/browser/test_lookups.py | 59 ++++++++++++++-- 2 files changed, 97 insertions(+), 55 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index f2754409b..c2ef98043 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -1,5 +1,4 @@ import re - import db #from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant @@ -10,7 +9,7 @@ def get_gene(gene_id): """ Retrieve gene by gene id Args: - gene_id: the id of the gene + gene_id (str): the id of the gene Returns: dict: values for the gene; empty if not found """ @@ -25,7 +24,7 @@ def get_gene_by_name(gene_name): Retrieve gene by gene_name. First checks gene_name, then other_names. Args: - gene_name: the id of the gene + gene_name (str): the id of the gene Returns: dict: values for the gene; empty if not found """ @@ -45,7 +44,7 @@ def get_transcript(transcript_id): Retrieve transcript by transcript id Also includes exons as ['exons'] Args: - transcript_id: the id of the transcript + transcript_id (str): the id of the transcript Returns: dict: values for the transcript, including exons; empty if not found """ @@ -63,54 +62,50 @@ def get_transcript(transcript_id): return transcript -def get_raw_variant(db, xpos, ref, alt, get_id=False): - return db.variants.find_one({'xpos': xpos, 'ref': ref, 'alt': alt}, projection={'_id': get_id}) - - -def get_variant(db, sdb, xpos, ref, alt): - variant = get_raw_variant(db, xpos, ref, alt, False) - if variant is None or 'rsid' not in variant: - return variant - if variant['rsid'] == '.' or variant['rsid'] is None: - rsid = sdb.dbsnp.find_one({'xpos': xpos}) - if rsid: - variant['rsid'] = 'rs%s' % rsid['rsid'] - return variant - - -def add_rsid_to_variant(sdb, variant): - if variant['rsid'] == '.' or variant['rsid'] is None: - rsid = sdb.dbsnp.find_one({'xpos': variant['xpos']}) - if rsid: - variant['rsid'] = 'rs%s' % rsid['rsid'] - - -def get_variants_by_rsid(db, rsid): - if not rsid.startswith('rs'): - return None +def get_raw_variant(pos, chrom, ref, alt): + """ + Retrieve variant by position and change + Args: + pos (int): position of the variant + chrom (str): name of the chromosome + ref (str): reference sequence + ref (str): variant sequence + Returns: + dict: values for the variant; empty if not found + """ try: - int(rsid.lstrip('rs')) - except ValueError: - return None - variants = list(db.variants.find({'rsid': rsid}, projection={'_id': False})) - add_consequence_to_variants(variants) - return variants + return db.Variant.select().where(db.Variant.pos == pos, + db.Variant.ref == ref, + db.Variant.alt == alt, + db.Variant.chrom == chrom).dicts().get() + except db.Variant.DoesNotExist: + return {} -def get_variants_from_dbsnp(db,sdb, rsid): - if not rsid.startswith('rs'): - return None - try: - rsid = int(rsid.lstrip('rs')) - except ValueError: - return None - position = sdb.dbsnp.find_one({'rsid': rsid}) - if position: - variants = list(db.variants.find({'xpos': {'$lte': position['xpos'], '$gte': position['xpos']}}, projection={'_id': False})) - if variants: - add_consequence_to_variants(variants) - return variants - return [] +def get_variant(pos, chrom, ref, alt): + """ + Retrieve variant by position and change + Retrieves rsid from db (if available) if not present in variant + Args: + pos (int): position of the variant + chrom (str): name of the chromosome + ref (str): reference sequence + ref (str): variant sequence + Returns: + dict: values for the variant; empty if not found + """ + try: + variant = get_raw_variant(pos, chrom, ref, alt) + if not variant or 'rsid' not in variant: + return variant + if variant['rsid'] == '.' or variant['rsid'] is None: + rsid = db.dbsnp.select().where(db.snp.pos==pos, + db.snp.chrom==chrom).dicts().get() + if rsid: + variant['rsid'] = 'rs{}'.format(rsid['rsid']) + return variant + except db.Variant.DoesNotExist: + return {} def get_coverage_for_bases(db, xstart, xstop=None): diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index b4484cb6b..e1cf5f83c 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -1,6 +1,14 @@ +''' +Tests for the functions available in lookups.py +''' + import lookups def test_get_gene(): + ''' + Test get_gene() + ''' + # normal entry expected = {'id': 1, 'reference_set': 1, 'gene_id': 'ENSG00000223972', @@ -21,12 +29,17 @@ def test_get_gene(): assert result['chrom'] == expected['chrom'] assert result['start'] == expected['start_pos'] assert result['strand'] == expected['strand'] - + + # non-existing result = lookups.get_gene('NOT_A_GENE') assert not result def test_get_gene_by_name(): + ''' + Test get_gene_by_name() + ''' + # normal entry expected = {'id': 1, 'reference_set': 1, 'gene_id': 'ENSG00000223972', @@ -51,14 +64,21 @@ def test_get_gene_by_name(): # result = lookups.get_gene_by_name('NOT_A_GENE') # assert not result # NOC2L -# result = lookups.get_gene_by_name('NOC2L') -# result = lookups.get_gene_by_name('NIR') -# print(result) -# assert False -# result = lookups.get_gene_by_name('Z') + result = lookups.get_gene_by_name('NOC2L') + assert result['gene_id'] == 'ENSG00000188976' + result = lookups.get_gene_by_name('NIR') + result = lookups.get_gene_by_name('Z') + # non-existing + assert not lookups.get_gene_by_name('INCORRECT') + + def test_get_transcript(): + ''' + Test get_transcript() + ''' + # normal entry expected = {'id': 5, 'transcript_id': 'ENST00000438504', 'gene': '2', @@ -92,10 +112,37 @@ def test_get_transcript(): assert result['strand'] == expected['strand'] assert result['exons'] == exp_exon + # non-existing assert not lookups.get_transcript('INCORRECT') +def test_get_raw_variant(): + ''' + Test get_raw_variant + ''' + result = lookups.get_raw_variant(55500283, '1', 'A', 'T') + assert result['genes'] == ['ENSG00000169174'] + assert result['transcripts'] == ['ENST00000302118'] + assert not lookups.get_raw_variant(55500281, '1', 'A', 'T') + + +def test_get_variant(): + ''' + Test get_variant() + ''' + result = lookups.get_variant(55500283, '1', 'A', 'T') + assert result['genes'] == ['ENSG00000169174'] + assert result['transcripts'] == ['ENST00000302118'] + assert result['rsid'] == [75050571] + # need to add test for entry with missing rsid + # too slow query atm + assert not lookups.get_variant(55500281, '1', 'A', 'T') + + def test_get_exons_in_transcript(): + ''' + Test get_exons_in_transcript() + ''' result = lookups.get_exons_in_transcript(5) expected = [{'id': 28, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14364, 'stop': 14830, 'strand': '-', 'feature_type': 'exon'}, {'id': 27, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14971, 'stop': 15039, 'strand': '-', 'feature_type': 'exon'}, From 1ba5c04106e92562dffab547a041d3194e8b92eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 7 Jan 2019 14:44:40 +0100 Subject: [PATCH 006/170] coverage --- backend/modules/browser/lookups.py | 97 +++++++++++-------------- backend/modules/browser/test_lookups.py | 25 ++++++- 2 files changed, 67 insertions(+), 55 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index c2ef98043..ea37ab058 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -5,6 +5,44 @@ SEARCH_LIMIT = 10000 + +def get_coverage_for_bases(chrom, start_pos, stop_pos=None): + """ + Get the coverage for the list of bases given by start_pos->xstop_pos, inclusive + Args: + chrom (str): chromosome + start_pos (int): first position of interest + end_pos (int): last position of interest; if None it will be set to start_pos + Returns: + list: coverage dicts for the region of interest + """ + if stop_pos is None: + stop_pos = start_pos + + return [values for values in db.Coverage.select().where((db.Coverage.pos >= start_pos) & + (db.Coverage.pos <= stop_pos) & + (db.Coverage.chrom == chrom)).dicts()] + + +def get_coverage_for_transcript(chrom, start_pos, stop_pos=None): + """ + Get the coverage for the list of bases given by start_pos->xstop_pos, inclusive + Args: + chrom (str): chromosome + start_pos (int): first position of interest + end_pos (int): last position of interest; if None it will be set to start_pos + Returns: + list: coverage dicts for the region of interest + """ + coverage_array = get_coverage_for_bases(db, xstart, xstop) + # only return coverages that have coverage (if that makes any sense?) + # return coverage_array + covered = [c for c in coverage_array if c['has_coverage']] + for c in covered: + del c['has_coverage'] + return covered + + def get_gene(gene_id): """ Retrieve gene by gene id @@ -74,10 +112,10 @@ def get_raw_variant(pos, chrom, ref, alt): dict: values for the variant; empty if not found """ try: - return db.Variant.select().where(db.Variant.pos == pos, - db.Variant.ref == ref, - db.Variant.alt == alt, - db.Variant.chrom == chrom).dicts().get() + return db.Variant.select().where((db.Variant.pos == pos) & + (db.Variant.ref == ref) & + (db.Variant.alt == alt) & + (db.Variant.chrom == chrom)).dicts().get() except db.Variant.DoesNotExist: return {} @@ -99,8 +137,8 @@ def get_variant(pos, chrom, ref, alt): if not variant or 'rsid' not in variant: return variant if variant['rsid'] == '.' or variant['rsid'] is None: - rsid = db.dbsnp.select().where(db.snp.pos==pos, - db.snp.chrom==chrom).dicts().get() + rsid = db.dbsnp.select().where((db.snp.pos==pos) & + (db.snp.chrom==chrom)).dicts().get() if rsid: variant['rsid'] = 'rs{}'.format(rsid['rsid']) return variant @@ -108,53 +146,6 @@ def get_variant(pos, chrom, ref, alt): return {} -def get_coverage_for_bases(db, xstart, xstop=None): - """ - Get the coverage for the list of bases given by xstart->xstop, inclusive - Returns list of coverage dicts - xstop can be None if just one base, but you'll still get back a list - """ - if xstop is None: - xstop = xstart - - coverages = { - doc['xpos']: doc for doc in db.base_coverage.find( - {'xpos': {'$gte': xstart, '$lte': xstop}}, - projection={'_id': False} - ) - } - ret = [] - # We only store every 10'th base in the db, so we have to make the checks - # only then. - for i in range(xstart-xstart%10, xstop+1, 10): - if i in coverages: - ret.append(coverages[i]) - else: - ret.append({'xpos': i, 'pos': xpos_to_pos(i)}) - for item in ret: - item['has_coverage'] = 'mean' in item - del item['xpos'] - return ret - - -def get_coverage_for_transcript(db, xstart, xstop=None): - """ - - :param db: - :param genomic_coord_to_exon: - :param xstart: - :param xstop: - :return: - """ - coverage_array = get_coverage_for_bases(db, xstart, xstop) - # only return coverages that have coverage (if that makes any sense?) - # return coverage_array - covered = [c for c in coverage_array if c['has_coverage']] - for c in covered: - del c['has_coverage'] - return covered - - def get_constraint_for_transcript(db, transcript): return db.constraint.find_one({'transcript': transcript}, projection={'_id': False}) diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index e1cf5f83c..303eaa478 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -4,6 +4,27 @@ import lookups + +def test_get_coverage_for_bases(): + ''' + Test get_coverage_for_bases() + ''' + coverage = lookups.get_coverage_for_bases('1', 55500283, 55500320) + expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500290, 'mean': 40.66, 'median': 39.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, + {'id': 5474063, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500300, 'mean': 40.7, 'median': 39.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.971, 0.878, 0.132, 0.001]}, + {'id': 5474064, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500310, 'mean': 40.35, 'median': 39.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.995, 0.974, 0.859, 0.138, 0.001]}, + {'id': 5474065, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500320, 'mean': 39.69, 'median': 38.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.961, 0.856, 0.117, 0.001]}] + assert coverage == expected + + def test_get_gene(): ''' Test get_gene() @@ -133,11 +154,11 @@ def test_get_variant(): result = lookups.get_variant(55500283, '1', 'A', 'T') assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] - assert result['rsid'] == [75050571] + assert result['rsid'] == 75050571 # need to add test for entry with missing rsid # too slow query atm assert not lookups.get_variant(55500281, '1', 'A', 'T') - + def test_get_exons_in_transcript(): ''' From 5313802674841490d0fb99575727fdd102d01e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 7 Jan 2019 14:49:16 +0100 Subject: [PATCH 007/170] potentially no longer needed function added --- backend/modules/browser/lookups.py | 6 +++--- backend/modules/browser/test_lookups.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index ea37ab058..afea7a353 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -34,12 +34,12 @@ def get_coverage_for_transcript(chrom, start_pos, stop_pos=None): Returns: list: coverage dicts for the region of interest """ + # Is this function no longer relevant with postgres? + # Only entries with reported cov are in database coverage_array = get_coverage_for_bases(db, xstart, xstop) # only return coverages that have coverage (if that makes any sense?) # return coverage_array - covered = [c for c in coverage_array if c['has_coverage']] - for c in covered: - del c['has_coverage'] + covered = [c for c in coverage_array if c['mean']] return covered diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 303eaa478..c2fbf7870 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -9,7 +9,7 @@ def test_get_coverage_for_bases(): ''' Test get_coverage_for_bases() ''' - coverage = lookups.get_coverage_for_bases('1', 55500283, 55500320) + # coverage = lookups.get_coverage_for_bases('1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, From 43bbf71522e5d8e4d8747e3c1af2bfa546316e09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 7 Jan 2019 16:23:05 +0100 Subject: [PATCH 008/170] in progress for variants_from_transcript --- backend/modules/browser/lookups.py | 76 ++++++++++++------------- backend/modules/browser/test_lookups.py | 72 +++++++++++++++-------- 2 files changed, 83 insertions(+), 65 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index afea7a353..fa5f05b13 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -36,13 +36,24 @@ def get_coverage_for_transcript(chrom, start_pos, stop_pos=None): """ # Is this function no longer relevant with postgres? # Only entries with reported cov are in database - coverage_array = get_coverage_for_bases(db, xstart, xstop) + coverage_array = get_coverage_for_bases(chrom, start_pos, stop_pos) # only return coverages that have coverage (if that makes any sense?) # return coverage_array covered = [c for c in coverage_array if c['mean']] return covered +def get_exons_in_transcript(transcript_dbid): + """ + Retrieve exons associated with the given transcript id + Args: + transcript_dbid: the id of the transcript in the database (Transcript.id; not transcript_id) + Returns: + list: dicts with values for each exon sorted by start position + """ + return sorted(list(db.Feature.select().where(db.Feature.transcript==transcript_dbid).dicts()), key=lambda k: k['start']) + + def get_gene(gene_id): """ Retrieve gene by gene id @@ -93,12 +104,6 @@ def get_transcript(transcript_id): except db.Transcript.DoesNotExist: return {} - transcript = sdb.transcripts.find_one({'transcript_id': transcript_id}, projection={'_id': False}) - if not transcript: - return None - transcript['exons'] = get_exons_in_transcript(sdb, transcript_id) - return transcript - def get_raw_variant(pos, chrom, ref, alt): """ @@ -132,13 +137,13 @@ def get_variant(pos, chrom, ref, alt): Returns: dict: values for the variant; empty if not found """ - try: + try: variant = get_raw_variant(pos, chrom, ref, alt) if not variant or 'rsid' not in variant: return variant if variant['rsid'] == '.' or variant['rsid'] is None: - rsid = db.dbsnp.select().where((db.snp.pos==pos) & - (db.snp.chrom==chrom)).dicts().get() + rsid = db.DbSNP.select().where((db.DbSNP.pos==pos) & + (db.DbSNP.chrom==chrom)).dicts().get() if rsid: variant['rsid'] = 'rs{}'.format(rsid['rsid']) return variant @@ -146,16 +151,27 @@ def get_variant(pos, chrom, ref, alt): return {} -def get_constraint_for_transcript(db, transcript): - return db.constraint.find_one({'transcript': transcript}, projection={'_id': False}) - - -def get_exons_cnvs(db, transcript_name): - return list(db.cnvs.find({'transcript': transcript_name}, projection={'_id': False})) - +def get_variants_in_transcript(transcript_id): + """ + Retrieve variants inside a transcript + Args: + pos (int): position of the variant + chrom (str): name of the chromosome + ref (str): reference sequence + ref (str): variant sequence + Returns: + dict: values for the variant; empty if not found + """ + variants = [] + for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts(): + variants.append(variant) + return variants + variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Feature'] == transcript_id] + add_consequence_to_variant(variant) + remove_extraneous_information(variant) + variants.append(variant) + return variants -def get_cnvs(db, gene_name): - return list(db.cnvgenes.find({'gene': gene_name}, projection={'_id': False})) REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') @@ -316,25 +332,3 @@ def get_number_of_variants_in_transcript(db, transcript_id): total = db.variants.count({'transcripts': transcript_id}) filtered = db.variants.count({'transcripts': transcript_id, 'filter': 'PASS'}) return {'filtered': filtered, 'total': total} - - -def get_variants_in_transcript(db, sdb, transcript_id): - variants = [] - for variant in db.variants.find({'transcripts': transcript_id}, projection={'_id': False}): - variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Feature'] == transcript_id] - add_rsid_to_variant(sdb, variant) - add_consequence_to_variant(variant) - remove_extraneous_information(variant) - variants.append(variant) - return variants - - -def get_exons_in_transcript(transcript_dbid): - """ - Retrieve exons associated with the given transcript id - Args: - transcript_dbid: the id of the transcript in the database (Transcript.id; not transcript_id) - Returns: - list: dicts with values for each exon sorted by start position - """ - return sorted(list(db.Feature.select().where(db.Feature.transcript==transcript_dbid).dicts()), key=lambda k: k['start']) diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index c2fbf7870..72b400b59 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -9,7 +9,7 @@ def test_get_coverage_for_bases(): ''' Test get_coverage_for_bases() ''' - # coverage = lookups.get_coverage_for_bases('1', 55500283, 55500320) + coverage = lookups.get_coverage_for_bases('1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, @@ -25,6 +25,44 @@ def test_get_coverage_for_bases(): assert coverage == expected +def test_get_coverage_for_transcript(): + coverage = lookups.get_coverage_for_transcript('1', 55500283, 55500320) + expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500290, 'mean': 40.66, 'median': 39.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, + {'id': 5474063, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500300, 'mean': 40.7, 'median': 39.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.971, 0.878, 0.132, 0.001]}, + {'id': 5474064, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500310, 'mean': 40.35, 'median': 39.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.995, 0.974, 0.859, 0.138, 0.001]}, + {'id': 5474065, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500320, 'mean': 39.69, 'median': 38.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.961, 0.856, 0.117, 0.001]}] + assert coverage == expected + + +def test_get_exons_in_transcript(): + ''' + Test get_exons_in_transcript() + ''' + result = lookups.get_exons_in_transcript(5) + expected = [{'id': 28, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14364, 'stop': 14830, 'strand': '-', 'feature_type': 'exon'}, + {'id': 27, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14971, 'stop': 15039, 'strand': '-', 'feature_type': 'exon'}, + {'id': 26, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15797, 'stop': 15902, 'strand': '-', 'feature_type': 'exon'}, + {'id': 25, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15905, 'stop': 15948, 'strand': '-', 'feature_type': 'exon'}, + {'id': 24, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16608, 'stop': 16766, 'strand': '-', 'feature_type': 'exon'}, + {'id': 23, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16855, 'stop': 17056, 'strand': '-', 'feature_type': 'exon'}, + {'id': 22, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17234, 'stop': 17365, 'strand': '-', 'feature_type': 'exon'}, + {'id': 21, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17603, 'stop': 17743, 'strand': '-', 'feature_type': 'exon'}, + {'id': 20, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17916, 'stop': 18062, 'strand': '-', 'feature_type': 'exon'}, + {'id': 19, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 18269, 'stop': 18380, 'strand': '-', 'feature_type': 'exon'}, + {'id': 18, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 24739, 'stop': 24892, 'strand': '-', 'feature_type': 'exon'}, + {'id': 17, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 29322, 'stop': 29371, 'strand': '-', 'feature_type': 'exon'}] + print(result) + assert result == expected + + def test_get_gene(): ''' Test get_gene() @@ -93,7 +131,6 @@ def test_get_gene_by_name(): # non-existing assert not lookups.get_gene_by_name('INCORRECT') - def test_get_transcript(): ''' @@ -145,7 +182,7 @@ def test_get_raw_variant(): assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] assert not lookups.get_raw_variant(55500281, '1', 'A', 'T') - + def test_get_variant(): ''' @@ -157,26 +194,13 @@ def test_get_variant(): assert result['rsid'] == 75050571 # need to add test for entry with missing rsid # too slow query atm - assert not lookups.get_variant(55500281, '1', 'A', 'T') - - -def test_get_exons_in_transcript(): + assert not lookups.get_variant(-1, '1', 'A', 'T') + + +def test_get_variants_in_transcript(): ''' - Test get_exons_in_transcript() + Test get_variants_in_transcript() ''' - result = lookups.get_exons_in_transcript(5) - expected = [{'id': 28, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14364, 'stop': 14830, 'strand': '-', 'feature_type': 'exon'}, - {'id': 27, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14971, 'stop': 15039, 'strand': '-', 'feature_type': 'exon'}, - {'id': 26, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15797, 'stop': 15902, 'strand': '-', 'feature_type': 'exon'}, - {'id': 25, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15905, 'stop': 15948, 'strand': '-', 'feature_type': 'exon'}, - {'id': 24, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16608, 'stop': 16766, 'strand': '-', 'feature_type': 'exon'}, - {'id': 23, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16855, 'stop': 17056, 'strand': '-', 'feature_type': 'exon'}, - {'id': 22, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17234, 'stop': 17365, 'strand': '-', 'feature_type': 'exon'}, - {'id': 21, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17603, 'stop': 17743, 'strand': '-', 'feature_type': 'exon'}, - {'id': 20, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17916, 'stop': 18062, 'strand': '-', 'feature_type': 'exon'}, - {'id': 19, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 18269, 'stop': 18380, 'strand': '-', 'feature_type': 'exon'}, - {'id': 18, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 24739, 'stop': 24892, 'strand': '-', 'feature_type': 'exon'}, - {'id': 17, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 29322, 'stop': 29371, 'strand': '-', 'feature_type': 'exon'}] - print(result) - assert result == expected - + res = lookups.get_variants_in_transcript('ENST00000302118') + + assert False From 989d860181ae17c580495d8137601b325400cfc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 8 Jan 2019 10:10:12 +0100 Subject: [PATCH 009/170] a bit of test fixing --- backend/modules/browser/test_lookups.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 72b400b59..36d6c15a8 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -119,18 +119,16 @@ def test_get_gene_by_name(): assert result['start'] == expected['start_pos'] assert result['strand'] == expected['strand'] - # crashing with other_names.contains() -# result = lookups.get_gene_by_name('NOT_A_GENE') -# assert not result - # NOC2L + # non-exist + result = lookups.get_gene_by_name('NOT_A_GENE') + assert not result + + # waiting for fixed db result = lookups.get_gene_by_name('NOC2L') assert result['gene_id'] == 'ENSG00000188976' result = lookups.get_gene_by_name('NIR') result = lookups.get_gene_by_name('Z') - # non-existing - assert not lookups.get_gene_by_name('INCORRECT') - def test_get_transcript(): ''' @@ -202,5 +200,4 @@ def test_get_variants_in_transcript(): Test get_variants_in_transcript() ''' res = lookups.get_variants_in_transcript('ENST00000302118') - - assert False + assert len(res) == 426 From dd0b5a580e2dad9c631031092f0e9368bc080849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 8 Jan 2019 16:10:50 +0100 Subject: [PATCH 010/170] multiple updates to tests and code --- backend/modules/browser/lookups.py | 309 +++++++++++++----------- backend/modules/browser/test_lookups.py | 134 ++++++---- 2 files changed, 252 insertions(+), 191 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index fa5f05b13..30ec43b03 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -1,12 +1,91 @@ import re import db +import logging #from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant SEARCH_LIMIT = 10000 -def get_coverage_for_bases(chrom, start_pos, stop_pos=None): +REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') + +def get_awesomebar_result(dataset, query): + """ + Similar to the above, but this is after a user types enter + We need to figure out what they meant - could be gene, variant, region + + Where datatype is one of 'gene', 'variant', or 'region' + And identifier is one of: + - ensembl ID for gene + - variant ID string for variant (eg. 1-1000-A-T) + - region ID string for region (eg. 1-1000-2000) + + Follow these steps: + - if query is an ensembl ID, return it + - if a gene symbol, return that gene's ensembl ID + - if an RSID, return that variant's string + + Finally, note that we don't return the whole object here - only it's identifier. + This could be important for performance later + + Args: + dataset (str): short name of dataset + query (str): the search query + Returns: + tuple: (datatype, identifier) + """ + query = query.strip() + + # Parse Variant types + variant = get_variants_by_rsid(db, query.lower()) + if not variant: + variant = get_variants_from_dbsnp(db,sdb, query.lower()) + + if variant: + if len(variant) == 1: + retval = ('variant', variant[0]['variant_id']) + else: + retval = ('dbsnp_variant_set', variant[0]['rsid']) + return retval + + gene = get_gene_by_name(sdb, query) + # From here out, all should be uppercase (gene, tx, region, variant_id) + query = query.upper() + if not gene: + gene = get_gene_by_name(sdb, query) + if gene: + return 'gene', gene['gene_id'] + + # Ensembl formatted queries + if query.startswith('ENS'): + # Gene + gene = get_gene(sdb, query) + if gene: + return 'gene', gene['gene_id'] + + # Transcript + transcript = get_transcript(sdb, query) + if transcript: + return 'transcript', transcript['transcript_id'] + + # Region and variant queries + query = query[3:] if query.startswith('CHR') else query + + match = REGION_REGEX.match(query) + if match: + target = match.group(0) + target_type = 'region' + if match.group(2) == ":": + target = target.replace(":","-") + if match.group(5) and set(match.group(4)).issubset(set("ACGT")): + target_type = 'variant' + + return target_type, target + + return 'not_found', query + + +def get_coverage_for_bases(dataset, chrom, start_pos, stop_pos=None): """ Get the coverage for the list of bases given by start_pos->xstop_pos, inclusive Args: @@ -16,12 +95,16 @@ def get_coverage_for_bases(chrom, start_pos, stop_pos=None): Returns: list: coverage dicts for the region of interest """ - if stop_pos is None: - stop_pos = start_pos + dataset_version = db.get_dataset_version(dataset) + print(dataset_version) + return dict(dataset_version) +# if stop_pos is None: +# stop_pos = start_pos - return [values for values in db.Coverage.select().where((db.Coverage.pos >= start_pos) & - (db.Coverage.pos <= stop_pos) & - (db.Coverage.chrom == chrom)).dicts()] +# return [values for values in db.Coverage.select().where((db.Coverage.pos >= start_pos) & +# (db.Coverage.pos <= stop_pos) & +# (db.Coverage.chrom == chrom) & +# (db.Coverage.data)).dicts()] def get_coverage_for_transcript(chrom, start_pos, stop_pos=None): @@ -34,7 +117,7 @@ def get_coverage_for_transcript(chrom, start_pos, stop_pos=None): Returns: list: coverage dicts for the region of interest """ - # Is this function no longer relevant with postgres? + # Is this function still relevant with postgres? # Only entries with reported cov are in database coverage_array = get_coverage_for_bases(chrom, start_pos, stop_pos) # only return coverages that have coverage (if that makes any sense?) @@ -51,24 +134,31 @@ def get_exons_in_transcript(transcript_dbid): Returns: list: dicts with values for each exon sorted by start position """ - return sorted(list(db.Feature.select().where(db.Feature.transcript==transcript_dbid).dicts()), key=lambda k: k['start']) + return sorted(list(db.Feature.select().where((db.Feature.transcript==transcript_dbid) & + (db.Feature.feature_type=='exon')).dicts()), + key=lambda k: k['start']) -def get_gene(gene_id): +def get_gene(dataset, gene_id): """ Retrieve gene by gene id Args: + dataset (str): short name of the dataset gene_id (str): the id of the gene Returns: dict: values for the gene; empty if not found """ + ref_dbid = db.get_reference_dbid_dataset(dataset) + if not ref_dbid: + return {} try: - return db.Gene.select().where(db.Gene.gene_id==gene_id).dicts().get() + return db.Gene.select().where((db.Gene.gene_id == gene_id) & + (db.Gene.reference_set == ref_dbid)).dicts().get() except db.Gene.DoesNotExist: return {} -def get_gene_by_name(gene_name): +def get_gene_by_name(dataset, gene_name): """ Retrieve gene by gene_name. First checks gene_name, then other_names. @@ -77,17 +167,42 @@ def get_gene_by_name(gene_name): Returns: dict: values for the gene; empty if not found """ - # try gene_name field first + ref_dbid = db.get_reference_dbid_dataset(dataset) + if not ref_dbid: + return {} try: return db.Gene.select().where(db.Gene.name==gene_name).dicts().get() except db.Gene.DoesNotExist: try: - # troubles with KeyError return db.Gene.select().where(db.Gene.other_names.contains(gene_name)).dicts().get() except db.Gene.DoesNotExist: return {} +def get_genes_in_region(chrom, start_pos, stop_pos): + """ + Retrieve genes located within a region + Args: + chrom (str): chromosome name + start_pos (int): start of region + stop_pos (int): end of region + Returns: + dict: values for the gene; empty if not found + """ + gene_query = db.Gene.select().where((((db.Gene.start >= start_pos) & + (db.Gene.start <= stop_pos)) | + ((db.Gene.stop >= start_pos) & + (db.Gene.stop <= stop_pos))) & + (db.Gene.chrom == chrom)).dicts() + return [gene for gene in gene_query] + + +def get_number_of_variants_in_transcript(db, transcript_id): + total = db.variants.count({'transcripts': transcript_id}) + filtered = db.variants.count({'transcripts': transcript_id, 'filter': 'PASS'}) + return {'filtered': filtered, 'total': total} + + def get_transcript(transcript_id): """ Retrieve transcript by transcript id @@ -125,6 +240,21 @@ def get_raw_variant(pos, chrom, ref, alt): return {} +def get_transcripts_in_gene(dataset, gene_id): + """ + Get the transcripts associated with a gene + Args: + dataset (str): short name of the reference set + gene_id (str): id of the gene + Returns: + list: transcripts (dict) associated with the gene + """ + ref_dbid = db.get_reference_dbid_dataset(dataset) + gene = db.Gene.select().where((db.Gene.reference_set == ref_dbid) & + (db.Gene.gene_id == gene_id)).dicts().get() + return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene['id']).dicts()] + + def get_variant(pos, chrom, ref, alt): """ Retrieve variant by position and change @@ -151,6 +281,26 @@ def get_variant(pos, chrom, ref, alt): return {} +def get_variants_in_gene(dataset, gene_id): + """ + Retrieve variants present inside a gene + Args: + dataset: short name of the dataset + gene_id (str): id of the gene + Returns: + list: values for the variants + """ + ref_dbid = db.get_reference_dbid_dataset(dataset) +# db.Variant.select().where(db.Variant.gene.contains(re + variants = [] + for variant in db.variants.find({'genes': gene_id}, projection={'_id': False}): + variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Gene'] == gene_id] + add_consequence_to_variant(variant) + remove_extraneous_information(variant) + variants.append(variant) + return variants + + def get_variants_in_transcript(transcript_id): """ Retrieve variants inside a transcript @@ -173,95 +323,7 @@ def get_variants_in_transcript(transcript_id): return variants -REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') - -def get_awesomebar_result(db,sdb, query): - """ - Similar to the above, but this is after a user types enter - We need to figure out what they meant - could be gene, variant, region - - Return tuple of (datatype, identifier) - Where datatype is one of 'gene', 'variant', or 'region' - And identifier is one of: - - ensembl ID for gene - - variant ID string for variant (eg. 1-1000-A-T) - - region ID string for region (eg. 1-1000-2000) - - Follow these steps: - - if query is an ensembl ID, return it - - if a gene symbol, return that gene's ensembl ID - - if an RSID, return that variant's string - - - Finally, note that we don't return the whole object here - only it's identifier. - This could be important for performance later - - """ - query = query.strip() - - # Parse Variant types - variant = get_variants_by_rsid(db, query.lower()) - if not variant: - variant = get_variants_from_dbsnp(db,sdb, query.lower()) - - if variant: - if len(variant) == 1: - retval = ('variant', variant[0]['variant_id']) - else: - retval = ('dbsnp_variant_set', variant[0]['rsid']) - return retval - - gene = get_gene_by_name(sdb, query) - # From here out, all should be uppercase (gene, tx, region, variant_id) - query = query.upper() - if not gene: - gene = get_gene_by_name(sdb, query) - if gene: - return 'gene', gene['gene_id'] - - # Ensembl formatted queries - if query.startswith('ENS'): - # Gene - gene = get_gene(sdb, query) - if gene: - return 'gene', gene['gene_id'] - - # Transcript - transcript = get_transcript(sdb, query) - if transcript: - return 'transcript', transcript['transcript_id'] - - # Region and variant queries - query = query[3:] if query.startswith('CHR') else query - - match = REGION_REGEX.match(query) - if match: - target = match.group(0) - target_type = 'region' - if match.group(2) == ":": - target = target.replace(":","-") - if match.group(5) and set(match.group(4)).issubset(set("ACGT")): - target_type = 'variant' - - return target_type, target - - return 'not_found', query - - -def get_genes_in_region(sdb, chrom, start, stop): - """ - Genes that overlap a region - """ - xstart = get_xpos(chrom, start) - xstop = get_xpos(chrom, stop) - genes = sdb.genes.find({ - 'xstart': {'$lte': xstop}, - 'xstop': {'$gte': xstart}, - }, projection={'_id': False}) - return list(genes) - - -def get_variants_in_region(db, sdb, chrom, start, stop): +def get_variants_in_region(db, chrom, start, stop): """ Variants that overlap a region Unclear if this will include CNVs @@ -278,28 +340,6 @@ def get_variants_in_region(db, sdb, chrom, start, stop): return list(variants) -def get_metrics(db, variant): - if 'allele_count' not in variant or variant['allele_num'] == 0: - return None - metrics = {} - for metric in METRICS: - metrics[metric] = db.metrics.find_one({'metric': metric}, projection={'_id': False}) - - metric = None - if variant['allele_count'] == 1: - metric = 'singleton' - elif variant['allele_count'] == 2: - metric = 'doubleton' - else: - for af in AF_BUCKETS: - if float(variant['allele_count'])/variant['allele_num'] < af: - metric = af - break - if metric is not None: - metrics['Site Quality'] = db.metrics.find_one({'metric': 'binned_%s' % metric}, projection={'_id': False}) - return metrics - - def remove_extraneous_information(variant): #del variant['genotype_depths'] #del variant['genotype_qualities'] @@ -311,24 +351,3 @@ def remove_extraneous_information(variant): del variant['xstop'] del variant['site_quality'] del variant['vep_annotations'] - - -def get_variants_in_gene(db, sdb, gene_id): - variants = [] - for variant in db.variants.find({'genes': gene_id}, projection={'_id': False}): - variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Gene'] == gene_id] - add_rsid_to_variant(sdb, variant) - add_consequence_to_variant(variant) - remove_extraneous_information(variant) - variants.append(variant) - return variants - - -def get_transcripts_in_gene(sdb, gene_id): - return list(sdb.transcripts.find({'gene_id': gene_id}, projection={'_id': False})) - - -def get_number_of_variants_in_transcript(db, transcript_id): - total = db.variants.count({'transcripts': transcript_id}) - filtered = db.variants.count({'transcripts': transcript_id, 'filter': 'PASS'}) - return {'filtered': filtered, 'total': total} diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 36d6c15a8..4680bc849 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -1,15 +1,23 @@ -''' +""" Tests for the functions available in lookups.py -''' +""" import lookups +def test_get_awesomebar_result(): + """ + Test get_awesomebar_result() + """ + pass + + def test_get_coverage_for_bases(): - ''' + """ Test get_coverage_for_bases() - ''' - coverage = lookups.get_coverage_for_bases('1', 55500283, 55500320) + """ + coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500320) + print(type(coverage)) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, @@ -26,7 +34,7 @@ def test_get_coverage_for_bases(): def test_get_coverage_for_transcript(): - coverage = lookups.get_coverage_for_transcript('1', 55500283, 55500320) + # coverage = lookups.get_coverage_for_transcript('1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, @@ -43,30 +51,33 @@ def test_get_coverage_for_transcript(): def test_get_exons_in_transcript(): - ''' + """ Test get_exons_in_transcript() - ''' - result = lookups.get_exons_in_transcript(5) - expected = [{'id': 28, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14364, 'stop': 14830, 'strand': '-', 'feature_type': 'exon'}, - {'id': 27, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14971, 'stop': 15039, 'strand': '-', 'feature_type': 'exon'}, - {'id': 26, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15797, 'stop': 15902, 'strand': '-', 'feature_type': 'exon'}, - {'id': 25, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15905, 'stop': 15948, 'strand': '-', 'feature_type': 'exon'}, - {'id': 24, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16608, 'stop': 16766, 'strand': '-', 'feature_type': 'exon'}, - {'id': 23, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16855, 'stop': 17056, 'strand': '-', 'feature_type': 'exon'}, - {'id': 22, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17234, 'stop': 17365, 'strand': '-', 'feature_type': 'exon'}, - {'id': 21, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17603, 'stop': 17743, 'strand': '-', 'feature_type': 'exon'}, - {'id': 20, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17916, 'stop': 18062, 'strand': '-', 'feature_type': 'exon'}, - {'id': 19, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 18269, 'stop': 18380, 'strand': '-', 'feature_type': 'exon'}, - {'id': 18, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 24739, 'stop': 24892, 'strand': '-', 'feature_type': 'exon'}, - {'id': 17, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 29322, 'stop': 29371, 'strand': '-', 'feature_type': 'exon'}] - print(result) + """ + result = lookups.get_exons_in_transcript(28186) + expected = [{'id': 326403, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202047893, 'stop': 202048032, 'strand': '+', 'feature_type': 'exon'}, + {'id': 326404, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202050495, 'stop': 202050848, 'strand': '+', 'feature_type': 'exon'}, + {'id': 326406, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202052430, 'stop': 202052523, 'strand': '+', 'feature_type': 'exon'}, + {'id': 326408, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202057708, 'stop': 202057843, 'strand': '+', 'feature_type': 'exon'}, + {'id': 326410, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202060566, 'stop': 202060672, 'strand': '+', 'feature_type': 'exon'}, + {'id': 326412, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202072799, 'stop': 202072907, 'strand': '+', 'feature_type': 'exon'}, + {'id': 326414, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202073794, 'stop': 202074286, 'strand': '+', 'feature_type': 'exon'}, + {'id': 326416, 'gene': 8600, 'transcript': 28186, 'chrom': '2', + 'start': 202082312, 'stop': 202084804, 'strand': '+', 'feature_type': 'exon'}] assert result == expected def test_get_gene(): - ''' + """ Test get_gene() - ''' + """ # normal entry expected = {'id': 1, 'reference_set': 1, @@ -77,7 +88,7 @@ def test_get_gene(): 'chrom': '1', 'start_pos': 11870, 'strand': '+'} - result = lookups.get_gene('ENSG00000223972') + result = lookups.get_gene('SweGen', 'ENSG00000223972') print(result) assert result['id'] == expected['id'] assert result['reference_set'] == expected['reference_set'] @@ -89,15 +100,19 @@ def test_get_gene(): assert result['start'] == expected['start_pos'] assert result['strand'] == expected['strand'] - # non-existing - result = lookups.get_gene('NOT_A_GENE') + # non-existing gene + result = lookups.get_gene('SweGen', 'NOT_A_GENE') + assert not result + + # non-existing dataset + result = lookups.get_gene('NoDataset', 'ENSG00000223972') assert not result def test_get_gene_by_name(): - ''' + """ Test get_gene_by_name() - ''' + """ # normal entry expected = {'id': 1, 'reference_set': 1, @@ -108,7 +123,7 @@ def test_get_gene_by_name(): 'chrom': '1', 'start_pos': 11870, 'strand': '+'} - result = lookups.get_gene_by_name('DDX11L1') + result = lookups.get_gene_by_name('SweGen', 'DDX11L1') assert result['id'] == expected['id'] assert result['reference_set'] == expected['reference_set'] assert result['gene_id'] == expected['gene_id'] @@ -119,21 +134,33 @@ def test_get_gene_by_name(): assert result['start'] == expected['start_pos'] assert result['strand'] == expected['strand'] - # non-exist - result = lookups.get_gene_by_name('NOT_A_GENE') + # non-existing gene + result = lookups.get_gene_by_name('SweGen', 'NOT_A_GENE') + assert not result + + # non-existing dataset + result = lookups.get_gene_by_name('NoDataset', 'ENSG00000223972') assert not result - # waiting for fixed db - result = lookups.get_gene_by_name('NOC2L') + # name in other_names + result = lookups.get_gene_by_name('SweGen', 'NIR') assert result['gene_id'] == 'ENSG00000188976' - result = lookups.get_gene_by_name('NIR') - result = lookups.get_gene_by_name('Z') + + +def test_get_genes_in_region(): + """ + Test get_genes_in_region() + """ + res = lookups.get_genes_in_region('4', 99080000, 99210000) + # stop_pos missing in db, so needs to be updated when available + # exp_names = + assert False def test_get_transcript(): - ''' + """ Test get_transcript() - ''' + """ # normal entry expected = {'id': 5, 'transcript_id': 'ENST00000438504', @@ -172,10 +199,24 @@ def test_get_transcript(): assert not lookups.get_transcript('INCORRECT') +def test_get_transcripts_in_gene(): + """ + Test get_transcripts_in_gene() + """ + res = lookups.get_transcripts_in_gene('SweGen', 'ENSG00000241670') + expected = [{'id': 39, 'transcript_id': 'ENST00000424429', 'gene': 19, + 'mim_gene_accession': None, 'mim_annotation': None, + 'chrom': '1', 'start': 228293, 'stop': 228655, 'strand': '-'}, + {'id': 40, 'transcript_id': 'ENST00000450734', 'gene': 19, + 'mim_gene_accession': None, 'mim_annotation': None, + 'chrom': '1', 'start': 228320, 'stop': 228776, 'strand': '-'}] + assert res == expected + + def test_get_raw_variant(): - ''' + """ Test get_raw_variant - ''' + """ result = lookups.get_raw_variant(55500283, '1', 'A', 'T') assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] @@ -183,9 +224,9 @@ def test_get_raw_variant(): def test_get_variant(): - ''' + """ Test get_variant() - ''' + """ result = lookups.get_variant(55500283, '1', 'A', 'T') assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] @@ -196,8 +237,9 @@ def test_get_variant(): def test_get_variants_in_transcript(): - ''' + """ Test get_variants_in_transcript() - ''' - res = lookups.get_variants_in_transcript('ENST00000302118') - assert len(res) == 426 + """ + # res = lookups.get_variants_in_transcript('ENST00000302118') + # assert len(res) == 426 + assert False From 267d083a84d84cb48367ce556ee57e4da2dcec28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 8 Jan 2019 16:54:05 +0100 Subject: [PATCH 011/170] keep on fixing tests and lookups --- backend/db.py | 33 ++++++++++++++++++++++--- backend/modules/browser/lookups.py | 29 +++++++++++++++++++--- backend/modules/browser/test_lookups.py | 26 ++++++++++++++++++- 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/backend/db.py b/backend/db.py index d3e329c3b..eac8e215c 100644 --- a/backend/db.py +++ b/backend/db.py @@ -464,20 +464,30 @@ def get_next_free_uid(): return next_uid + def get_admin_datasets(user): return DatasetAccess.select().where( DatasetAccess.user == user, DatasetAccess.is_admin) + def get_dataset(dataset): dataset = Dataset.select().where( Dataset.short_name == dataset).get() return dataset + def get_dataset_version(dataset, version=None): + """ + Given dataset get DatasetVersion + Args: + dataset (str): short name of the dataset + Returns: + DatasetVersionCurrent: the corresponding DatasetVersion entry + """ if version: dataset_version = (DatasetVersion - .select(DatasetVersion, Dataset) - .join(Dataset) - .where(DatasetVersion.version == version, - Dataset.short_name == dataset)).get() + .select(DatasetVersion, Dataset) + .join(Dataset) + .where(DatasetVersion.version == version, + Dataset.short_name == dataset)).get() else: dataset_version = (DatasetVersionCurrent .select(DatasetVersionCurrent, Dataset) @@ -485,6 +495,21 @@ def get_dataset_version(dataset, version=None): .where(Dataset.short_name == dataset)).get() return dataset_version + +def get_reference_dbid_dataset(dataset): + """ + Get the database id of the associated reference set for a dataset + Args: + dataset (str): short name of the dataset + Returns: + int: id of the associated reference set; returns None if not available + """ + try: + return Dataset.select().where(Dataset.short_name==dataset).dicts().get()['reference_set'] + except Dataset.DoesNotExist: + return None + + def build_dict_from_row(row): d = {} diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 30ec43b03..b4f53dd54 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -7,6 +7,18 @@ SEARCH_LIMIT = 10000 +def add_rsid_to_variant(variant): + """ + Add rsid to a variant in the database + Args: + variant (dict): values for a variant + """ + if variant['rsid'] == '.' or variant['rsid'] is None: + rsid = db.DbSNP.select().where(db.DbSNP.pos == variant['pos']).dicts().get() + if rsid: + variant['rsid'] = 'rs{}'.format(rsid['rsid']) + + REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') def get_awesomebar_result(dataset, query): @@ -272,15 +284,24 @@ def get_variant(pos, chrom, ref, alt): if not variant or 'rsid' not in variant: return variant if variant['rsid'] == '.' or variant['rsid'] is None: - rsid = db.DbSNP.select().where((db.DbSNP.pos==pos) & - (db.DbSNP.chrom==chrom)).dicts().get() - if rsid: - variant['rsid'] = 'rs{}'.format(rsid['rsid']) + add_rsid_to_variant(variant) return variant except db.Variant.DoesNotExist: return {} +def get_variants_by_rsid(db, rsid): + if not rsid.startswith('rs'): + return None + try: + int(rsid.lstrip('rs')) + except ValueError: + return None + variants = list(db.variants.find({'rsid': rsid}, projection={'_id': False})) + add_consequence_to_variants(variants) + return variants + + def get_variants_in_gene(dataset, gene_id): """ Retrieve variants present inside a gene diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 4680bc849..233d71593 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -5,11 +5,19 @@ import lookups +def test_add_rsid_to_variant(): + """ + Test add_rsid_to_variant() + """ + variant = '' + assert False + + def test_get_awesomebar_result(): """ Test get_awesomebar_result() """ - pass + assert False def test_get_coverage_for_bases(): @@ -231,8 +239,24 @@ def test_get_variant(): assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] assert result['rsid'] == 75050571 + + # missing rsid in result, multiple transcripts + # slow, need to fix db + # result = lookups.get_variant(47730411, '21', 'TA', 'T') + assert result['genes'] == ['ENSG00000160298'] + assert result['transcripts'] == ['ENST00000417060', 'ENST00000397682', + 'ENST00000397683', 'ENST00000397680', + 'ENST00000397685', 'ENST00000397679', + 'ENST00000291691', 'ENST00000445935', + 'ENST00000491666', 'ENST00000472607', + 'ENST00000475776'] + assert result['rsid'] == 75050571 + + # need to add test for entry with missing rsid # too slow query atm + + # incorrect position assert not lookups.get_variant(-1, '1', 'A', 'T') From 20ef941f1e3ef5736a9c45031ecbf0eb4b9f9ab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 9 Jan 2019 16:41:48 +0100 Subject: [PATCH 012/170] more updates, focused on logging, adding version as a parameter, and many other things --- backend/modules/browser/lookups.py | 117 +++++++++++++++++------- backend/modules/browser/test_lookups.py | 25 +++-- 2 files changed, 101 insertions(+), 41 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index b4f53dd54..7924f1669 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -7,16 +7,32 @@ SEARCH_LIMIT = 10000 -def add_rsid_to_variant(variant): +def add_rsid_to_variant(dataset, variant): """ Add rsid to a variant in the database Args: + dataset (str): short name of the dataset variant (dict): values for a variant """ + refset = (db.Dataset + .select(db.ReferenceSet) + .join(db.ReferenceSet) + .where(db.Dataset.short_name == dataset) + .dicts() + .get()) + dbsnp_version = refset['dbsnp_version'] + if variant['rsid'] == '.' or variant['rsid'] is None: - rsid = db.DbSNP.select().where(db.DbSNP.pos == variant['pos']).dicts().get() + rsid = (db.DbSNP + .select() + .where((db.DbSNP.pos == variant['pos']) & + (db.DbSNP.version == dbsnp_version)) + .dicts() + .get()) if rsid: variant['rsid'] = 'rs{}'.format(rsid['rsid']) + else: + logging.error('add_rsid_to_variant({}, {}): unable to retrieve rsid'.format(dataset, variant)) REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') @@ -97,26 +113,35 @@ def get_awesomebar_result(dataset, query): return 'not_found', query -def get_coverage_for_bases(dataset, chrom, start_pos, stop_pos=None): +def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=None): """ - Get the coverage for the list of bases given by start_pos->xstop_pos, inclusive + Get the coverage for the list of bases given by start_pos->end_pos, inclusive Args: + dataset (str): short name for the dataset chrom (str): chromosome start_pos (int): first position of interest end_pos (int): last position of interest; if None it will be set to start_pos + ds_version (str): version of the dataset Returns: - list: coverage dicts for the region of interest + list: coverage dicts for the region of interest: None if unable to retrieve """ - dataset_version = db.get_dataset_version(dataset) - print(dataset_version) - return dict(dataset_version) -# if stop_pos is None: -# stop_pos = start_pos + dataset_version = db.get_dataset_version(dataset, ds_version) + if not dataset_version: + return -# return [values for values in db.Coverage.select().where((db.Coverage.pos >= start_pos) & -# (db.Coverage.pos <= stop_pos) & -# (db.Coverage.chrom == chrom) & -# (db.Coverage.data)).dicts()] + if end_pos is None: + end_pos = start_pos + try: + return [values for values in (db.Coverage + .select() + .where((db.Coverage.pos >= start_pos) & + (db.Coverage.pos <= end_pos) & + (db.Coverage.chrom == chrom) & + (db.Coverage.dataset_version == dataset_version.id)) + .dicts())] + except db.Coverage.DoesNotExist: + logging.error('get_coverage_for_bases({}, {}, {}, {}): '.format(dataset, chrom, start_pos, end_pos)) + return def get_coverage_for_transcript(chrom, start_pos, stop_pos=None): @@ -188,6 +213,7 @@ def get_gene_by_name(dataset, gene_name): try: return db.Gene.select().where(db.Gene.other_names.contains(gene_name)).dicts().get() except db.Gene.DoesNotExist: + logging.error('get_gene_by_name({}, {}): unable to retrieve gene'.format(dataset, gene_name)) return {} @@ -201,12 +227,15 @@ def get_genes_in_region(chrom, start_pos, stop_pos): Returns: dict: values for the gene; empty if not found """ - gene_query = db.Gene.select().where((((db.Gene.start >= start_pos) & - (db.Gene.start <= stop_pos)) | - ((db.Gene.stop >= start_pos) & - (db.Gene.stop <= stop_pos))) & - (db.Gene.chrom == chrom)).dicts() - return [gene for gene in gene_query] + try: + gene_query = db.Gene.select().where((((db.Gene.start >= start_pos) & + (db.Gene.start <= stop_pos)) | + ((db.Gene.stop >= start_pos) & + (db.Gene.stop <= stop_pos))) & + (db.Gene.chrom == chrom)).dicts() + return [gene for gene in gene_query] + except db.Gene.DoesNotExist: + logging.error('get_genes_in_region({}, {}, {}): no genes found'.format(chrom, start_pos, stop_pos)) def get_number_of_variants_in_transcript(db, transcript_id): @@ -232,23 +261,36 @@ def get_transcript(transcript_id): return {} -def get_raw_variant(pos, chrom, ref, alt): +def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): """ Retrieve variant by position and change Args: + dataset (str): short name of the reference set pos (int): position of the variant chrom (str): name of the chromosome ref (str): reference sequence - ref (str): variant sequence + alt (str): variant sequence + ds_version (str): dataset version Returns: dict: values for the variant; empty if not found """ + dataset_version = db.get_dataset_version(dataset, ds_version) + if not dataset_version: + return + try: - return db.Variant.select().where((db.Variant.pos == pos) & - (db.Variant.ref == ref) & - (db.Variant.alt == alt) & - (db.Variant.chrom == chrom)).dicts().get() + return (db.Variant + .select() + .where((db.Variant.pos == pos) & + (db.Variant.ref == ref) & + (db.Variant.alt == alt) & + (db.Variant.chrom == chrom) & + (db.Variant.dataset_version == dataset_version.id)) + .dicts() + .get()) except db.Variant.DoesNotExist: + logging.error(('get_raw_variant({}, {}, {}, {}, {}, {})'.format(dataset, pos, chrom, ref, alt, ds_version) + + ': unable to retrieve variant')) return {} @@ -259,19 +301,27 @@ def get_transcripts_in_gene(dataset, gene_id): dataset (str): short name of the reference set gene_id (str): id of the gene Returns: - list: transcripts (dict) associated with the gene + list: transcripts (dict) associated with the gene; empty if no hits """ ref_dbid = db.get_reference_dbid_dataset(dataset) - gene = db.Gene.select().where((db.Gene.reference_set == ref_dbid) & - (db.Gene.gene_id == gene_id)).dicts().get() - return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene['id']).dicts()] + if not ref_dbid: + logging.error('get_transcripts_in_gene({}, {}): unable to get referenceset dbid'.format(dataset, gene_id)) + return [] + try: + gene = db.Gene.select().where((db.Gene.reference_set == ref_dbid) & + (db.Gene.gene_id == gene_id)).dicts().get() + return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene['id']).dicts()] + except db.Gene.DoesNotExist or db.Transcript.DoesNotExist: + logging.error('get_transcripts_in_gene({}, {}): unable to retrieve gene or transcript'.format(dataset, gene_id)) + return [] -def get_variant(pos, chrom, ref, alt): +def get_variant(dataset, pos, chrom, ref, alt): """ Retrieve variant by position and change Retrieves rsid from db (if available) if not present in variant Args: + dataset (str): short name of the dataset pos (int): position of the variant chrom (str): name of the chromosome ref (str): reference sequence @@ -280,11 +330,14 @@ def get_variant(pos, chrom, ref, alt): dict: values for the variant; empty if not found """ try: - variant = get_raw_variant(pos, chrom, ref, alt) + variant = get_raw_variant(dataset, pos, chrom, ref, alt) if not variant or 'rsid' not in variant: return variant if variant['rsid'] == '.' or variant['rsid'] is None: add_rsid_to_variant(variant) + else: + if str(variant['rsid'])[:2] != 'rs': + variant['rsid'] = 'rs{}'.format(variant['rsid']) return variant except db.Variant.DoesNotExist: return {} diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 233d71593..6b35e63e6 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -9,8 +9,16 @@ def test_add_rsid_to_variant(): """ Test add_rsid_to_variant() """ - variant = '' - assert False + # "with ." + variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') + rsid = variant['rsid'] + variant['rsid'] = '.' + # lookups.add_rsid_to_variant('SweGen', variant) + assert variant['rsid'] == rsid + # "non-existing" + del variant['rsid'] + # lookups.add_rsid_to_variant(variant) + assert variant['rsid'] == rsid def test_get_awesomebar_result(): @@ -24,8 +32,7 @@ def test_get_coverage_for_bases(): """ Test get_coverage_for_bases() """ - coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500320) - print(type(coverage)) + # coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, @@ -42,7 +49,7 @@ def test_get_coverage_for_bases(): def test_get_coverage_for_transcript(): - # coverage = lookups.get_coverage_for_transcript('1', 55500283, 55500320) + coverage = lookups.get_coverage_for_transcript('1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, @@ -225,20 +232,20 @@ def test_get_raw_variant(): """ Test get_raw_variant """ - result = lookups.get_raw_variant(55500283, '1', 'A', 'T') + result = lookups.get_raw_variant('SweGen', 55500283, '1', 'A', 'T') assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] - assert not lookups.get_raw_variant(55500281, '1', 'A', 'T') + assert not lookups.get_raw_variant('SweGen', 55500281, '1', 'A', 'T') def test_get_variant(): """ Test get_variant() """ - result = lookups.get_variant(55500283, '1', 'A', 'T') + result = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] - assert result['rsid'] == 75050571 + assert result['rsid'] == 'rs75050571' # missing rsid in result, multiple transcripts # slow, need to fix db From b3e0305b4c7c068bb24966d25175dfbc175a55e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 10 Jan 2019 10:29:42 +0100 Subject: [PATCH 013/170] fixes and tests for get_variants_by_rsid --- backend/modules/browser/lookups.py | 44 ++++++++++++++++++------- backend/modules/browser/test_lookups.py | 24 +++++++++++++- 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 7924f1669..ad7cde7cc 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -2,7 +2,7 @@ import db import logging -#from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant +from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant SEARCH_LIMIT = 10000 @@ -39,11 +39,10 @@ def add_rsid_to_variant(dataset, variant): def get_awesomebar_result(dataset, query): """ - Similar to the above, but this is after a user types enter - We need to figure out what they meant - could be gene, variant, region + Parse the search input - Where datatype is one of 'gene', 'variant', or 'region' - And identifier is one of: + Datatype is one of 'gene', 'variant', or 'region' + Identifier is one of: - ensembl ID for gene - variant ID string for variant (eg. 1-1000-A-T) - region ID string for region (eg. 1-1000-2000) @@ -336,22 +335,43 @@ def get_variant(dataset, pos, chrom, ref, alt): if variant['rsid'] == '.' or variant['rsid'] is None: add_rsid_to_variant(variant) else: - if str(variant['rsid'])[:2] != 'rs': + if not str(variant['rsid']).startswith('rs'): variant['rsid'] = 'rs{}'.format(variant['rsid']) return variant except db.Variant.DoesNotExist: return {} -def get_variants_by_rsid(db, rsid): +def get_variants_by_rsid(dataset, rsid, ds_version=None): + """ + Retrieve variants by their associated rsid + Args: + dataset (str): short name of dataset + rsid (str): rsid of the variant (starting with rs) + ds_version (str): version of the dataset + Returns: + list: variant dicts; no hits + """ + dataset_version = db.get_dataset_version(dataset, ds_version) + if not dataset_version: + return + if not rsid.startswith('rs'): - return None + logging.error('get_variants_by_rsid({}, {}): rsid not starting with rs'.format(dataset, rsid)) + return + try: - int(rsid.lstrip('rs')) + rsid = int(rsid.lstrip('rs')) except ValueError: - return None - variants = list(db.variants.find({'rsid': rsid}, projection={'_id': False})) - add_consequence_to_variants(variants) + logging.error('get_variants_by_rsid({}, {}): not an integer after rs'.format(dataset, rsid)) + return + query = (db.Variant + .select() + .where((db.Variant.rsid == rsid) & + (db.Variant.dataset_version == dataset_version)) + .dicts()) + variants = [variant for variant in query] + # add_consequence_to_variants(variants) return variants diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 6b35e63e6..4e1fa629c 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -259,7 +259,6 @@ def test_get_variant(): 'ENST00000475776'] assert result['rsid'] == 75050571 - # need to add test for entry with missing rsid # too slow query atm @@ -267,6 +266,29 @@ def test_get_variant(): assert not lookups.get_variant(-1, '1', 'A', 'T') +def test_get_variants_by_rsid(caplog): + ''' + Test get_variants_by_rsid() + ''' + # normal + result = lookups.get_variants_by_rsid('SweGen', 'rs373706802') + assert result[0]['genes'] == ['ENSG00000229286', 'ENSG00000235265'] + assert result[0]['transcripts'] == ['ENST00000448070','ENST00000413156'] + + # errors + assert lookups.get_variants_by_rsid('incorrect_name', 'rs373706802') is None + assert lookups.get_variants_by_rsid('SweGen', '373706802') is None + assert lookups.get_variants_by_rsid('SweGen', 'rs3737o68o2') is None + expected = ('get_dataset_version(incorrect_name, version=None): cannot retrieve dataset version', + 'get_variants_by_rsid(SweGen, 373706802): rsid not starting with rs', + 'get_variants_by_rsid(SweGen, rs3737o68o2): not an integer after rs') + for comparison in zip(caplog.messages, expected): + assert comparison[0] == comparison[1] + + # no variants with rsid available + assert not lookups.get_variants_by_rsid('SweGen', 'rs1') + + def test_get_variants_in_transcript(): """ Test get_variants_in_transcript() From 8f73f677390a17e89e6c74675e05dfb71141ab68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 10 Jan 2019 13:59:13 +0100 Subject: [PATCH 014/170] get_variants_in region migrated, many other fixes in tests and code --- backend/modules/browser/lookups.py | 174 ++++++++++++++++-------- backend/modules/browser/test_lookups.py | 48 +++++-- 2 files changed, 150 insertions(+), 72 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index ad7cde7cc..2346cda1e 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -2,14 +2,15 @@ import db import logging -from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant +# from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant SEARCH_LIMIT = 10000 def add_rsid_to_variant(dataset, variant): """ - Add rsid to a variant in the database + Add rsid to a variant in the database based on position + Note that this may be inaccurate Args: dataset (str): short name of the dataset variant (dict): values for a variant @@ -23,16 +24,20 @@ def add_rsid_to_variant(dataset, variant): dbsnp_version = refset['dbsnp_version'] if variant['rsid'] == '.' or variant['rsid'] is None: - rsid = (db.DbSNP - .select() - .where((db.DbSNP.pos == variant['pos']) & - (db.DbSNP.version == dbsnp_version)) - .dicts() - .get()) - if rsid: - variant['rsid'] = 'rs{}'.format(rsid['rsid']) - else: - logging.error('add_rsid_to_variant({}, {}): unable to retrieve rsid'.format(dataset, variant)) + try: + rsid = (db.DbSNP + .select() + .where((db.DbSNP.pos == variant['pos']) & + (db.DbSNP.chrom == variant['chrom']) & + (db.DbSNP.version == dbsnp_version)) + .dicts() + .get()) + if rsid: + variant['rsid'] = 'rs{}'.format(rsid['rsid']) + else: + logging.error('add_rsid_to_variant({}, variant[dbid: {}]): unable to retrieve rsid'.format(dataset, variant['id'])) + except db.DbSNP.DoesNotExist: + logging.error('add_rsid_to_variant({}, variant[dbid: {}]): unable to retrieve rsid'.format(dataset, variant['id'])) REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') @@ -143,35 +148,50 @@ def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=N return -def get_coverage_for_transcript(chrom, start_pos, stop_pos=None): +def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_version=None): """ - Get the coverage for the list of bases given by start_pos->xstop_pos, inclusive + Get the coverage for the list of bases given by start_pos->end_pos, inclusive Args: + dataset (str): short name for the dataset chrom (str): chromosome start_pos (int): first position of interest end_pos (int): last position of interest; if None it will be set to start_pos + ds_version (str): version of the dataset Returns: list: coverage dicts for the region of interest """ # Is this function still relevant with postgres? # Only entries with reported cov are in database - coverage_array = get_coverage_for_bases(chrom, start_pos, stop_pos) + coverage_array = get_coverage_for_bases(dataset, chrom, start_pos, end_pos, ds_version) # only return coverages that have coverage (if that makes any sense?) # return coverage_array covered = [c for c in coverage_array if c['mean']] return covered -def get_exons_in_transcript(transcript_dbid): +def get_exons_in_transcript(dataset, transcript_id): """ Retrieve exons associated with the given transcript id Args: - transcript_dbid: the id of the transcript in the database (Transcript.id; not transcript_id) + dataset (str): short name of the dataset + transcript_id (str): the id of the transcript Returns: list: dicts with values for each exon sorted by start position """ - return sorted(list(db.Feature.select().where((db.Feature.transcript==transcript_dbid) & - (db.Feature.feature_type=='exon')).dicts()), + ref_dbid = db.get_reference_dbid_dataset(dataset) + + try: + transcript = (db.Transcript + .select() + .join(db.Gene) + .where((db.Transcript.transcript_id == transcript_id) & + (db.Gene.reference_set == ref_dbid)) + .get()) + except db.Transcript.DoesNotExist: + logging.error('get_exons_in_transcript({}, {}): unable to retrueve transcript'.format(dataset, transcript_id)) + return + return sorted(list(db.Feature.select().where((db.Feature.transcript == transcript) & + (db.Feature.feature_type == 'exon')).dicts()), key=lambda k: k['start']) @@ -237,27 +257,27 @@ def get_genes_in_region(chrom, start_pos, stop_pos): logging.error('get_genes_in_region({}, {}, {}): no genes found'.format(chrom, start_pos, stop_pos)) -def get_number_of_variants_in_transcript(db, transcript_id): - total = db.variants.count({'transcripts': transcript_id}) - filtered = db.variants.count({'transcripts': transcript_id, 'filter': 'PASS'}) - return {'filtered': filtered, 'total': total} - - -def get_transcript(transcript_id): +def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None): """ - Retrieve transcript by transcript id - Also includes exons as ['exons'] + Get the total and filtered amount of variants in a transcript Args: - transcript_id (str): the id of the transcript + dataset (str): short name of the dataset + transcript_id (str): id of the transcript + ds_version (str): version of the dataset Returns: - dict: values for the transcript, including exons; empty if not found + dict: {filtered: nr_filtered, total: nr_total} """ - try: - transcript = db.Transcript.select().where(db.Transcript.transcript_id==transcript_id).dicts().get() - transcript['exons'] = get_exons_in_transcript(transcript['id']) - return transcript - except db.Transcript.DoesNotExist: - return {} + # will be implemented after database is updated + raise NotImplementedError + + dataset_version = db.get_dataset_version() + if not dataset_version: + return + + transcript = db.Transcript.select().where(db.Transcript.transcript_id) + total = db.variants.count({'transcripts': transcript_id}) + filtered = db.variants.count({'transcripts': transcript_id, 'filter': 'PASS'}) + return {'filtered': filtered, 'total': total} def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): @@ -293,6 +313,31 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): return {} +def get_transcript(dataset, transcript_id): + """ + Retrieve transcript by transcript id + Also includes exons as ['exons'] + Args: + dataset (str): short name of the dataset + transcript_id (str): the id of the transcript + Returns: + dict: values for the transcript, including exons; empty if not found + """ + ref_dbid = db.get_reference_dbid_dataset(dataset) + try: + transcript = (db.Transcript + .select() + .join(db.Gene) + .where((db.Transcript.transcript_id == transcript_id) & + (db.Gene.reference_set == ref_dbid)) + .dicts() + .get()) + transcript['exons'] = get_exons_in_transcript(dataset, transcript_id) + return transcript + except db.Transcript.DoesNotExist: + return {} + + def get_transcripts_in_gene(dataset, gene_id): """ Get the transcripts associated with a gene @@ -315,7 +360,7 @@ def get_transcripts_in_gene(dataset, gene_id): return [] -def get_variant(dataset, pos, chrom, ref, alt): +def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): """ Retrieve variant by position and change Retrieves rsid from db (if available) if not present in variant @@ -324,16 +369,17 @@ def get_variant(dataset, pos, chrom, ref, alt): pos (int): position of the variant chrom (str): name of the chromosome ref (str): reference sequence - ref (str): variant sequence + alt (str): variant sequence + ds_version (str): version of the dataset Returns: dict: values for the variant; empty if not found """ try: - variant = get_raw_variant(dataset, pos, chrom, ref, alt) + variant = get_raw_variant(dataset, pos, chrom, ref, alt, ds_version) if not variant or 'rsid' not in variant: return variant if variant['rsid'] == '.' or variant['rsid'] is None: - add_rsid_to_variant(variant) + add_rsid_to_variant(dataset, variant) else: if not str(variant['rsid']).startswith('rs'): variant['rsid'] = 'rs{}'.format(variant['rsid']) @@ -375,6 +421,33 @@ def get_variants_by_rsid(dataset, rsid, ds_version=None): return variants +def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): + """ + Variants that overlap a region + Args: + dataset (str): short name of the dataset + chrom (str): name of the chromosom + start_pos (int): start of the region + end_pos (int): start of the region + ds_version (str): version of the dataset + """ + dataset_version = db.get_dataset_version(dataset, ds_version) + if not dataset_version: + return + query = (db.Variant + .select() + .where((db.Variant.pos >= start_pos) & + (db.Variant.pos <= end_pos) & + (db.Variant.chrom == chrom) & + (db.Variant.dataset_version == dataset_version)) + .dicts()) + variants = [variant for variant in query] + # add_consequence_to_variants(variants) + #for variant in variants: + # remove_extraneous_information(variant) + return variants + + def get_variants_in_gene(dataset, gene_id): """ Retrieve variants present inside a gene @@ -406,9 +479,7 @@ def get_variants_in_transcript(transcript_id): Returns: dict: values for the variant; empty if not found """ - variants = [] - for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts(): - variants.append(variant) + variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] return variants variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Feature'] == transcript_id] add_consequence_to_variant(variant) @@ -417,23 +488,6 @@ def get_variants_in_transcript(transcript_id): return variants -def get_variants_in_region(db, chrom, start, stop): - """ - Variants that overlap a region - Unclear if this will include CNVs - """ - xstart = get_xpos(chrom, start) - xstop = get_xpos(chrom, stop) - variants = list(db.variants.find({ - 'xpos': {'$lte': xstop, '$gte': xstart} - }, projection={'_id': False}, limit=SEARCH_LIMIT)) - add_consequence_to_variants(variants) - for variant in variants: - add_rsid_to_variant(sdb, variant) - remove_extraneous_information(variant) - return list(variants) - - def remove_extraneous_information(variant): #del variant['genotype_depths'] #del variant['genotype_qualities'] diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 4e1fa629c..c9ab76fd9 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -13,11 +13,11 @@ def test_add_rsid_to_variant(): variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') rsid = variant['rsid'] variant['rsid'] = '.' - # lookups.add_rsid_to_variant('SweGen', variant) + lookups.add_rsid_to_variant('SweGen', variant) assert variant['rsid'] == rsid # "non-existing" del variant['rsid'] - # lookups.add_rsid_to_variant(variant) + lookups.add_rsid_to_variant(variant) assert variant['rsid'] == rsid @@ -32,7 +32,7 @@ def test_get_coverage_for_bases(): """ Test get_coverage_for_bases() """ - # coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500320) + coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, @@ -49,7 +49,7 @@ def test_get_coverage_for_bases(): def test_get_coverage_for_transcript(): - coverage = lookups.get_coverage_for_transcript('1', 55500283, 55500320) + coverage = lookups.get_coverage_for_transcript('SweGen', '1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, @@ -69,7 +69,7 @@ def test_get_exons_in_transcript(): """ Test get_exons_in_transcript() """ - result = lookups.get_exons_in_transcript(28186) + result = lookups.get_exons_in_transcript('SweGen', 'ENST00000346817') expected = [{'id': 326403, 'gene': 8600, 'transcript': 28186, 'chrom': '2', 'start': 202047893, 'stop': 202048032, 'strand': '+', 'feature_type': 'exon'}, {'id': 326404, 'gene': 8600, 'transcript': 28186, 'chrom': '2', @@ -172,6 +172,14 @@ def test_get_genes_in_region(): assert False +def test_get_number_of_variants_in_transcript(): + """ + Test get_number_of_variants_in_transcripts() + """ + assert False + lookups.get_number_of_variants_in_transcripts() + + def test_get_transcript(): """ Test get_transcript() @@ -199,7 +207,7 @@ def test_get_transcript(): {'id': 18, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 24739, 'stop': 24892, 'strand': '-', 'feature_type': 'exon'}, {'id': 17, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 29322, 'stop': 29371, 'strand': '-', 'feature_type': 'exon'}] - result = lookups.get_transcript('ENST00000438504') + result = lookups.get_transcript('SweGen', 'ENST00000438504') assert result['id'] == expected['id'] assert result['mim_annotation'] == expected['mim_annotation'] assert result['transcript_id'] == expected['transcript_id'] @@ -211,7 +219,7 @@ def test_get_transcript(): assert result['exons'] == exp_exon # non-existing - assert not lookups.get_transcript('INCORRECT') + assert not lookups.get_transcript('SweGen', 'INCORRECT') def test_get_transcripts_in_gene(): @@ -249,7 +257,7 @@ def test_get_variant(): # missing rsid in result, multiple transcripts # slow, need to fix db - # result = lookups.get_variant(47730411, '21', 'TA', 'T') + result = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') assert result['genes'] == ['ENSG00000160298'] assert result['transcripts'] == ['ENST00000417060', 'ENST00000397682', 'ENST00000397683', 'ENST00000397680', @@ -257,10 +265,8 @@ def test_get_variant(): 'ENST00000291691', 'ENST00000445935', 'ENST00000491666', 'ENST00000472607', 'ENST00000475776'] - assert result['rsid'] == 75050571 - - # need to add test for entry with missing rsid - # too slow query atm + assert result['rsid'] == 'rs75050571' + # TODO: add test for entry with missing rsid # incorrect position assert not lookups.get_variant(-1, '1', 'A', 'T') @@ -289,6 +295,24 @@ def test_get_variants_by_rsid(caplog): assert not lookups.get_variants_by_rsid('SweGen', 'rs1') +def test_get_variants_in_region(): + """ + Test get_variants_in_region() + """ + # normal + result = lookups.get_variants_in_region('SweGen', '22', 16079200, 16079400) + expected_pos = [16079227, 16079234, 16079289, 16079350] + assert [res['pos'] for res in result] == expected_pos + + # no positions covered + result = lookups.get_variants_in_region('SweGen', '22', 16079200, 16079000) + assert not result + + # incorrect dataset + result = lookups.get_variants_in_region('Incorrect_dataset', '22', 16079200, 16079400) + assert not result + + def test_get_variants_in_transcript(): """ Test get_variants_in_transcript() From 4fc466af503987f96e55e6fc793ba3c1df12aef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 10 Jan 2019 15:34:17 +0100 Subject: [PATCH 015/170] added the functionality of get_variants_from_dbsnp to get_variants_by_rsid --- backend/modules/browser/lookups.py | 43 ++++++++++++++++++++----- backend/modules/browser/test_lookups.py | 7 ++++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 2346cda1e..a017c5486 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -42,7 +42,7 @@ def add_rsid_to_variant(dataset, variant): REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') -def get_awesomebar_result(dataset, query): +def get_awesomebar_result(dataset, query, ds_version=None): """ Parse the search input @@ -63,14 +63,16 @@ def get_awesomebar_result(dataset, query): Args: dataset (str): short name of dataset query (str): the search query + ds_version (str): the dataset version Returns: tuple: (datatype, identifier) """ query = query.strip() # Parse Variant types - variant = get_variants_by_rsid(db, query.lower()) + variant = get_variants_by_rsid(dataset, query.lower(), ds_version=ds_version) if not variant: + variant = get_variants_by_rsid(dataset, query.lower(), check_position=True, ds_version=ds_version) variant = get_variants_from_dbsnp(db,sdb, query.lower()) if variant: @@ -388,12 +390,14 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): return {} -def get_variants_by_rsid(dataset, rsid, ds_version=None): +def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): """ Retrieve variants by their associated rsid + May also look up rsid and search for variants at the position Args: dataset (str): short name of dataset rsid (str): rsid of the variant (starting with rs) + check_position (bool): check for variants at the position of the rsid instead of by rsid ds_version (str): version of the dataset Returns: list: variant dicts; no hits @@ -411,11 +415,34 @@ def get_variants_by_rsid(dataset, rsid, ds_version=None): except ValueError: logging.error('get_variants_by_rsid({}, {}): not an integer after rs'.format(dataset, rsid)) return - query = (db.Variant - .select() - .where((db.Variant.rsid == rsid) & - (db.Variant.dataset_version == dataset_version)) - .dicts()) + if check_position: + refset = (db.Dataset + .select(db.ReferenceSet) + .join(db.ReferenceSet) + .where(db.Dataset.short_name == dataset) + .dicts() + .get()) + dbsnp_version = refset['dbsnp_version'] + + rsid_dbsnp = (db.DbSNP + .select() + .where((db.DbSNP.rsid == rsid) & + (db.DbSNP.version_id == dbsnp_version) ) + .dicts() + .get()) + query = (db.Variant + .select() + .where((db.Variant.pos == rsid_dbsnp['pos']) & + (db.Variant.chrom == rsid_dbsnp['chrom']) & + (db.Variant.dataset_version == dataset_version)) + .dicts()) + else: + query = (db.Variant + .select() + .where((db.Variant.rsid == rsid) & + (db.Variant.dataset_version == dataset_version)) + .dicts()) + variants = [variant for variant in query] # add_consequence_to_variants(variants) return variants diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index c9ab76fd9..613b26e1c 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -278,6 +278,13 @@ def test_get_variants_by_rsid(caplog): ''' # normal result = lookups.get_variants_by_rsid('SweGen', 'rs373706802') + assert result[0]['pos'] == 16080482 + assert result[0]['genes'] == ['ENSG00000229286', 'ENSG00000235265'] + assert result[0]['transcripts'] == ['ENST00000448070','ENST00000413156'] + + # by position + result = lookups.get_variants_by_rsid('SweGen', 'rs373706802', check_position=True) + assert result[0]['pos'] == 16080482 assert result[0]['genes'] == ['ENSG00000229286', 'ENSG00000235265'] assert result[0]['transcripts'] == ['ENST00000448070','ENST00000413156'] From 8ae076e7e733ca3ea4195c87509be901bfbf0fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 10 Jan 2019 16:36:16 +0100 Subject: [PATCH 016/170] awesomebar seems to work --- backend/modules/browser/lookups.py | 20 +++++++++++--------- backend/modules/browser/test_lookups.py | 15 ++++++++++++++- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index a017c5486..3c7a44ef6 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -46,7 +46,13 @@ def get_awesomebar_result(dataset, query, ds_version=None): """ Parse the search input - Datatype is one of 'gene', 'variant', or 'region' + Datatype is one of: + - 'gene' + - 'transcript' + - 'variant' + - 'dbsnp_variant_set' + - 'region' + Identifier is one of: - ensembl ID for gene - variant ID string for variant (eg. 1-1000-A-T) @@ -57,9 +63,6 @@ def get_awesomebar_result(dataset, query, ds_version=None): - if a gene symbol, return that gene's ensembl ID - if an RSID, return that variant's string - Finally, note that we don't return the whole object here - only it's identifier. - This could be important for performance later - Args: dataset (str): short name of dataset query (str): the search query @@ -73,7 +76,6 @@ def get_awesomebar_result(dataset, query, ds_version=None): variant = get_variants_by_rsid(dataset, query.lower(), ds_version=ds_version) if not variant: variant = get_variants_by_rsid(dataset, query.lower(), check_position=True, ds_version=ds_version) - variant = get_variants_from_dbsnp(db,sdb, query.lower()) if variant: if len(variant) == 1: @@ -82,23 +84,23 @@ def get_awesomebar_result(dataset, query, ds_version=None): retval = ('dbsnp_variant_set', variant[0]['rsid']) return retval - gene = get_gene_by_name(sdb, query) + gene = get_gene_by_name(dataset, query) # From here out, all should be uppercase (gene, tx, region, variant_id) query = query.upper() if not gene: - gene = get_gene_by_name(sdb, query) + gene = get_gene_by_name(dataset, query) if gene: return 'gene', gene['gene_id'] # Ensembl formatted queries if query.startswith('ENS'): # Gene - gene = get_gene(sdb, query) + gene = get_gene(dataset, query) if gene: return 'gene', gene['gene_id'] # Transcript - transcript = get_transcript(sdb, query) + transcript = get_transcript(dataset, query) if transcript: return 'transcript', transcript['transcript_id'] diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 613b26e1c..18cc2722c 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -25,7 +25,20 @@ def test_get_awesomebar_result(): """ Test get_awesomebar_result() """ - assert False + result = lookups.get_awesomebar_result('SweGen', 'rs373706802') + assert result == ('dbsnp_variant_set', 373706802) + result = lookups.get_awesomebar_result('SweGen', 'rs783') + assert result == ('variant', '22-29461622-G-A') + result = lookups.get_awesomebar_result('SweGen', 'ADH6') + assert result == ('gene', 'ENSG00000172955') + result = lookups.get_awesomebar_result('SweGen', 'ENSG00000172955') + assert result == ('gene', 'ENSG00000172955') + result = lookups.get_awesomebar_result('SweGen', 'ENST00000237653') + assert result == ('transcript', 'ENST00000237653') + result = lookups.get_awesomebar_result('SweGen', '22-46615715-46615880') + assert result == ('region', '22-46615715-46615880') + result = lookups.get_awesomebar_result('SweGen', 'CHR22:46615715-46615880') + assert result == ('region', '22-46615715-46615880') def test_get_coverage_for_bases(): From 8c8336537aefe5152671b2324dcebeef4446df42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 14 Jan 2019 19:44:32 +0100 Subject: [PATCH 017/170] Fix for Google style for docstrings, some other changes --- backend/modules/browser/lookups.py | 83 ++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 3c7a44ef6..84b10af77 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -2,7 +2,7 @@ import db import logging -# from .utils import METRICS, AF_BUCKETS, get_xpos, xpos_to_pos, add_consequence_to_variants, add_consequence_to_variant +import utils SEARCH_LIMIT = 10000 @@ -11,6 +11,7 @@ def add_rsid_to_variant(dataset, variant): """ Add rsid to a variant in the database based on position Note that this may be inaccurate + Args: dataset (str): short name of the dataset variant (dict): values for a variant @@ -67,6 +68,7 @@ def get_awesomebar_result(dataset, query, ds_version=None): dataset (str): short name of dataset query (str): the search query ds_version (str): the dataset version + Returns: tuple: (datatype, identifier) """ @@ -124,12 +126,14 @@ def get_awesomebar_result(dataset, query, ds_version=None): def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=None): """ Get the coverage for the list of bases given by start_pos->end_pos, inclusive + Args: dataset (str): short name for the dataset chrom (str): chromosome start_pos (int): first position of interest end_pos (int): last position of interest; if None it will be set to start_pos ds_version (str): version of the dataset + Returns: list: coverage dicts for the region of interest: None if unable to retrieve """ @@ -155,12 +159,14 @@ def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=N def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_version=None): """ Get the coverage for the list of bases given by start_pos->end_pos, inclusive + Args: dataset (str): short name for the dataset chrom (str): chromosome start_pos (int): first position of interest end_pos (int): last position of interest; if None it will be set to start_pos ds_version (str): version of the dataset + Returns: list: coverage dicts for the region of interest """ @@ -176,9 +182,11 @@ def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_vers def get_exons_in_transcript(dataset, transcript_id): """ Retrieve exons associated with the given transcript id + Args: dataset (str): short name of the dataset transcript_id (str): the id of the transcript + Returns: list: dicts with values for each exon sorted by start position """ @@ -202,9 +210,11 @@ def get_exons_in_transcript(dataset, transcript_id): def get_gene(dataset, gene_id): """ Retrieve gene by gene id + Args: dataset (str): short name of the dataset gene_id (str): the id of the gene + Returns: dict: values for the gene; empty if not found """ @@ -222,8 +232,10 @@ def get_gene_by_name(dataset, gene_name): """ Retrieve gene by gene_name. First checks gene_name, then other_names. + Args: gene_name (str): the id of the gene + Returns: dict: values for the gene; empty if not found """ @@ -243,10 +255,12 @@ def get_gene_by_name(dataset, gene_name): def get_genes_in_region(chrom, start_pos, stop_pos): """ Retrieve genes located within a region + Args: chrom (str): chromosome name start_pos (int): start of region stop_pos (int): end of region + Returns: dict: values for the gene; empty if not found """ @@ -264,10 +278,12 @@ def get_genes_in_region(chrom, start_pos, stop_pos): def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None): """ Get the total and filtered amount of variants in a transcript + Args: dataset (str): short name of the dataset transcript_id (str): id of the transcript ds_version (str): version of the dataset + Returns: dict: {filtered: nr_filtered, total: nr_total} """ @@ -287,6 +303,7 @@ def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): """ Retrieve variant by position and change + Args: dataset (str): short name of the reference set pos (int): position of the variant @@ -294,6 +311,7 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): ref (str): reference sequence alt (str): variant sequence ds_version (str): dataset version + Returns: dict: values for the variant; empty if not found """ @@ -321,9 +339,11 @@ def get_transcript(dataset, transcript_id): """ Retrieve transcript by transcript id Also includes exons as ['exons'] + Args: dataset (str): short name of the dataset transcript_id (str): the id of the transcript + Returns: dict: values for the transcript, including exons; empty if not found """ @@ -368,6 +388,7 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): """ Retrieve variant by position and change Retrieves rsid from db (if available) if not present in variant + Args: dataset (str): short name of the dataset pos (int): position of the variant @@ -375,6 +396,7 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): ref (str): reference sequence alt (str): variant sequence ds_version (str): version of the dataset + Returns: dict: values for the variant; empty if not found """ @@ -396,11 +418,13 @@ def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): """ Retrieve variants by their associated rsid May also look up rsid and search for variants at the position + Args: dataset (str): short name of dataset rsid (str): rsid of the variant (starting with rs) check_position (bool): check for variants at the position of the rsid instead of by rsid ds_version (str): version of the dataset + Returns: list: variant dicts; no hits """ @@ -450,15 +474,41 @@ def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): return variants +def get_variants_in_gene(dataset, gene_id): + """ + Retrieve variants present inside a gene + + Args: + dataset: short name of the dataset + gene_id (str): id of the gene + + Returns: + list: values for the variants + """ + ref_dbid = db.get_reference_dbid_dataset(dataset) +# db.Variant.select().where(db.Variant.gene.contains(re + variants = [] + for variant in db.variants.find({'genes': gene_id}, projection={'_id': False}): + variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Gene'] == gene_id] + add_consequence_to_variant(variant) + remove_extraneous_information(variant) + variants.append(variant) + return variants + + def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): """ Variants that overlap a region + Args: dataset (str): short name of the dataset chrom (str): name of the chromosom start_pos (int): start of the region end_pos (int): start of the region ds_version (str): version of the dataset + + Returns: + list: variant dicts """ dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: @@ -477,43 +527,24 @@ def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): return variants -def get_variants_in_gene(dataset, gene_id): - """ - Retrieve variants present inside a gene - Args: - dataset: short name of the dataset - gene_id (str): id of the gene - Returns: - list: values for the variants - """ - ref_dbid = db.get_reference_dbid_dataset(dataset) -# db.Variant.select().where(db.Variant.gene.contains(re - variants = [] - for variant in db.variants.find({'genes': gene_id}, projection={'_id': False}): - variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Gene'] == gene_id] - add_consequence_to_variant(variant) - remove_extraneous_information(variant) - variants.append(variant) - return variants - - def get_variants_in_transcript(transcript_id): """ Retrieve variants inside a transcript + Args: pos (int): position of the variant chrom (str): name of the chromosome ref (str): reference sequence ref (str): variant sequence + Returns: dict: values for the variant; empty if not found """ variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] - return variants - variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Feature'] == transcript_id] - add_consequence_to_variant(variant) - remove_extraneous_information(variant) - variants.append(variant) + for variant in variants: + variant['vep_annotations'] = [annotation for annotation in variant['vep_annotations'] if x['Feature'] == transcript_id] + add_consequence_to_variant(variant) + remove_extraneous_information(variant) return variants From 7b45a8bc5578294928cc948ed85fc7a981a77a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 17 Jan 2019 15:00:16 +0100 Subject: [PATCH 018/170] starting to migrate the utils code --- backend/modules/browser/utils.py | 183 +++++++++++-------------------- 1 file changed, 63 insertions(+), 120 deletions(-) diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index 68350916c..b6bf5a890 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -15,61 +15,69 @@ 'VQSLOD' ] +# Note that this is the current as of v81 with some included for backwards compatibility (VEP <= 75) +CSQ_ORDER = ["transcript_ablation", +"splice_acceptor_variant", +"splice_donor_variant", +"stop_gained", +"frameshift_variant", +"stop_lost", +"start_lost", # new in v81 +"initiator_codon_variant", # deprecated +"transcript_amplification", +"inframe_insertion", +"inframe_deletion", +"missense_variant", +"protein_altering_variant", # new in v79 +"splice_region_variant", +"incomplete_terminal_codon_variant", +"stop_retained_variant", +"synonymous_variant", +"coding_sequence_variant", +"mature_miRNA_variant", +"5_prime_UTR_variant", +"3_prime_UTR_variant", +"non_coding_transcript_exon_variant", +"non_coding_exon_variant", # deprecated +"intron_variant", +"NMD_transcript_variant", +"non_coding_transcript_variant", +"nc_transcript_variant", # deprecated +"upstream_gene_variant", +"downstream_gene_variant", +"TFBS_ablation", +"TFBS_amplification", +"TF_binding_site_variant", +"regulatory_region_ablation", +"regulatory_region_amplification", +"feature_elongation", +"regulatory_region_variant", +"feature_truncation", +"intergenic_variant", +""] +assert len(CSQ_ORDER) == len(set(CSQ_ORDER)) # No dupplicates -def add_transcript_coordinate_to_variants(sdb, variant_list, transcript_id): - """ - Each variant has a 'xpos' and 'pos' positional attributes. - This method takes a list of variants and adds a third position: the "transcript coordinates". - This is defined as the distance from the start of the transcript, in coding bases. - So a variant in the 7th base of the 6th exon of a transcript will have a transcript coordinate of - the sum of the size of the first 5 exons) + 7 - This is 0-based, so a variant in the first base of the first exon has a transcript coordinate of 0. - - You may want to add transcript coordinates for multiple transcripts, so this is stored in a variant as - variant['transcript_coordinates'][transcript_id] - - If a variant in variant_list does not have a `transcript_coordinates` dictionary, we create one - - If a variant start position for some reason does not fall in any exons in this transcript, its coordinate is 0. - This is perhaps logically inconsistent, - but it allows you to spot errors quickly if there's a pileup at the first base. - `None` would just break things. - - Consider the behavior if a 20 base deletion deletes parts of two exons. - I think the behavior in this method is consistent, but beware that it might break things downstream. - - Edits variant_list in place; no return val - """ - - import lookups - # make sure exons is sorted by (start, end) - exons = sorted(lookups.get_exons_in_transcript(sdb, transcript_id), key=itemgetter('start', 'stop')) - - # offset from start of base for exon in ith position (so first item in this list is always 0) - exon_offsets = [0 for i in range(len(exons))] - for i, exon in enumerate(exons): - for j in range(i+1, len(exons)): - exon_offsets[j] += exon['stop'] - exon['start'] - - for variant in variant_list: - if 'transcript_coordinates' not in variant: - variant['transcript_coordinates'] = {} - variant['transcript_coordinates'][transcript_id] = 0 - for i, exon in enumerate(exons): - if exon['start'] <= variant['pos'] <= exon['stop']: - variant['transcript_coordinates'][transcript_id] = exon_offsets[i] + variant['pos'] - exon['start'] - - -def xpos_to_pos(xpos): - return int(xpos % 1e9) +CSQ_ORDER_DICT = {csq:i for i,csq in enumerate(CSQ_ORDER)} +REV_CSQ_ORDER_DICT = dict(enumerate(CSQ_ORDER)) +assert all(csq == REV_CSQ_ORDER_DICT[CSQ_ORDER_DICT[csq]] for csq in CSQ_ORDER) def add_consequence_to_variants(variant_list): + """ + Add information about variant consequence to multiple variants + Args: + variant_list (list): list of variants + """ for variant in variant_list: add_consequence_to_variant(variant) def add_consequence_to_variant(variant): + """ + Add information about variant consequence to a variant + Args: + variant (dict): variant information + """ worst_csq = worst_csq_with_vep(variant['vep_annotations']) variant['major_consequence'] = '' if worst_csq is None: @@ -81,16 +89,16 @@ def add_consequence_to_variant(variant): variant['HGVS'] = get_proper_hgvs(worst_csq) variant['CANONICAL'] = worst_csq['CANONICAL'] - if csq_order_dict[variant['major_consequence']] <= csq_order_dict["frameshift_variant"]: + if CSQ_ORDER_DICT[variant['major_consequence']] <= CSQ_ORDER_DICT["frameshift_variant"]: variant['category'] = 'lof_variant' for annotation in variant['vep_annotations']: if annotation['LoF'] == '': annotation['LoF'] = 'NC' annotation['LoF_filter'] = 'Non-protein-coding gene' - elif csq_order_dict[variant['major_consequence']] <= csq_order_dict["missense_variant"]: + elif CSQ_ORDER_DICT[variant['major_consequence']] <= CSQ_ORDER_DICT["missense_variant"]: # Should be noted that this grabs inframe deletion, etc. variant['category'] = 'missense_variant' - elif csq_order_dict[variant['major_consequence']] <= csq_order_dict["synonymous_variant"]: + elif CSQ_ORDER_DICT[variant['major_consequence']] <= CSQ_ORDER_DICT["synonymous_variant"]: variant['category'] = 'synonymous_variant' else: variant['category'] = 'other_variant' @@ -145,55 +153,9 @@ def get_protein_hgvs(annotation): logging.error("Could not fetch protein hgvs - unknown amino acid") return annotation['HGVSp'].split(':')[-1] -# Note that this is the current as of v81 with some included for backwards compatibility (VEP <= 75) -csq_order = ["transcript_ablation", -"splice_acceptor_variant", -"splice_donor_variant", -"stop_gained", -"frameshift_variant", -"stop_lost", -"start_lost", # new in v81 -"initiator_codon_variant", # deprecated -"transcript_amplification", -"inframe_insertion", -"inframe_deletion", -"missense_variant", -"protein_altering_variant", # new in v79 -"splice_region_variant", -"incomplete_terminal_codon_variant", -"stop_retained_variant", -"synonymous_variant", -"coding_sequence_variant", -"mature_miRNA_variant", -"5_prime_UTR_variant", -"3_prime_UTR_variant", -"non_coding_transcript_exon_variant", -"non_coding_exon_variant", # deprecated -"intron_variant", -"NMD_transcript_variant", -"non_coding_transcript_variant", -"nc_transcript_variant", # deprecated -"upstream_gene_variant", -"downstream_gene_variant", -"TFBS_ablation", -"TFBS_amplification", -"TF_binding_site_variant", -"regulatory_region_ablation", -"regulatory_region_amplification", -"feature_elongation", -"regulatory_region_variant", -"feature_truncation", -"intergenic_variant", -""] -assert len(csq_order) == len(set(csq_order)) # No dupes! - -csq_order_dict = {csq:i for i,csq in enumerate(csq_order)} -rev_csq_order_dict = dict(enumerate(csq_order)) -assert all(csq == rev_csq_order_dict[csq_order_dict[csq]] for csq in csq_order) - def remove_extraneous_vep_annotations(annotation_list): - return [ann for ann in annotation_list if worst_csq_index(ann['Consequence'].split('&')) <= csq_order_dict['intron_variant']] + return [ann for ann in annotation_list if worst_csq_index(ann['Consequence'].split('&')) <= CSQ_ORDER_DICT['intron_variant']] def worst_csq_index(csq_list): @@ -202,7 +164,7 @@ def worst_csq_index(csq_list): Return index of the worst consequence (In this case, index of 'frameshift_variant', so 4) Works well with worst_csq_index('non_coding_exon_variant&nc_transcript_variant'.split('&')) """ - return min([csq_order_dict[csq] for csq in csq_list]) + return min([CSQ_ORDER_DICT[csq] for csq in csq_list]) def worst_csq_from_list(csq_list): @@ -211,7 +173,7 @@ def worst_csq_from_list(csq_list): Return the worst consequence (In this case, 'frameshift_variant') Works well with worst_csq_from_list('non_coding_exon_variant&nc_transcript_variant'.split('&')) """ - return rev_csq_order_dict[worst_csq_index(csq_list)] + return REV_CSQ_ORDER_DICT[worst_csq_index(csq_list)] def worst_csq_from_csq(csq): @@ -219,7 +181,7 @@ def worst_csq_from_csq(csq): Input possibly &-filled csq string (e.g. 'non_coding_exon_variant&nc_transcript_variant') Return the worst consequence (In this case, 'non_coding_exon_variant') """ - return rev_csq_order_dict[worst_csq_index(csq.split('&'))] + return REV_CSQ_ORDER_DICT[worst_csq_index(csq.split('&'))] def order_vep_by_csq(annotation_list): @@ -229,7 +191,7 @@ def order_vep_by_csq(annotation_list): """ for ann in annotation_list: ann['major_consequence'] = worst_csq_from_csq(ann['Consequence']) - return sorted(annotation_list, key=(lambda ann:csq_order_dict[ann['major_consequence']])) + return sorted(annotation_list, key=(lambda ann:CSQ_ORDER_DICT[ann['major_consequence']])) def worst_csq_with_vep(annotation_list): @@ -247,7 +209,7 @@ def worst_csq_with_vep(annotation_list): def annotation_severity(annotation): "Bigger is more important." - rv = -csq_order_dict[worst_csq_from_csq(annotation['Consequence'])] + rv = -CSQ_ORDER_DICT[worst_csq_from_csq(annotation['Consequence'])] if annotation['CANONICAL'] == 'YES': rv += 0.1 return rv @@ -257,25 +219,6 @@ def annotation_severity(annotation): CHROMOSOME_TO_CODE = { item: i+1 for i, item in enumerate(CHROMOSOMES) } -def get_single_location(chrom, pos): - """ - Gets a single location from chromosome and position - chr must be actual chromosme code (chrY) and pos must be integer - - Borrowed from xbrowse - """ - return CHROMOSOME_TO_CODE[chrom] * int(1e9) + pos - - -def get_xpos(chrom, pos): - """ - Borrowed from xbrowse - """ - if not chrom.startswith('chr'): - chrom = 'chr{}'.format(chrom) - return get_single_location(chrom, int(pos)) - - def get_minimal_representation(pos, ref, alt): """ Get the minimal representation of a variant, based on the ref + alt alleles in a VCF From bc67b6ec2ee68886677878285b253cef8266899c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 17 Jan 2019 15:00:29 +0100 Subject: [PATCH 019/170] first tests --- backend/modules/browser/test_utils.py | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 backend/modules/browser/test_utils.py diff --git a/backend/modules/browser/test_utils.py b/backend/modules/browser/test_utils.py new file mode 100644 index 000000000..c42db4c35 --- /dev/null +++ b/backend/modules/browser/test_utils.py @@ -0,0 +1,52 @@ +""" +Tests for utils.py +""" + +import lookups +import utils + +import json + + +def test_add_consequence_to_variants(): + """ + Test add_consequence_to_variants() + """ + assert False + + +def test_add_consequence_to_variant(): + """ + Test add_consequence_to_variant() + """ + # variant = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') + variant2 = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') + # variant2['vep_annotations'] = + result = utils.add_consequence_to_variant(variant2) + # result = utils.add_consequence_to_variant(variant) + print(result) + print(result['major_consequence']) + print(result['category']) + + assert False + + +def test_annotation_severity(): + """ + Test annotation_severity() + """ + variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') + utils.annotation_severity(variant['vep_annotations']) + + +def test_worst_csq_from_csq(): + """ + Test worst_csq_from_csq() + """ + variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') + print(type(variant['vep_annotations'])) + print(variant['vep_annotations']) + vep = json.loads(variant['vep_annotations'])[0] + print(vep['Consequence']) + utils.worst_csq_from_csq(vep['Consequence']) + assert False From a2eba10df2af306b6632b801169eb3dbc838885c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 23 Jan 2019 10:36:08 +0100 Subject: [PATCH 020/170] improved test coverage, also a few small bugs fixed --- backend/modules/browser/lookups.py | 31 ++++++++-------- backend/modules/browser/test_lookups.py | 47 ++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 84b10af77..e0364054f 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -33,10 +33,7 @@ def add_rsid_to_variant(dataset, variant): (db.DbSNP.version == dbsnp_version)) .dicts() .get()) - if rsid: - variant['rsid'] = 'rs{}'.format(rsid['rsid']) - else: - logging.error('add_rsid_to_variant({}, variant[dbid: {}]): unable to retrieve rsid'.format(dataset, variant['id'])) + variant['rsid'] = 'rs{}'.format(rsid['rsid']) except db.DbSNP.DoesNotExist: logging.error('add_rsid_to_variant({}, variant[dbid: {}]): unable to retrieve rsid'.format(dataset, variant['id'])) @@ -143,17 +140,13 @@ def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=N if end_pos is None: end_pos = start_pos - try: - return [values for values in (db.Coverage - .select() - .where((db.Coverage.pos >= start_pos) & - (db.Coverage.pos <= end_pos) & - (db.Coverage.chrom == chrom) & - (db.Coverage.dataset_version == dataset_version.id)) - .dicts())] - except db.Coverage.DoesNotExist: - logging.error('get_coverage_for_bases({}, {}, {}, {}): '.format(dataset, chrom, start_pos, end_pos)) - return + return [values for values in (db.Coverage + .select() + .where((db.Coverage.pos >= start_pos) & + (db.Coverage.pos <= end_pos) & + (db.Coverage.chrom == chrom) & + (db.Coverage.dataset_version == dataset_version.id)) + .dicts())] def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_version=None): @@ -175,6 +168,8 @@ def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_vers coverage_array = get_coverage_for_bases(dataset, chrom, start_pos, end_pos, ds_version) # only return coverages that have coverage (if that makes any sense?) # return coverage_array + if not coverage_array: + return covered = [c for c in coverage_array if c['mean']] return covered @@ -191,7 +186,9 @@ def get_exons_in_transcript(dataset, transcript_id): list: dicts with values for each exon sorted by start position """ ref_dbid = db.get_reference_dbid_dataset(dataset) - + if not ref_dbid: + logging.error('get_exons_in_transcript({}, {}): unable to find dataset dbid'.format(dataset, transcript_id)) + return try: transcript = (db.Transcript .select() @@ -200,7 +197,7 @@ def get_exons_in_transcript(dataset, transcript_id): (db.Gene.reference_set == ref_dbid)) .get()) except db.Transcript.DoesNotExist: - logging.error('get_exons_in_transcript({}, {}): unable to retrueve transcript'.format(dataset, transcript_id)) + logging.error('get_exons_in_transcript({}, {}): unable to retrieve transcript'.format(dataset, transcript_id)) return return sorted(list(db.Feature.select().where((db.Feature.transcript == transcript) & (db.Feature.feature_type == 'exon')).dicts()), diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 18cc2722c..1a3f3fe46 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -39,12 +39,17 @@ def test_get_awesomebar_result(): assert result == ('region', '22-46615715-46615880') result = lookups.get_awesomebar_result('SweGen', 'CHR22:46615715-46615880') assert result == ('region', '22-46615715-46615880') + result = lookups.get_awesomebar_result('SweGen', 'CHR22-29461622-G-A') + assert result == ('variant', '22-29461622-G-A') + result = lookups.get_awesomebar_result('SweGen', 'DOES_NOT_EXIST') + assert result == ('not_found', 'DOES_NOT_EXIST') -def test_get_coverage_for_bases(): +def test_get_coverage_for_bases(caplog): """ Test get_coverage_for_bases() """ + # normal coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, @@ -60,8 +65,24 @@ def test_get_coverage_for_bases(): 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.961, 0.856, 0.117, 0.001]}] assert coverage == expected + # no end_pos + coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500290) + expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', + 'pos': 55500290, 'mean': 40.66, 'median': 39.0, + 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}] + + # no hits + coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500285) + assert not coverage + + # incorrect dataset + assert not lookups.get_coverage_for_bases('BAD_DATASET', '1', 55500283, 55500320) + def test_get_coverage_for_transcript(): + """ + Test get_coverage_for_transcript() + """ coverage = lookups.get_coverage_for_transcript('SweGen', '1', 55500283, 55500320) expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', 'pos': 55500290, 'mean': 40.66, 'median': 39.0, @@ -76,9 +97,10 @@ def test_get_coverage_for_transcript(): 'pos': 55500320, 'mean': 39.69, 'median': 38.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.961, 0.856, 0.117, 0.001]}] assert coverage == expected + assert not lookups.get_coverage_for_transcript('BAD_DATASET', '1', 55500283, 55500320) -def test_get_exons_in_transcript(): +def test_get_exons_in_transcript(caplog): """ Test get_exons_in_transcript() """ @@ -101,6 +123,16 @@ def test_get_exons_in_transcript(): 'start': 202082312, 'stop': 202084804, 'strand': '+', 'feature_type': 'exon'}] assert result == expected + # bad dataset + result = lookups.get_exons_in_transcript('NO_DATASET', 'ENST00000346817') + assert not result + assert caplog.messages[0] == 'get_exons_in_transcript(NO_DATASET, ENST00000346817): unable to find dataset dbid' + + # bad transcript + result = lookups.get_exons_in_transcript('SweGen', 'BAD_TRANSCRIPT') + assert not result + assert caplog.messages[1] == 'get_exons_in_transcript(SweGen, BAD_TRANSCRIPT): unable to retrieve transcript' + def test_get_gene(): """ @@ -131,13 +163,13 @@ def test_get_gene(): # non-existing gene result = lookups.get_gene('SweGen', 'NOT_A_GENE') assert not result - + # non-existing dataset result = lookups.get_gene('NoDataset', 'ENSG00000223972') assert not result -def test_get_gene_by_name(): +def test_get_gene_by_name(caplog): """ Test get_gene_by_name() """ @@ -165,7 +197,8 @@ def test_get_gene_by_name(): # non-existing gene result = lookups.get_gene_by_name('SweGen', 'NOT_A_GENE') assert not result - + assert caplog.messages[0] == 'get_gene_by_name(SweGen, NOT_A_GENE): unable to retrieve gene' + # non-existing dataset result = lookups.get_gene_by_name('NoDataset', 'ENSG00000223972') assert not result @@ -294,6 +327,9 @@ def test_get_variants_by_rsid(caplog): assert result[0]['pos'] == 16080482 assert result[0]['genes'] == ['ENSG00000229286', 'ENSG00000235265'] assert result[0]['transcripts'] == ['ENST00000448070','ENST00000413156'] + print(type(result[0]['vep_annotations'])) + print(result[0]['vep_annotations']) + assert False # by position result = lookups.get_variants_by_rsid('SweGen', 'rs373706802', check_position=True) @@ -305,6 +341,7 @@ def test_get_variants_by_rsid(caplog): assert lookups.get_variants_by_rsid('incorrect_name', 'rs373706802') is None assert lookups.get_variants_by_rsid('SweGen', '373706802') is None assert lookups.get_variants_by_rsid('SweGen', 'rs3737o68o2') is None + expected = ('get_dataset_version(incorrect_name, version=None): cannot retrieve dataset version', 'get_variants_by_rsid(SweGen, 373706802): rsid not starting with rs', 'get_variants_by_rsid(SweGen, rs3737o68o2): not an integer after rs') From 914442abe5fba71ae833f9db3f9d2079a53f7e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 23 Jan 2019 13:36:33 +0100 Subject: [PATCH 021/170] adding tests and documentation --- backend/modules/browser/test_utils.py | 125 +++++++++++++-- backend/modules/browser/utils.py | 209 +++++++++++++++----------- 2 files changed, 232 insertions(+), 102 deletions(-) diff --git a/backend/modules/browser/test_utils.py b/backend/modules/browser/test_utils.py index c42db4c35..828464ec0 100644 --- a/backend/modules/browser/test_utils.py +++ b/backend/modules/browser/test_utils.py @@ -12,31 +12,89 @@ def test_add_consequence_to_variants(): """ Test add_consequence_to_variants() """ - assert False + variants = [] + variants.append(lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T')) + variants.append(lookups.get_variant('SweGen', 55500283, '1', 'A', 'T')) + variants[0]['vep_annotations'] = json.loads(variants[0]['vep_annotations']) # remove when db is fixed + variants[1]['vep_annotations'] = json.loads(variants[1]['vep_annotations']) # remove when db is fixed + + utils.add_consequence_to_variants(variants) + assert variants[0]['major_consequence'] == 'intron_variant' + assert variants[1]['major_consequence'] == 'upstream_gene_variant' def test_add_consequence_to_variant(): """ Test add_consequence_to_variant() """ - # variant = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') + variant = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') + variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed + utils.add_consequence_to_variant(variant) + assert variant['major_consequence'] == 'intron_variant' + variant2 = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - # variant2['vep_annotations'] = - result = utils.add_consequence_to_variant(variant2) - # result = utils.add_consequence_to_variant(variant) - print(result) - print(result['major_consequence']) - print(result['category']) + variant2['vep_annotations'] = json.loads(variant2['vep_annotations']) # remove when db is fixed + utils.add_consequence_to_variant(variant2) + assert variant2['major_consequence'] == 'upstream_gene_variant' - assert False - def test_annotation_severity(): """ Test annotation_severity() """ variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - utils.annotation_severity(variant['vep_annotations']) + variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed + res = utils.annotation_severity(variant['vep_annotations'][0]) + assert res == -26.9 + + +def test_get_flags_from_variant(): + """ + Test get_flags_from_variant() + """ + assert False + + +def test_get_minimal_representation(): + """ + Test get_minimal_representation() + """ + assert False + + +def test_get_proper_hgvs(): + """ + Test get_proper_hgvs() + """ + assert False + + +def test_get_protein_hgvs(): + """ + Test get_protein_hgvs() + """ + assert False + + +def test_get_transcript_hgvs(): + """ + Test get_transcript_hgvs() + """ + assert False + + +def test_order_vep_by_csq(): + """ + Test order_vep_by_csq() + """ + assert False + + +def test_remove_extraneous_vep_annotations(): + """ + Test remove_extraneous_vep_annotations() + """ + assert False def test_worst_csq_from_csq(): @@ -44,9 +102,44 @@ def test_worst_csq_from_csq(): Test worst_csq_from_csq() """ variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - print(type(variant['vep_annotations'])) - print(variant['vep_annotations']) - vep = json.loads(variant['vep_annotations'])[0] - print(vep['Consequence']) - utils.worst_csq_from_csq(vep['Consequence']) + variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed + res = utils.worst_csq_from_csq(variant['vep_annotations'][0]['Consequence']) + assert res == 'upstream_gene_variant' + res = utils.worst_csq_from_csq('non_coding_exon_variant&nc_transcript_variant') + assert res == 'non_coding_exon_variant' + + +def test_worst_csq_from_list(): + """ + Test worst_csq_from_list() + """ assert False + + +def test_worst_csq_index(): + """ + Test worst_csq_index() + """ + csqs = ['frameshift_variant', 'missense_variant'] + assert utils.worst_csq_index(csqs) == 4 + + +def test_worst_csq_with_vep(): + """ + Test worst_csq_from_vep() + """ + veps = [{'SYMBOL': '1', 'Consequence': 'intergenic_variant', 'CANONICAL': ''}, + {'SYMBOL': '2', 'Consequence': 'frameshift_variant', 'CANONICAL': ''}, + {'SYMBOL': '3', 'Consequence': 'intron_variant', 'CANONICAL': ''}, + {'SYMBOL': '4', 'Consequence': 'stop_lost', 'CANONICAL': ''}] + res = utils.worst_csq_with_vep(veps) + assert res == {'SYMBOL': '2', 'Consequence': 'frameshift_variant', + 'CANONICAL': '', 'major_consequence': 'frameshift_variant'} + + veps = [{'SYMBOL': '1', 'Consequence': 'frameshift_variant', 'CANONICAL': 'YES'}, + {'SYMBOL': '2', 'Consequence': 'frameshift_variant', 'CANONICAL': ''}, + {'SYMBOL': '3', 'Consequence': 'intron_variant', 'CANONICAL': ''}, + {'SYMBOL': '4', 'Consequence': 'stop_lost', 'CANONICAL': ''}] + res = utils.worst_csq_with_vep(veps) + assert res == {'SYMBOL': '1', 'Consequence': 'frameshift_variant', + 'CANONICAL': 'YES', 'major_consequence': 'frameshift_variant'} diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index b6bf5a890..8257db98b 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -2,20 +2,13 @@ from operator import itemgetter AF_BUCKETS = [0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1] -METRICS = [ - 'BaseQRankSum', - 'ClippingRankSum', - 'DP', - 'FS', - 'InbreedingCoeff', - 'MQ', - 'MQRankSum', - 'QD', - 'ReadPosRankSum', - 'VQSLOD' -] + +CHROMOSOMES = ['chr%s' % x for x in range(1, 23)] +CHROMOSOMES.extend(['chrX', 'chrY', 'chrM']) +CHROMOSOME_TO_CODE = { item: i+1 for i, item in enumerate(CHROMOSOMES) } # Note that this is the current as of v81 with some included for backwards compatibility (VEP <= 75) + CSQ_ORDER = ["transcript_ablation", "splice_acceptor_variant", "splice_donor_variant", @@ -61,10 +54,33 @@ REV_CSQ_ORDER_DICT = dict(enumerate(CSQ_ORDER)) assert all(csq == REV_CSQ_ORDER_DICT[CSQ_ORDER_DICT[csq]] for csq in CSQ_ORDER) +METRICS = [ + 'BaseQRankSum', + 'ClippingRankSum', + 'DP', + 'FS', + 'InbreedingCoeff', + 'MQ', + 'MQRankSum', + 'QD', + 'ReadPosRankSum', + 'VQSLOD' +] + +PROTEIN_LETTERS_1TO3 = { + 'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu', + 'F': 'Phe', 'G': 'Gly', 'H': 'His', 'I': 'Ile', + 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 'N': 'Asn', + 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser', + 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 'Y': 'Tyr', + 'X': 'Ter', '*': 'Ter', 'U': 'Sec' +} + def add_consequence_to_variants(variant_list): """ Add information about variant consequence to multiple variants + Args: variant_list (list): list of variants """ @@ -75,6 +91,7 @@ def add_consequence_to_variants(variant_list): def add_consequence_to_variant(variant): """ Add information about variant consequence to a variant + Args: variant (dict): variant information """ @@ -105,7 +122,35 @@ def add_consequence_to_variant(variant): variant['flags'] = get_flags_from_variant(variant) +def annotation_severity(annotation): + """ + Evaluate severity of the consequences; "bigger is more important" + + Args: + annotation (dict): vep_annotation from a variant + + Returns: + float: severity score + """ + rv = -CSQ_ORDER_DICT[worst_csq_from_csq(annotation['Consequence'])] + if annotation['CANONICAL'] == 'YES': + rv += 0.1 + return rv + + def get_flags_from_variant(variant): + """ + Get flags from variant. + checks for: + - MNP (identical length of reference and variant) + - LoF (loss of function) + + Args: + variant (dict): a variant + + Returns: + list: flags for the variant + """ flags = [] if 'mnps' in variant: flags.append('MNP') @@ -119,14 +164,35 @@ def get_flags_from_variant(variant): return flags -protein_letters_1to3 = { - 'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu', - 'F': 'Phe', 'G': 'Gly', 'H': 'His', 'I': 'Ile', - 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 'N': 'Asn', - 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser', - 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 'Y': 'Tyr', - 'X': 'Ter', '*': 'Ter', 'U': 'Sec' -} +def get_minimal_representation(pos, ref, alt): + """ + Get the minimal representation of a variant, based on the ref + alt alleles in a VCF + This is used to make sure that multiallelic variants in different datasets, + with different combinations of alternate alleles, can always be matched directly. + + Note that chromosome is ignored here - in xbrowse, we'll probably be dealing with 1D coordinates + Args: + pos (int): genomic position in a chromosome (1-based) + ref (str): ref allele string + alt (str): alt allele string + Returns: + tuple: (pos, ref, alt) of remapped coordinate + """ + pos = int(pos) + # If it's a simple SNV, don't remap anything + if len(ref) == 1 and len(alt) == 1: + return pos, ref, alt + + # strip off identical suffixes + while(alt[-1] == ref[-1] and min(len(alt),len(ref)) > 1): + alt = alt[:-1] + ref = ref[:-1] + # strip off identical prefixes and increment position + while(alt[0] == ref[0] and min(len(alt),len(ref)) > 1): + alt = alt[1:] + ref = ref[1:] + pos += 1 + return pos, ref, alt def get_proper_hgvs(csq): @@ -137,34 +203,35 @@ def get_proper_hgvs(csq): return get_protein_hgvs(csq) -def get_transcript_hgvs(csq): - return csq['HGVSc'].split(':')[-1] - - def get_protein_hgvs(annotation): """ Takes consequence dictionary, returns proper variant formatting for synonymous variants """ if '%3D' in annotation['HGVSp']: # "%3D" is "=" try: - amino_acids = ''.join([protein_letters_1to3[x] for x in annotation['Amino_acids']]) + amino_acids = ''.join([PROTEIN_LETTERS_1TO3[x] for x in annotation['Amino_acids']]) return "p." + amino_acids + annotation['Protein_position'] + amino_acids except KeyError: logging.error("Could not fetch protein hgvs - unknown amino acid") return annotation['HGVSp'].split(':')[-1] -def remove_extraneous_vep_annotations(annotation_list): - return [ann for ann in annotation_list if worst_csq_index(ann['Consequence'].split('&')) <= CSQ_ORDER_DICT['intron_variant']] +def get_transcript_hgvs(csq): + return csq['HGVSc'].split(':')[-1] -def worst_csq_index(csq_list): +def order_vep_by_csq(annotation_list): """ - Input list of consequences (e.g. ['frameshift_variant', 'missense_variant']) - Return index of the worst consequence (In this case, index of 'frameshift_variant', so 4) - Works well with worst_csq_index('non_coding_exon_variant&nc_transcript_variant'.split('&')) + Adds "major_consequence" to each annotation. + Returns them ordered from most deleterious to least. """ - return min([CSQ_ORDER_DICT[csq] for csq in csq_list]) + for ann in annotation_list: + ann['major_consequence'] = worst_csq_from_csq(ann['Consequence']) + return sorted(annotation_list, key=(lambda ann:CSQ_ORDER_DICT[ann['major_consequence']])) + + +def remove_extraneous_vep_annotations(annotation_list): + return [ann for ann in annotation_list if worst_csq_index(ann['Consequence'].split('&')) <= CSQ_ORDER_DICT['intron_variant']] def worst_csq_from_list(csq_list): @@ -178,73 +245,43 @@ def worst_csq_from_list(csq_list): def worst_csq_from_csq(csq): """ - Input possibly &-filled csq string (e.g. 'non_coding_exon_variant&nc_transcript_variant') - Return the worst consequence (In this case, 'non_coding_exon_variant') + Find worst consequence in a possibly &-filled consequence string + + Args: + csq (str): string of consequences, seperated with & (if multiple) + + Returns: + str: the worst consequence """ return REV_CSQ_ORDER_DICT[worst_csq_index(csq.split('&'))] -def order_vep_by_csq(annotation_list): +def worst_csq_index(csq_list): """ - Adds "major_consequence" to each annotation. - Returns them ordered from most deleterious to least. + Find the index of the worst consequence. + Corresponds to the lowest value (index) from CSQ_ORDER_DICT + + Args: + csq_list (list): consequences + + Returns: + int: index in CSQ_ODER_DICT of the worst consequence """ - for ann in annotation_list: - ann['major_consequence'] = worst_csq_from_csq(ann['Consequence']) - return sorted(annotation_list, key=(lambda ann:CSQ_ORDER_DICT[ann['major_consequence']])) + return min([CSQ_ORDER_DICT[csq] for csq in csq_list]) def worst_csq_with_vep(annotation_list): """ - Takes list of VEP annotations [{'Consequence': 'frameshift', Feature: 'ENST'}, ...] - Returns most severe annotation (as full VEP annotation [{'Consequence': 'frameshift', Feature: 'ENST'}]) - Also tacks on "major_consequence" for that annotation (i.e. worst_csq_from_csq) + Choose the vep annotation with the most severe consequence + + Args: + annotation_list (list): VEP annotations + + Returns: + dict: the annotation with the most severe consequence; also adds "major_consequence" for that annotation """ if not annotation_list: return None worst = max(annotation_list, key=annotation_severity) worst['major_consequence'] = worst_csq_from_csq(worst['Consequence']) return worst - - -def annotation_severity(annotation): - "Bigger is more important." - rv = -CSQ_ORDER_DICT[worst_csq_from_csq(annotation['Consequence'])] - if annotation['CANONICAL'] == 'YES': - rv += 0.1 - return rv - -CHROMOSOMES = ['chr%s' % x for x in range(1, 23)] -CHROMOSOMES.extend(['chrX', 'chrY', 'chrM']) -CHROMOSOME_TO_CODE = { item: i+1 for i, item in enumerate(CHROMOSOMES) } - - -def get_minimal_representation(pos, ref, alt): - """ - Get the minimal representation of a variant, based on the ref + alt alleles in a VCF - This is used to make sure that multiallelic variants in different datasets, - with different combinations of alternate alleles, can always be matched directly. - - Note that chromosome is ignored here - in xbrowse, we'll probably be dealing with 1D coordinates - Args: - pos (int): genomic position in a chromosome (1-based) - ref (str): ref allele string - alt (str): alt allele string - Returns: - tuple: (pos, ref, alt) of remapped coordinate - """ - pos = int(pos) - # If it's a simple SNV, don't remap anything - if len(ref) == 1 and len(alt) == 1: - return pos, ref, alt - - # strip off identical suffixes - while(alt[-1] == ref[-1] and min(len(alt),len(ref)) > 1): - alt = alt[:-1] - ref = ref[:-1] - # strip off identical prefixes and increment position - while(alt[0] == ref[0] and min(len(alt),len(ref)) > 1): - alt = alt[1:] - ref = ref[1:] - pos += 1 - return pos, ref, alt From dcc489f846dbb45eedcddd70063e144ddf6d6568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 23 Jan 2019 14:45:03 +0100 Subject: [PATCH 022/170] tests for get_flags_from_variant(), worst_csq_from_list(); removed get_minimal_representation as it seems unused --- backend/modules/browser/test_utils.py | 21 ++++++++++++------ backend/modules/browser/utils.py | 31 --------------------------- 2 files changed, 14 insertions(+), 38 deletions(-) diff --git a/backend/modules/browser/test_utils.py b/backend/modules/browser/test_utils.py index 828464ec0..0ce0c39a3 100644 --- a/backend/modules/browser/test_utils.py +++ b/backend/modules/browser/test_utils.py @@ -52,14 +52,20 @@ def test_get_flags_from_variant(): """ Test get_flags_from_variant() """ - assert False + fake_variant = {'vep_annotations':[{'LoF': 'LC', 'LoF_flags': 'something'}, + {'LoF': '', 'LoF_flags': ''}, + {'LoF': 'LC', 'LoF_flags': 'something'}]} + flags = utils.get_flags_from_variant(fake_variant) + assert flags == ['LC LoF', 'LoF flag'] + fake_variant = {'vep_annotations':[{'LoF': 'LC', 'LoF_flags': 'something'}, + {'LoF': 'HC', 'LoF_flags': 'something'}]} + flags = utils.get_flags_from_variant(fake_variant) + assert flags == ['LoF flag'] -def test_get_minimal_representation(): - """ - Test get_minimal_representation() - """ - assert False + fake_variant = {'mnps': 'no idea', 'vep_annotations':[]} + flags = utils.get_flags_from_variant(fake_variant) + assert flags == ['MNP'] def test_get_proper_hgvs(): @@ -113,7 +119,8 @@ def test_worst_csq_from_list(): """ Test worst_csq_from_list() """ - assert False + csqs = ['frameshift_variant', 'missense_variant'] + assert utils.worst_csq_from_list(csqs) == 'frameshift_variant' def test_worst_csq_index(): diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index 8257db98b..833d14023 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -164,37 +164,6 @@ def get_flags_from_variant(variant): return flags -def get_minimal_representation(pos, ref, alt): - """ - Get the minimal representation of a variant, based on the ref + alt alleles in a VCF - This is used to make sure that multiallelic variants in different datasets, - with different combinations of alternate alleles, can always be matched directly. - - Note that chromosome is ignored here - in xbrowse, we'll probably be dealing with 1D coordinates - Args: - pos (int): genomic position in a chromosome (1-based) - ref (str): ref allele string - alt (str): alt allele string - Returns: - tuple: (pos, ref, alt) of remapped coordinate - """ - pos = int(pos) - # If it's a simple SNV, don't remap anything - if len(ref) == 1 and len(alt) == 1: - return pos, ref, alt - - # strip off identical suffixes - while(alt[-1] == ref[-1] and min(len(alt),len(ref)) > 1): - alt = alt[:-1] - ref = ref[:-1] - # strip off identical prefixes and increment position - while(alt[0] == ref[0] and min(len(alt),len(ref)) > 1): - alt = alt[1:] - ref = ref[1:] - pos += 1 - return pos, ref, alt - - def get_proper_hgvs(csq): # Needs major_consequence if csq['major_consequence'] in ('splice_donor_variant', 'splice_acceptor_variant', 'splice_region_variant'): From d40d7a1fdce01701e3b755ed3048cef4de412ee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 24 Jan 2019 08:29:33 +0100 Subject: [PATCH 023/170] didn't confirm that genes were retrieved from the correct reference set --- backend/modules/browser/lookups.py | 28 ++++++++++++++++--------- backend/modules/browser/test_lookups.py | 2 +- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index e0364054f..18004f30a 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -240,20 +240,23 @@ def get_gene_by_name(dataset, gene_name): if not ref_dbid: return {} try: - return db.Gene.select().where(db.Gene.name==gene_name).dicts().get() + return db.Gene.select().where((db.Gene.reference_set == ref_dbid) & + (db.Gene.name==gene_name)).dicts().get() except db.Gene.DoesNotExist: try: - return db.Gene.select().where(db.Gene.other_names.contains(gene_name)).dicts().get() + return db.Gene.select().where((db.Gene.reference_set == ref_dbid) & + (db.Gene.other_names.contains(gene_name))).dicts().get() except db.Gene.DoesNotExist: logging.error('get_gene_by_name({}, {}): unable to retrieve gene'.format(dataset, gene_name)) return {} -def get_genes_in_region(chrom, start_pos, stop_pos): +def get_genes_in_region(dataset, chrom, start_pos, stop_pos): """ Retrieve genes located within a region Args: + dataset (str): short name of the dataset chrom (str): chromosome name start_pos (int): start of region stop_pos (int): end of region @@ -261,12 +264,17 @@ def get_genes_in_region(chrom, start_pos, stop_pos): Returns: dict: values for the gene; empty if not found """ + ref_dbid = db.get_reference_dbid_dataset(dataset) + if not ref_dbid: + return {} + try: - gene_query = db.Gene.select().where((((db.Gene.start >= start_pos) & - (db.Gene.start <= stop_pos)) | - ((db.Gene.stop >= start_pos) & - (db.Gene.stop <= stop_pos))) & - (db.Gene.chrom == chrom)).dicts() + gene_query = db.Gene.select().where((db.Gene.reference_set == ref_dbid) & + ((((db.Gene.start >= start_pos) & + (db.Gene.start <= stop_pos)) | + ((db.Gene.stop >= start_pos) & + (db.Gene.stop <= stop_pos))) & + (db.Gene.chrom == chrom))).dicts() return [gene for gene in gene_query] except db.Gene.DoesNotExist: logging.error('get_genes_in_region({}, {}, {}): no genes found'.format(chrom, start_pos, stop_pos)) @@ -395,7 +403,7 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): ds_version (str): version of the dataset Returns: - dict: values for the variant; empty if not found + dict: values for the variant; None if not found """ try: variant = get_raw_variant(dataset, pos, chrom, ref, alt, ds_version) @@ -408,7 +416,7 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): variant['rsid'] = 'rs{}'.format(variant['rsid']) return variant except db.Variant.DoesNotExist: - return {} + return def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/test_lookups.py index 1a3f3fe46..a6942d32f 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/test_lookups.py @@ -212,7 +212,7 @@ def test_get_genes_in_region(): """ Test get_genes_in_region() """ - res = lookups.get_genes_in_region('4', 99080000, 99210000) + res = lookups.get_genes_in_region('SweGen', '4', 99080000, 99210000) # stop_pos missing in db, so needs to be updated when available # exp_names = assert False From 7799446bd1ea1b65e664a4c95ce89abe2c26e8af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 24 Jan 2019 09:19:19 +0100 Subject: [PATCH 024/170] fixes to allow docker testing --- backend/modules/browser/lookups.py | 2 +- backend/modules/browser/pgsql.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 18004f30a..b834e6068 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -2,7 +2,7 @@ import db import logging -import utils +from . import utils SEARCH_LIMIT = 10000 diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index a8d761377..ddf696a42 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -4,9 +4,9 @@ import logging -from . import db +import db + from . import lookups -from .utils import get_xpos def get_autocomplete(dataset, query): @@ -25,5 +25,5 @@ def get_autocomplete(dataset, query): return gene_names -def get_variant_list: +def get_variant_list(): pass From 5f439784489a749a283d608c676eaff9923aa81a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 24 Jan 2019 09:19:48 +0100 Subject: [PATCH 025/170] start of conversion of handlers --- backend/modules/browser/browser_handlers.py | 148 ++++++++++++-------- 1 file changed, 91 insertions(+), 57 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 7dff60210..017a611c0 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -1,27 +1,38 @@ +import logging + +import db import handlers from . import lookups -from . import mongodb from . import pgsql -from .utils import get_xpos, add_consequence_to_variant, remove_extraneous_vep_annotations, \ - order_vep_by_csq, get_proper_hgvs +#from .utils import get_xpos, add_consequence_to_variant, remove_extraneous_vep_annotations, \ +# order_vep_by_csq, get_proper_hgvs +# maximum length of requested region (GetRegion) +REGION_LIMIT = 100000 class GetTranscript(handlers.UnsafeHandler): + """ + Request information about a transcript + """ def get(self, dataset, transcript): + """ + Request information about a transcript + + Args: + dataset (str): short name of the dataset + transcript (str): the transcript id + + Returns: + dict: transcript (transcript and exons), gene (gene information) + """ transcript_id = transcript ret = {'transcript':{}, 'gene':{}, } - db_shared = mongodb.connect_db(dataset, True) - if not db_shared: - self.set_user_msg("Could not connect to database.", "error") - self.finish( ret ) - return - # Add transcript information - transcript = lookups.get_transcript(db_shared, transcript_id) + transcript = lookups.get_transcript(dataset, transcript_id) ret['transcript']['id'] = transcript['transcript_id'] ret['transcript']['number_of_CDS'] = len([t for t in transcript['exons'] if t['feature_type'] == 'CDS']) @@ -31,30 +42,50 @@ def get(self, dataset, transcript): ret['exons'] += [{'start':exon['start'], 'stop':exon['stop'], 'type':exon['feature_type']}] # Add gene information - gene = lookups.get_gene(db_shared, transcript['gene_id']) + gene = lookups.get_gene(dataset, transcript['gene_id']) ret['gene']['id'] = gene['gene_id'] ret['gene']['name'] = gene['gene_name'] ret['gene']['full_name'] = gene['full_gene_name'] ret['gene']['canonical_transcript'] = gene['canonical_transcript'] - gene_transcripts = lookups.get_transcripts_in_gene(db_shared, transcript['gene_id']) + gene_transcripts = lookups.get_transcripts_in_gene(dataset, transcript['gene_id']) ret['gene']['transcripts'] = [g['transcript_id'] for g in gene_transcripts] - self.finish( ret ) + self.finish(ret) class GetRegion(handlers.UnsafeHandler): + """ + Request information about genes in a region + """ def get(self, dataset, region): + """ + Request information about genes in a region + + Args: + dataset (str): short name of the dataset + region (str): the region in the format chr-startpos-endpos + + Returns: + dict: information about the region and the genes found there + """ region = region.split('-') - REGION_LIMIT = 100000 chrom = region[0] start = None stop = None - if len(region) > 1: - start = int(region[1]) - if len(region) > 2: - stop = int(region[2]) + + try: + if len(region) > 1: + start = int(region[1]) + if len(region) > 2: + stop = int(region[2]) + except ValueError: + logging.error('GetRegion: unable to parse region ({})'.format(region)) + self.send_error(status_code=400) + self.set_user_msg('Unable to parse region', 'error') + return + if not start: start = 0 if not stop and start: @@ -69,13 +100,7 @@ def get(self, dataset, region): }, } - db_shared = mongodb.connect_db(dataset, True) - if not db_shared: - self.set_user_msg("Could not connect to database.", "error") - self.finish( ret ) - return - - genes_in_region = lookups.get_genes_in_region(db_shared, chrom, start, stop) + genes_in_region = lookups.get_genes_in_region(dataset, chrom, start, stop) if genes_in_region: ret['region']['genes'] = [] for gene in genes_in_region: @@ -84,69 +109,78 @@ def get(self, dataset, region): 'full_gene_name':gene['full_gene_name'], }] - self.finish( ret ) + self.finish(ret) class GetGene(handlers.UnsafeHandler): - def get(self, dataset, gene): + """ + Request information about a gene + """ + def get(self, dataset, gene, ds_version=None): + """ + Request information about a gene + + Args: + dataset (str): short name of the dataset + gene (str): the gene id + """ gene_id = gene ret = {'gene':{'gene_id': gene_id}} - db = mongodb.connect_db(dataset, False) - db_shared = mongodb.connect_db(dataset, True) - if not db_shared or not db: - self.set_user_msg("Could not connect to database.", "error") - self.finish( ret ) - return # Gene - gene = lookups.get_gene(db_shared, gene_id) - ret['gene'] = gene + gene = lookups.get_gene(dataset, gene_id) + if gene: + ret['gene'] = gene # Add exons from transcript - transcript = lookups.get_transcript(db_shared, gene['canonical_transcript']) + transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) ret['exons'] = [] for exon in sorted(transcript['exons'], key=lambda k: k['start']): ret['exons'] += [{'start':exon['start'], 'stop':exon['stop'], 'type':exon['feature_type']}] # Variants - ret['gene']['variants'] = lookups.get_number_of_variants_in_transcript(db, gene['canonical_transcript']) + ret['gene']['variants'] = lookups.get_number_of_variants_in_transcript(dataset, gene['canonical_transcript'], ds_version) # Transcripts - transcripts_in_gene = lookups.get_transcripts_in_gene(db_shared, gene_id) + transcripts_in_gene = lookups.get_transcripts_in_gene(dataset, gene_id) if transcripts_in_gene: ret['transcripts'] = [] for transcript in transcripts_in_gene: ret['transcripts'] += [{'transcript_id':transcript['transcript_id']}] - self.finish( ret ) + self.finish(ret) class GetVariant(handlers.UnsafeHandler): + """ + Request information about a gene + """ def get(self, dataset, variant): + """ + Request information about a gene + Args: + dataset (str): short name of the dataset + variant (str): variant in the format chrom-pos-ref-alt + """ ret = {'variant':{}} - db = mongodb.connect_db(dataset, False) - db_shared = mongodb.connect_db(dataset, True) - - if not db_shared or not db: - self.set_user_msg("Could not connect to database.", "error") - self.finish( ret ) - return - # Variant v = variant.split('-') - variant = lookups.get_variant(db, db_shared, get_xpos(v[0], int(v[1])), v[2], v[3]) - - if variant is None: - variant = { - 'chrom': v[0], - 'pos': int(v[1]), - 'xpos': get_xpos(v[0], int(v[1])), - 'ref': v[2], - 'alt': v[3] - } + try: + v[1] = int(v[1]) + except ValueError: + logging.error('GetVariant: unable to parse variant ({})'.format(variant)) + self.send_error(status_code=400) + self.set_user_msg('Unable to parse variant', 'error') + return + variant = lookups.get_variant(dataset, v[0], v[1], v[2], v[3]) + + if not variant: + self.send_error(status_code=404) + self.set_user_msg('Variant not found', 'error') + return # Just get the information we need for item in ["variant_id", "chrom", "pos", "ref", "alt", "filter", "rsid", "allele_num", From 8719831e413c08425d0bf20c57711fa42569f5f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 24 Jan 2019 12:50:08 +0100 Subject: [PATCH 026/170] more fixes, especially in GetVariation --- backend/modules/browser/browser_handlers.py | 48 +++++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 017a611c0..d104b78e2 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -1,3 +1,4 @@ +import json # remove when db is fixed import logging import db @@ -5,8 +6,9 @@ from . import lookups from . import pgsql -#from .utils import get_xpos, add_consequence_to_variant, remove_extraneous_vep_annotations, \ -# order_vep_by_csq, get_proper_hgvs + +from .utils import add_consequence_to_variant, remove_extraneous_vep_annotations, \ + order_vep_by_csq, get_proper_hgvs # maximum length of requested region (GetRegion) REGION_LIMIT = 100000 @@ -175,19 +177,22 @@ def get(self, dataset, variant): self.send_error(status_code=400) self.set_user_msg('Unable to parse variant', 'error') return - variant = lookups.get_variant(dataset, v[0], v[1], v[2], v[3]) + orig_variant = variant + variant = lookups.get_variant(dataset, v[1], v[0], v[2], v[3]) if not variant: + logging.error('Variant not found ({})'.format(orig_variant)) self.send_error(status_code=404) self.set_user_msg('Variant not found', 'error') return # Just get the information we need - for item in ["variant_id", "chrom", "pos", "ref", "alt", "filter", "rsid", "allele_num", + for item in ["variant_id", "chrom", "pos", "ref", "alt", "filter_string", "rsid", "allele_num", "allele_freq", "allele_count", "orig_alt_alleles", "site_quality", "quality_metrics", "transcripts", "genes"]: ret['variant'][item] = variant[item] + variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed # Variant Effect Predictor (VEP) annotations # https://www.ensembl.org/info/docs/tools/vep/vep_formats.html ret['variant']['consequences'] = [] @@ -229,11 +234,8 @@ def get(self, dataset, variant): ['Allele Frequency', 'freq']], 'datasets':{}, 'total':{}} - for item in ['ans', 'acs', 'freq', 'homs']: - key = 'pop_' + item - if key not in variant: - continue - for _dataset, value in variant[key].items(): + for item in ['ans', 'allele_count', 'allelle_freq', 'hom_count']: + for _dataset, value in variant['pop_' + item].items(): if _dataset not in frequencies['datasets']: frequencies['datasets'][_dataset] = {'pop':_dataset} frequencies['datasets'][_dataset][item] = value @@ -249,7 +251,13 @@ def get(self, dataset, variant): class GetVariants(handlers.UnsafeHandler): + """ + Retrieve variants + """ def get(self, dataset, datatype, item): + """ + Retrieve variants + """ ret = mongodb.get_variant_list(dataset, datatype, item) # inconvenient way of doing humpBack-conversion headers = [] @@ -273,18 +281,20 @@ def get(self, dataset, datatype, item): class Search(handlers.UnsafeHandler): + """ + Perform a search for the wanted object + """ def get(self, dataset, query): - ret = {"dataset": dataset, "value": None, "type": None} - - db = mongodb.connect_db(dataset, False) - db_shared = mongodb.connect_db(dataset, True) + """ + Perform a search for the wanted object - if not db_shared or not db: - self.set_user_msg("Could not connect to database.", "error") - self.finish( ret ) - return + Args: + dataset (str): short name of the dataset + query (str): search query + """ + ret = {"dataset": dataset, "value": None, "type": None} - datatype, identifier = lookups.get_awesomebar_result(db, db_shared, query) + datatype, identifier = lookups.get_awesomebar_result(dataset, query) if datatype == "dbsnp_variant_set": datatype = "dbsnp" @@ -292,7 +302,7 @@ def get(self, dataset, query): ret["type"] = datatype ret["value"] = identifier - self.finish( ret ) + self.finish(ret) class Autocomplete(handlers.UnsafeHandler): From 179b878a7c97d8e7548f00945e8908d65f150b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 24 Jan 2019 13:36:39 +0100 Subject: [PATCH 027/170] variation page is working, but with a few hacks to get around db problems --- backend/modules/browser/browser_handlers.py | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index d104b78e2..13504c4b0 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -187,10 +187,12 @@ def get(self, dataset, variant): return # Just get the information we need - for item in ["variant_id", "chrom", "pos", "ref", "alt", "filter_string", "rsid", "allele_num", + variant['quality_metrics'] = json.loads(variant['quality_metrics']) # remove when db is fixed + for item in ["variant_id", "chrom", "pos", "ref", "alt", "rsid", "allele_num", "allele_freq", "allele_count", "orig_alt_alleles", "site_quality", "quality_metrics", "transcripts", "genes"]: ret['variant'][item] = variant[item] + ret['variant']['filter'] = variant['filter_string'] variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed # Variant Effect Predictor (VEP) annotations @@ -234,18 +236,23 @@ def get(self, dataset, variant): ['Allele Frequency', 'freq']], 'datasets':{}, 'total':{}} - for item in ['ans', 'allele_count', 'allelle_freq', 'hom_count']: - for _dataset, value in variant['pop_' + item].items(): - if _dataset not in frequencies['datasets']: - frequencies['datasets'][_dataset] = {'pop':_dataset} - frequencies['datasets'][_dataset][item] = value - if item not in frequencies['total']: - frequencies['total'][item] = 0 - frequencies['total'][item] += value + term_map = {'allele_num':'ans', 'allele_count':'acs', 'allele_freq':'freq', 'hom_count':'homs'} + if dataset not in frequencies['datasets']: + frequencies['datasets'][dataset] = {'pop':dataset} + for item in term_map: + if item not in frequencies['total']: + frequencies['total'][term_map[item]] = 0 + if variant[item] is None: + frequencies['datasets'][dataset][term_map[item]] = 0 + frequencies['total'][term_map[item]] += 0 + else: + frequencies['datasets'][dataset][term_map[item]] = variant[item] + frequencies['total'][term_map[item]] += variant[item] if 'freq' in frequencies['total']: frequencies['total']['freq'] /= len(frequencies['datasets'].keys()) ret['variant']['pop_freq'] = frequencies + logging.error(ret) self.finish( ret ) From 416cfd0dc2a2f43350469244e1a1c17fcad0e122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 28 Jan 2019 11:03:39 +0100 Subject: [PATCH 028/170] new function for retrieving genes by dbid; mixed updates to get all functions running --- backend/modules/browser/browser_handlers.py | 22 +++++-- backend/modules/browser/lookups.py | 68 +++++++++++++++------ 2 files changed, 68 insertions(+), 22 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 13504c4b0..3be480ac1 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -8,7 +8,7 @@ from . import pgsql from .utils import add_consequence_to_variant, remove_extraneous_vep_annotations, \ - order_vep_by_csq, get_proper_hgvs + order_vep_by_csq, get_proper_hgvs # maximum length of requested region (GetRegion) REGION_LIMIT = 100000 @@ -110,7 +110,6 @@ def get(self, dataset, region): 'gene_name':gene['gene_name'], 'full_gene_name':gene['full_gene_name'], }] - self.finish(ret) @@ -264,8 +263,13 @@ class GetVariants(handlers.UnsafeHandler): def get(self, dataset, datatype, item): """ Retrieve variants + + Args: + dataset (str): short name of the dataset + datatype (str): gene, region, or transcript + item (str): item to query """ - ret = mongodb.get_variant_list(dataset, datatype, item) + ret = pgsql.get_variant_list(dataset, datatype, item) # inconvenient way of doing humpBack-conversion headers = [] for a, h in ret['headers']: @@ -276,14 +280,22 @@ def get(self, dataset, datatype, item): class GetCoverage(handlers.UnsafeHandler): + """ + Retrieve coverage + """ def get(self, dataset, datatype, item): - ret = mongodb.get_coverage(dataset, datatype, item) + # ret = mongodb.get_coverage(dataset, datatype, item) + ret = None self.finish( ret ) class GetCoveragePos(handlers.UnsafeHandler): + """ + Retrieve coverage + """ def get(self, dataset, datatype, item): - ret = mongodb.get_coverage_pos(dataset, datatype, item) + # ret = mongodb.get_coverage_pos(dataset, datatype, item) + ret = None self.finish( ret ) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index b834e6068..90562d147 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -1,6 +1,9 @@ + +import json # remove when db is fixed +import logging import re + import db -import logging from . import utils @@ -225,6 +228,26 @@ def get_gene(dataset, gene_id): return {} +def get_gene_by_dbid(dataset, gene_dbid): + """ + Retrieve gene by gene database id + + Args: + dataset (str): short name of the dataset + gene_dbid (str): the database id of the gene + + Returns: + dict: values for the gene; empty if not found + """ + ref_dbid = db.get_reference_dbid_dataset(dataset) + if not ref_dbid: + return {} + try: + return db.Gene.select().where(db.Gene.id == id) + except db.Gene.DoesNotExist: + return {} + + def get_gene_by_name(dataset, gene_name): """ Retrieve gene by gene_name. @@ -292,16 +315,13 @@ def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None Returns: dict: {filtered: nr_filtered, total: nr_total} """ - # will be implemented after database is updated - raise NotImplementedError - - dataset_version = db.get_dataset_version() + dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: return - transcript = db.Transcript.select().where(db.Transcript.transcript_id) - total = db.variants.count({'transcripts': transcript_id}) - filtered = db.variants.count({'transcripts': transcript_id, 'filter': 'PASS'}) + variants = get_variants_in_transcript(dataset, transcript_id) + total = len(variants) + filtered = len(tuple(variant for variant in variants if variant['filter_string'] == 'PASS')) return {'filtered': filtered, 'total': total} @@ -335,7 +355,7 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): .dicts() .get()) except db.Variant.DoesNotExist: - logging.error(('get_raw_variant({}, {}, {}, {}, {}, {})'.format(dataset, pos, chrom, ref, alt, ds_version) + + logging.error(('get_raw_variant({}, {}, {}, {}, {}, {})'.format(dataset, pos, chrom, ref, alt, dataset_version.id) + ': unable to retrieve variant')) return {} @@ -491,8 +511,14 @@ def get_variants_in_gene(dataset, gene_id): list: values for the variants """ ref_dbid = db.get_reference_dbid_dataset(dataset) + variants = [variant for variant in db.Variant.select().where(db.Variant.genes.contains(transcript_id)).dicts()] # db.Variant.select().where(db.Variant.gene.contains(re variants = [] + ##### remove when db is fixed + for variant in variants: + variant['vep_annotations'] = json.loads(variant['vep_annotations']) + ##### + for variant in db.variants.find({'genes': gene_id}, projection={'_id': False}): variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Gene'] == gene_id] add_consequence_to_variant(variant) @@ -526,13 +552,19 @@ def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): (db.Variant.dataset_version == dataset_version)) .dicts()) variants = [variant for variant in query] - # add_consequence_to_variants(variants) - #for variant in variants: - # remove_extraneous_information(variant) + + ##### remove when db is fixed + for variant in variants: + variant['vep_annotations'] = json.loads(variant['vep_annotations']) + ##### + + utils.add_consequence_to_variants(variants) + for variant in variants: + remove_extraneous_information(variant) return variants -def get_variants_in_transcript(transcript_id): +def get_variants_in_transcript(dataset, transcript_id): """ Retrieve variants inside a transcript @@ -546,8 +578,13 @@ def get_variants_in_transcript(transcript_id): dict: values for the variant; empty if not found """ variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] + ##### remove when db is fixed + for variant in variants: + variant['vep_annotations'] = json.loads(variant['vep_annotations']) + ##### + for variant in variants: - variant['vep_annotations'] = [annotation for annotation in variant['vep_annotations'] if x['Feature'] == transcript_id] + variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] add_consequence_to_variant(variant) remove_extraneous_information(variant) return variants @@ -559,8 +596,5 @@ def remove_extraneous_information(variant): del variant['transcripts'] del variant['genes'] del variant['orig_alt_alleles'] - del variant['xpos'] - del variant['xstart'] - del variant['xstop'] del variant['site_quality'] del variant['vep_annotations'] From dccf4d9a0db6105a7e2e2fbe6a22372cfbdee6ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 28 Jan 2019 11:04:49 +0100 Subject: [PATCH 029/170] functions migrated from mongodb.py to pgsql.py, bug fixes --- backend/modules/browser/browser_handlers.py | 16 ++-- backend/modules/browser/pgsql.py | 97 ++++++++++++++++++++- 2 files changed, 102 insertions(+), 11 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 3be480ac1..087b0a672 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -44,7 +44,7 @@ def get(self, dataset, transcript): ret['exons'] += [{'start':exon['start'], 'stop':exon['stop'], 'type':exon['feature_type']}] # Add gene information - gene = lookups.get_gene(dataset, transcript['gene_id']) + gene = lookups.get_gene_by_dbid(dataset, transcript['gene']) ret['gene']['id'] = gene['gene_id'] ret['gene']['name'] = gene['gene_name'] ret['gene']['full_name'] = gene['full_gene_name'] @@ -283,20 +283,18 @@ class GetCoverage(handlers.UnsafeHandler): """ Retrieve coverage """ - def get(self, dataset, datatype, item): - # ret = mongodb.get_coverage(dataset, datatype, item) - ret = None - self.finish( ret ) + def get(self, dataset, datatype, item, ds_version=None): + ret = pgsql.get_coverage(dataset, datatype, item, ds_version) + self.finish(ret) class GetCoveragePos(handlers.UnsafeHandler): """ - Retrieve coverage + Retrieve coverage range """ def get(self, dataset, datatype, item): - # ret = mongodb.get_coverage_pos(dataset, datatype, item) - ret = None - self.finish( ret ) + ret = pgsql.get_coverage_pos(dataset, datatype, item) + self.finish(ret) class Search(handlers.UnsafeHandler): diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index ddf696a42..7298e3b7c 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -8,6 +8,7 @@ from . import lookups +EXON_PADDING = 50 def get_autocomplete(dataset, query): """ @@ -25,5 +26,97 @@ def get_autocomplete(dataset, query): return gene_names -def get_variant_list(): - pass +def get_variant_list(dataset, datatype, item): + headers = [['variant_id','Variant'], ['chrom','Chrom'], ['pos','Position'], + ['HGVS','Consequence'], ['filter','Filter'], ['major_consequence','Annotation'], + ['flags','Flags'], ['allele_count','Allele Count'], ['allele_num','Allele Number'], + ['hom_count','Number of Homozygous Alleles'], ['allele_freq','Allele Frequency']] + + if datatype == 'gene': + variants = lookups.get_variants_in_gene(dataset, item) + elif datatype == 'region': + chrom, start, stop = item.split('-') + variants = lookups.get_variants_in_region(dataset, chrom, start, stop) + elif datatype == 'transcript': + variants = lookups.get_variants_in_transcript(dataset, item) + + # Format output + def format_variant(variant): + if variant['rsid'] == '.': + variant['rsid'] = '' + variant['major_consequence'] = (variant['major_consequence'].replace('_variant','') + .replace('_prime_', '\'') + .replace('_', ' ')) + + # This is so an array values turns into a comma separated string instead + return {k: ", ".join(v) if isinstance(v,list) else v for k, v in variant.items()} + + variants = list(map(format_variant, variants)) + logging.error('VARIANTS_POST : ' + str(variants) + str(len(variants))) + return {'variants': variants, 'headers': headers} + + +def get_coverage(dataset, datatype, item, ds_version=None): + """ + Retrieve coverage for a gene/region/transcript + + Args: + dataset (str): short name of the dataset + datatype (str): type of "region" (gene/region/transcript) + item (str): the datatype item to look up + ds_version (str): the dataset version + """ + ret = {'coverage':[]} + + if datatype == 'gene': + gene = lookups.get_gene(dataset, item) + transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) + start = transcript['start'] - EXON_PADDING + stop = transcript['stop'] + EXON_PADDING + ret['coverage'] = lookups.get_coverage_for_transcript(dataset, transcript['chrom'], start, stop, ds_version) + elif datatype == 'region': + chrom, start, stop = item.split('-') + start = int(start) + stop = int(stop) + ret['coverage'] = lookups.get_coverage_for_bases(dataset, chrom, start, stop, ds_version) + elif datatype == 'transcript': + transcript = lookups.get_transcript(dataset, item) + start = transcript['start'] - EXON_PADDING + stop = transcript['stop'] + EXON_PADDING + ret['coverage'] = lookups.get_coverage_for_transcript(dataset, start, stop, ds_version) + + return ret + + +def get_coverage_pos(dataset, datatype, item): + """ + Retrieve coverage range + + Args: + dataset (str): short name of the dataset + datatype (str): type of "region" (gene/region/transcript) + item (str): the datatype item to look up + ds_version (str): the dataset version + """ + ret = {'start':None, 'stop':None, 'chrom':None} + + if datatype == 'gene': + gene = lookups.get_gene(dataset, item) + transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) + elif datatype == 'transcript': + transcript = lookups.get_transcript(dataset, item) + + if datatype == 'region': + chrom, start, stop = item.split('-') + start = int(start) + stop = int(stop) + else: + start = transcript['start'] - EXON_PADDING + stop = transcript['stop'] + EXON_PADDING + chrom = transcript['chrom'] + + ret['start'] = start + ret['stop'] = stop + ret['chrom'] = chrom + + return ret From a882929e963b753846435be6231379a428f883db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 28 Jan 2019 14:08:31 +0100 Subject: [PATCH 030/170] A couple of ugly hacks, but now seems to work for everything but gene. Most hacks can be removed once database is fixed --- backend/modules/browser/browser_handlers.py | 8 +-- backend/modules/browser/lookups.py | 72 +++++++++++++-------- backend/modules/browser/pgsql.py | 4 +- backend/modules/browser/settings.py | 16 ----- 4 files changed, 50 insertions(+), 50 deletions(-) delete mode 100644 backend/modules/browser/settings.py diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 087b0a672..26202361f 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -46,13 +46,14 @@ def get(self, dataset, transcript): # Add gene information gene = lookups.get_gene_by_dbid(dataset, transcript['gene']) ret['gene']['id'] = gene['gene_id'] - ret['gene']['name'] = gene['gene_name'] - ret['gene']['full_name'] = gene['full_gene_name'] + ret['gene']['name'] = gene['name'] + ret['gene']['full_name'] = gene['full_name'] ret['gene']['canonical_transcript'] = gene['canonical_transcript'] - gene_transcripts = lookups.get_transcripts_in_gene(dataset, transcript['gene_id']) + gene_transcripts = lookups.get_transcripts_in_gene_by_dbid(transcript['gene']) ret['gene']['transcripts'] = [g['transcript_id'] for g in gene_transcripts] + logging.error('Transcript with data {}'.format(ret)) self.finish(ret) @@ -251,7 +252,6 @@ def get(self, dataset, variant): frequencies['total']['freq'] /= len(frequencies['datasets'].keys()) ret['variant']['pop_freq'] = frequencies - logging.error(ret) self.finish( ret ) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 90562d147..f2bb48742 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -202,8 +202,9 @@ def get_exons_in_transcript(dataset, transcript_id): except db.Transcript.DoesNotExist: logging.error('get_exons_in_transcript({}, {}): unable to retrieve transcript'.format(dataset, transcript_id)) return + wanted_types = ('CDS', 'UTR', 'exon') return sorted(list(db.Feature.select().where((db.Feature.transcript == transcript) & - (db.Feature.feature_type == 'exon')).dicts()), + (db.Feature.feature_type in wanted_types)).dicts()), key=lambda k: k['start']) @@ -243,7 +244,7 @@ def get_gene_by_dbid(dataset, gene_dbid): if not ref_dbid: return {} try: - return db.Gene.select().where(db.Gene.id == id) + return db.Gene.select().where(db.Gene.id == gene_dbid).dicts().get() except db.Gene.DoesNotExist: return {} @@ -409,6 +410,21 @@ def get_transcripts_in_gene(dataset, gene_id): return [] +def get_transcripts_in_gene_by_dbid(gene_dbid): + """ + Get the transcripts associated with a gene + Args: + gene_dbid (str): database id of the gene + Returns: + list: transcripts (dict) associated with the gene; empty if no hits + """ + try: + return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene_dbid).dicts()] + except db.Gene.DoesNotExist or db.Transcript.DoesNotExist: + logging.error('get_transcripts_in_gene({}): no matching transcripts'.format(gene_dbid)) + return [] + + def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): """ Retrieve variant by position and change @@ -511,19 +527,16 @@ def get_variants_in_gene(dataset, gene_id): list: values for the variants """ ref_dbid = db.get_reference_dbid_dataset(dataset) - variants = [variant for variant in db.Variant.select().where(db.Variant.genes.contains(transcript_id)).dicts()] -# db.Variant.select().where(db.Variant.gene.contains(re - variants = [] - ##### remove when db is fixed - for variant in variants: - variant['vep_annotations'] = json.loads(variant['vep_annotations']) - ##### - - for variant in db.variants.find({'genes': gene_id}, projection={'_id': False}): - variant['vep_annotations'] = [x for x in variant['vep_annotations'] if x['Gene'] == gene_id] - add_consequence_to_variant(variant) - remove_extraneous_information(variant) - variants.append(variant) + gene = get_gene(dataset, gene_id) + # temporary while waiting for db fix + variants = get_variants_in_region(dataset, gene['chrom'], gene['start'], gene['stop']) + # variants = [variant for variant in db.Variant.select().where(db.Variant.genes.contains(transcript_id)).dicts()] + +# for variant in variants: +# variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Gene'] == gene_id] +# add_consequence_to_variant(variant) +# remove_extraneous_information(variant) +# variants.append(variant) return variants @@ -555,11 +568,17 @@ def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): ##### remove when db is fixed for variant in variants: + variant['quality_metrics'] = json.loads(variant['quality_metrics']) variant['vep_annotations'] = json.loads(variant['vep_annotations']) + variant['hom_count'] = 0 + variant['filter'] = variant['filter_string'] ##### utils.add_consequence_to_variants(variants) for variant in variants: + if variant['rsid']: + variant['rsid'] = 'rs{}'.format(variant['rsid']) + # add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants @@ -569,24 +588,21 @@ def get_variants_in_transcript(dataset, transcript_id): Retrieve variants inside a transcript Args: - pos (int): position of the variant - chrom (str): name of the chromosome - ref (str): reference sequence - ref (str): variant sequence + dataset (str): short name of the dataset + transcript_id (str): id of the transcript (ENST) Returns: dict: values for the variant; empty if not found """ - variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] - ##### remove when db is fixed - for variant in variants: - variant['vep_annotations'] = json.loads(variant['vep_annotations']) - ##### + transcript = get_transcript(dataset, transcript_id) + # temporary while waiting for db fix + variants = get_variants_in_region(dataset, transcript['chrom'], transcript['start'], transcript['stop']) + # variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] - for variant in variants: - variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] - add_consequence_to_variant(variant) - remove_extraneous_information(variant) +# for variant in variants: +# variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] +# add_consequence_to_variant(variant) +# remove_extraneous_information(variant) return variants diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index 7298e3b7c..b549a5c25 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -52,7 +52,6 @@ def format_variant(variant): return {k: ", ".join(v) if isinstance(v,list) else v for k, v in variant.items()} variants = list(map(format_variant, variants)) - logging.error('VARIANTS_POST : ' + str(variants) + str(len(variants))) return {'variants': variants, 'headers': headers} @@ -83,7 +82,7 @@ def get_coverage(dataset, datatype, item, ds_version=None): transcript = lookups.get_transcript(dataset, item) start = transcript['start'] - EXON_PADDING stop = transcript['stop'] + EXON_PADDING - ret['coverage'] = lookups.get_coverage_for_transcript(dataset, start, stop, ds_version) + ret['coverage'] = lookups.get_coverage_for_transcript(dataset, transcript['chrom'], start, stop, ds_version) return ret @@ -120,3 +119,4 @@ def get_coverage_pos(dataset, datatype, item): ret['chrom'] = chrom return ret + diff --git a/backend/modules/browser/settings.py b/backend/modules/browser/settings.py deleted file mode 100644 index 875a002bd..000000000 --- a/backend/modules/browser/settings.py +++ /dev/null @@ -1,16 +0,0 @@ -import json - -try: - json_settings_fh = open("settings.json") -except FileNotFoundError: - json_settings_fh = open("../settings.json") - -json_settings = json.load(json_settings_fh) -json_settings_fh.close() - -# Mongodb settings -mongo_host = json_settings["mongoHost"] -mongo_port = json_settings["mongoPort"] -mongo_user = json_settings["mongoUser"] -mongo_password = json_settings["mongoPassword"] -mongo_databases = json_settings["mongoDatabases"] From a8341eb6737565d4af21cd0daed0b1b5a7081b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 28 Jan 2019 14:11:22 +0100 Subject: [PATCH 031/170] fixed class order --- backend/modules/browser/browser_handlers.py | 181 ++++++++++---------- 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 26202361f..345281470 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -13,47 +13,86 @@ # maximum length of requested region (GetRegion) REGION_LIMIT = 100000 -class GetTranscript(handlers.UnsafeHandler): +class Autocomplete(handlers.UnsafeHandler): + def get(self, dataset, query): + ret = {} + + results = pgsql.get_autocomplete(dataset, query) + ret = {'values': sorted(list(set(results)))[:20]} + + self.finish( ret ) + + +class GetCoverage(handlers.UnsafeHandler): """ - Request information about a transcript + Retrieve coverage """ - def get(self, dataset, transcript): + def get(self, dataset, datatype, item, ds_version=None): + ret = pgsql.get_coverage(dataset, datatype, item, ds_version) + self.finish(ret) + + +class GetCoveragePos(handlers.UnsafeHandler): + """ + Retrieve coverage range + """ + def get(self, dataset, datatype, item): + ret = pgsql.get_coverage_pos(dataset, datatype, item) + self.finish(ret) + + +class Download(handlers.UnsafeHandler): + def get(self, dataset, datatype, item): + filename = "{}_{}_{}.csv".format(dataset, datatype, item) + self.set_header('Content-Type','text/csv') + self.set_header('content-Disposition','attachement; filename={}'.format(filename)) + + data = mongodb.get_variant_list(dataset, datatype, item) + # Write header + self.write(','.join([h[1] for h in data['headers']]) + '\n') + + for variant in data['variants']: + headers = [h[0] for h in data['headers']] + self.write(','.join(map(str, [variant[h] for h in headers])) + '\n') + + +class GetGene(handlers.UnsafeHandler): + """ + Request information about a gene + """ + def get(self, dataset, gene, ds_version=None): """ - Request information about a transcript + Request information about a gene Args: dataset (str): short name of the dataset - transcript (str): the transcript id - - Returns: - dict: transcript (transcript and exons), gene (gene information) + gene (str): the gene id """ - transcript_id = transcript - ret = {'transcript':{}, - 'gene':{}, - } + gene_id = gene - # Add transcript information - transcript = lookups.get_transcript(dataset, transcript_id) - ret['transcript']['id'] = transcript['transcript_id'] - ret['transcript']['number_of_CDS'] = len([t for t in transcript['exons'] if t['feature_type'] == 'CDS']) + ret = {'gene':{'gene_id': gene_id}} - # Add exon information + # Gene + gene = lookups.get_gene(dataset, gene_id) + if gene: + ret['gene'] = gene + + # Add exons from transcript + transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) ret['exons'] = [] for exon in sorted(transcript['exons'], key=lambda k: k['start']): ret['exons'] += [{'start':exon['start'], 'stop':exon['stop'], 'type':exon['feature_type']}] - # Add gene information - gene = lookups.get_gene_by_dbid(dataset, transcript['gene']) - ret['gene']['id'] = gene['gene_id'] - ret['gene']['name'] = gene['name'] - ret['gene']['full_name'] = gene['full_name'] - ret['gene']['canonical_transcript'] = gene['canonical_transcript'] + # Variants + ret['gene']['variants'] = lookups.get_number_of_variants_in_transcript(dataset, gene['canonical_transcript'], ds_version) - gene_transcripts = lookups.get_transcripts_in_gene_by_dbid(transcript['gene']) - ret['gene']['transcripts'] = [g['transcript_id'] for g in gene_transcripts] + # Transcripts + transcripts_in_gene = lookups.get_transcripts_in_gene(dataset, gene_id) + if transcripts_in_gene: + ret['transcripts'] = [] + for transcript in transcripts_in_gene: + ret['transcripts'] += [{'transcript_id':transcript['transcript_id']}] - logging.error('Transcript with data {}'.format(ret)) self.finish(ret) @@ -114,42 +153,45 @@ def get(self, dataset, region): self.finish(ret) -class GetGene(handlers.UnsafeHandler): +class GetTranscript(handlers.UnsafeHandler): """ - Request information about a gene + Request information about a transcript """ - def get(self, dataset, gene, ds_version=None): + def get(self, dataset, transcript): """ - Request information about a gene + Request information about a transcript Args: dataset (str): short name of the dataset - gene (str): the gene id - """ - gene_id = gene + transcript (str): the transcript id - ret = {'gene':{'gene_id': gene_id}} + Returns: + dict: transcript (transcript and exons), gene (gene information) + """ + transcript_id = transcript + ret = {'transcript':{}, + 'gene':{}, + } - # Gene - gene = lookups.get_gene(dataset, gene_id) - if gene: - ret['gene'] = gene + # Add transcript information + transcript = lookups.get_transcript(dataset, transcript_id) + ret['transcript']['id'] = transcript['transcript_id'] + ret['transcript']['number_of_CDS'] = len([t for t in transcript['exons'] if t['feature_type'] == 'CDS']) - # Add exons from transcript - transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) + # Add exon information ret['exons'] = [] for exon in sorted(transcript['exons'], key=lambda k: k['start']): ret['exons'] += [{'start':exon['start'], 'stop':exon['stop'], 'type':exon['feature_type']}] - # Variants - ret['gene']['variants'] = lookups.get_number_of_variants_in_transcript(dataset, gene['canonical_transcript'], ds_version) + # Add gene information + gene = lookups.get_gene_by_dbid(dataset, transcript['gene']) + ret['gene']['id'] = gene['gene_id'] + ret['gene']['name'] = gene['name'] + ret['gene']['full_name'] = gene['full_name'] + ret['gene']['canonical_transcript'] = gene['canonical_transcript'] - # Transcripts - transcripts_in_gene = lookups.get_transcripts_in_gene(dataset, gene_id) - if transcripts_in_gene: - ret['transcripts'] = [] - for transcript in transcripts_in_gene: - ret['transcripts'] += [{'transcript_id':transcript['transcript_id']}] + gene_transcripts = lookups.get_transcripts_in_gene_by_dbid(transcript['gene']) + ret['gene']['transcripts'] = [g['transcript_id'] for g in gene_transcripts] self.finish(ret) @@ -279,24 +321,6 @@ def get(self, dataset, datatype, item): self.finish( ret ) -class GetCoverage(handlers.UnsafeHandler): - """ - Retrieve coverage - """ - def get(self, dataset, datatype, item, ds_version=None): - ret = pgsql.get_coverage(dataset, datatype, item, ds_version) - self.finish(ret) - - -class GetCoveragePos(handlers.UnsafeHandler): - """ - Retrieve coverage range - """ - def get(self, dataset, datatype, item): - ret = pgsql.get_coverage_pos(dataset, datatype, item) - self.finish(ret) - - class Search(handlers.UnsafeHandler): """ Perform a search for the wanted object @@ -320,28 +344,3 @@ def get(self, dataset, query): ret["value"] = identifier self.finish(ret) - - -class Autocomplete(handlers.UnsafeHandler): - def get(self, dataset, query): - ret = {} - - results = pgsql.get_autocomplete(dataset, query) - ret = {'values': sorted(list(set(results)))[:20]} - - self.finish( ret ) - - -class Download(handlers.UnsafeHandler): - def get(self, dataset, datatype, item): - filename = "{}_{}_{}.csv".format(dataset, datatype, item) - self.set_header('Content-Type','text/csv') - self.set_header('content-Disposition','attachement; filename={}'.format(filename)) - - data = mongodb.get_variant_list(dataset, datatype, item) - # Write header - self.write(','.join([h[1] for h in data['headers']]) + '\n') - - for variant in data['variants']: - headers = [h[0] for h in data['headers']] - self.write(','.join(map(str, [variant[h] for h in headers])) + '\n') From 02cb50f6464567aea57f2c9170d11edd4fa7a466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 28 Jan 2019 14:39:56 +0100 Subject: [PATCH 032/170] multiple fixes to get a working system with the current database --- backend/modules/browser/browser_handlers.py | 10 ++++ backend/modules/browser/lookups.py | 8 ++- backend/modules/browser/pgsql.py | 57 ++++++++++----------- 3 files changed, 45 insertions(+), 30 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 345281470..66319723c 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -74,6 +74,9 @@ def get(self, dataset, gene, ds_version=None): # Gene gene = lookups.get_gene(dataset, gene_id) + #### Remove when db is fixed + gene['stop'] = gene['start'] + 20000 + #### if gene: ret['gene'] = gene @@ -93,6 +96,11 @@ def get(self, dataset, gene, ds_version=None): for transcript in transcripts_in_gene: ret['transcripts'] += [{'transcript_id':transcript['transcript_id']}] + + # temporary fix for names + gene['gene_name'] = gene['name'] + gene['full_gene_name'] = gene['full_name'] + self.finish(ret) @@ -318,6 +326,8 @@ def get(self, dataset, datatype, item): n = a[0] + "".join([b[0].upper() + b[1:] for b in a.split("_")])[1:] headers += [[n, h]] ret['headers'] = headers + logging.error('Variant request {} items'.format(len(ret))) + logging.error('Variant request {} items'.format(ret)) self.finish( ret ) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index f2bb48742..639896429 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -529,7 +529,13 @@ def get_variants_in_gene(dataset, gene_id): ref_dbid = db.get_reference_dbid_dataset(dataset) gene = get_gene(dataset, gene_id) # temporary while waiting for db fix - variants = get_variants_in_region(dataset, gene['chrom'], gene['start'], gene['stop']) + logging.error('Found gene {}'.format(gene)) + #### remove when db is fixed + gene['stop'] = gene['start'] + 20000 + #### + + variants = get_variants_in_region(dataset, gene['chrom'], gene['start'], gene['stop']) + # variants = [variant for variant in db.Variant.select().where(db.Variant.genes.contains(transcript_id)).dicts()] # for variant in variants: diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index b549a5c25..b4c1cd53f 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -26,35 +26,6 @@ def get_autocomplete(dataset, query): return gene_names -def get_variant_list(dataset, datatype, item): - headers = [['variant_id','Variant'], ['chrom','Chrom'], ['pos','Position'], - ['HGVS','Consequence'], ['filter','Filter'], ['major_consequence','Annotation'], - ['flags','Flags'], ['allele_count','Allele Count'], ['allele_num','Allele Number'], - ['hom_count','Number of Homozygous Alleles'], ['allele_freq','Allele Frequency']] - - if datatype == 'gene': - variants = lookups.get_variants_in_gene(dataset, item) - elif datatype == 'region': - chrom, start, stop = item.split('-') - variants = lookups.get_variants_in_region(dataset, chrom, start, stop) - elif datatype == 'transcript': - variants = lookups.get_variants_in_transcript(dataset, item) - - # Format output - def format_variant(variant): - if variant['rsid'] == '.': - variant['rsid'] = '' - variant['major_consequence'] = (variant['major_consequence'].replace('_variant','') - .replace('_prime_', '\'') - .replace('_', ' ')) - - # This is so an array values turns into a comma separated string instead - return {k: ", ".join(v) if isinstance(v,list) else v for k, v in variant.items()} - - variants = list(map(format_variant, variants)) - return {'variants': variants, 'headers': headers} - - def get_coverage(dataset, datatype, item, ds_version=None): """ Retrieve coverage for a gene/region/transcript @@ -120,3 +91,31 @@ def get_coverage_pos(dataset, datatype, item): return ret + +def get_variant_list(dataset, datatype, item): + headers = [['variant_id','Variant'], ['chrom','Chrom'], ['pos','Position'], + ['HGVS','Consequence'], ['filter','Filter'], ['major_consequence','Annotation'], + ['flags','Flags'], ['allele_count','Allele Count'], ['allele_num','Allele Number'], + ['hom_count','Number of Homozygous Alleles'], ['allele_freq','Allele Frequency']] + + if datatype == 'gene': + variants = lookups.get_variants_in_gene(dataset, item) + elif datatype == 'region': + chrom, start, stop = item.split('-') + variants = lookups.get_variants_in_region(dataset, chrom, start, stop) + elif datatype == 'transcript': + variants = lookups.get_variants_in_transcript(dataset, item) + + # Format output + def format_variant(variant): + if variant['rsid'] == '.': + variant['rsid'] = '' + variant['major_consequence'] = (variant['major_consequence'].replace('_variant','') + .replace('_prime_', '\'') + .replace('_', ' ')) + + # This is so an array values turns into a comma separated string instead + return {k: ", ".join(v) if isinstance(v,list) else v for k, v in variant.items()} + + variants = list(map(format_variant, variants)) + return {'variants': variants, 'headers': headers} From 2b6aa28611239a22c12c95779878846ef15590f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 28 Jan 2019 16:12:39 +0100 Subject: [PATCH 033/170] test support added, mongodb removed --- backend/modules/browser/tests/__init__.py | 0 backend/modules/browser/{ => tests}/test_lookups.py | 2 +- backend/modules/browser/{ => tests}/test_utils.py | 6 ++++-- backend/run_pytest.sh | 5 +++++ backend/test_requirements.txt | 1 + 5 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 backend/modules/browser/tests/__init__.py rename backend/modules/browser/{ => tests}/test_lookups.py (99%) rename backend/modules/browser/{ => tests}/test_utils.py (75%) create mode 100755 backend/run_pytest.sh create mode 100644 backend/test_requirements.txt diff --git a/backend/modules/browser/tests/__init__.py b/backend/modules/browser/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/modules/browser/test_lookups.py b/backend/modules/browser/tests/test_lookups.py similarity index 99% rename from backend/modules/browser/test_lookups.py rename to backend/modules/browser/tests/test_lookups.py index a6942d32f..64e959ecb 100644 --- a/backend/modules/browser/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -2,7 +2,7 @@ Tests for the functions available in lookups.py """ -import lookups +from .. import lookups def test_add_rsid_to_variant(): diff --git a/backend/modules/browser/test_utils.py b/backend/modules/browser/tests/test_utils.py similarity index 75% rename from backend/modules/browser/test_utils.py rename to backend/modules/browser/tests/test_utils.py index 0ce0c39a3..df560c615 100644 --- a/backend/modules/browser/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -2,8 +2,8 @@ Tests for utils.py """ -import lookups -import utils +from .. import lookups +from .. import utils import json @@ -79,6 +79,8 @@ def test_get_protein_hgvs(): """ Test get_protein_hgvs() """ + annotation = {'MAX_AF_POPS': 'AA&gnomAD_AMR&gnomAD_ASJ&gnomAD_EAS&gnomAD_OTH&gnomAD_SAS&AFR&AMR&EAS&EUR&SAS', 'TSL': '', 'APPRIS': '', 'gnomAD_ASJ_AF': '1', 'AMR_AF': '1', 'SYMBOL': 'ADH6', 'AFR_AF': '1', 'Feature': 'ENST00000237653', 'Codons': 'Tgt/Agt', 'MOTIF_NAME': '', 'DOMAINS': 'hmmpanther:PTHR11695:SF307&hmmpanther:PTHR11695&Gene3D:3.90.180.10', 'SIFT': 'tolerated(1)', 'VARIANT_CLASS': 'SNV', 'EA_AF': '0.9995', 'CDS_position': '4', 'CCDS': 'CCDS3647.1', 'Allele': 'T', 'PolyPhen': 'benign(0)', 'AA_AF': '1', 'gnomAD_EAS_AF': '1', 'IMPACT': 'MODERATE', 'HGVSp': '', 'ENSP': 'ENSP00000237653', 'MAX_AF': '1', 'LoF': '', 'INTRON': '', 'gnomAD_FIN_AF': '0.9999', 'Existing_variation': 'rs4699735', 'HGVSc': '', 'SOURCE': 'Ensembl', 'LoF_filter': '', 'gnomAD_AF': '0.9998', 'gnomAD_AMR_AF': '1', 'GENE_PHENO': '', 'gnomAD_OTH_AF': '1', 'LoF_flags': '', 'MOTIF_SCORE_CHANGE': '', 'UNIPARC': 'UPI00001AE69C', 'cDNA_position': '389', 'ALLELE_NUM': '1', 'EAS_AF': '1', 'Feature_type': 'Transcript', 'AF': '1', 'gnomAD_AFR_AF': '0.9999', 'HGNC_ID': '255', 'SAS_AF': '1', 'LoF_info': '', 'SWISSPROT': 'P28332', 'FLAGS': '', 'miRNA': '', 'Consequence': 'missense_variant', 'Protein_position': '2', 'Gene': 'ENSG00000172955', 'HIGH_INF_POS': '', 'STRAND': '-1', 'gnomAD_NFE_AF': '0.9995', 'EUR_AF': '1', 'DISTANCE': '', 'CLIN_SIG': '', 'PHENO': '', 'SYMBOL_SOURCE': 'HGNC', 'Amino_acids': 'C/S', 'TREMBL': '', 'gnomAD_SAS_AF': '1', 'REFSEQ_MATCH': '', 'PUBMED': '', 'BIOTYPE': 'protein_coding', 'EXON': '1/8', 'SOMATIC': '', 'MOTIF_POS': '', 'CANONICAL': ''} + print(utils.get_protein_hgvs(annotation)) assert False diff --git a/backend/run_pytest.sh b/backend/run_pytest.sh new file mode 100755 index 000000000..95bda78cb --- /dev/null +++ b/backend/run_pytest.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +BROWSER=modules/browser + +py.test . --cov==${BROWSER}/lookups --cov==${BROWSER}/utils diff --git a/backend/test_requirements.txt b/backend/test_requirements.txt new file mode 100644 index 000000000..c75c448bb --- /dev/null +++ b/backend/test_requirements.txt @@ -0,0 +1 @@ +pytest-cov From e6b3cd5d3cd7e90a3711c7486df45526d4924222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 28 Jan 2019 16:15:25 +0100 Subject: [PATCH 034/170] fix slight error --- backend/run_pytest.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/run_pytest.sh b/backend/run_pytest.sh index 95bda78cb..cce3d98bf 100755 --- a/backend/run_pytest.sh +++ b/backend/run_pytest.sh @@ -1,5 +1,5 @@ #!/bin/sh -BROWSER=modules/browser +BROWSER=modules/browser/ -py.test . --cov==${BROWSER}/lookups --cov==${BROWSER}/utils +py.test . --cov=${BROWSER} From 7d73335266c816ad21cbb074c16504c01932b7fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 29 Jan 2019 14:02:19 +0100 Subject: [PATCH 035/170] more tests added, complete coverage in lookups; also some fixes due to problems found during testing --- backend/modules/browser/browser_handlers.py | 2 +- backend/modules/browser/lookups.py | 85 ++++++------ backend/modules/browser/tests/.coveragerc | 5 + backend/modules/browser/tests/test_lookups.py | 122 ++++++++++++------ backend/run_pytest.sh | 3 +- 5 files changed, 124 insertions(+), 93 deletions(-) create mode 100644 backend/modules/browser/tests/.coveragerc diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 66319723c..4e1db8a6d 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -192,7 +192,7 @@ def get(self, dataset, transcript): ret['exons'] += [{'start':exon['start'], 'stop':exon['stop'], 'type':exon['feature_type']}] # Add gene information - gene = lookups.get_gene_by_dbid(dataset, transcript['gene']) + gene = lookups.get_gene_by_dbid(transcript['gene']) ret['gene']['id'] = gene['gene_id'] ret['gene']['name'] = gene['name'] ret['gene']['full_name'] = gene['full_name'] diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 639896429..abf691a87 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -135,11 +135,11 @@ def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=N ds_version (str): version of the dataset Returns: - list: coverage dicts for the region of interest: None if unable to retrieve + list: coverage dicts for the region of interest. None if failed """ dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: - return + return None if end_pos is None: end_pos = start_pos @@ -172,7 +172,7 @@ def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_vers # only return coverages that have coverage (if that makes any sense?) # return coverage_array if not coverage_array: - return + return None covered = [c for c in coverage_array if c['mean']] return covered @@ -191,7 +191,7 @@ def get_exons_in_transcript(dataset, transcript_id): ref_dbid = db.get_reference_dbid_dataset(dataset) if not ref_dbid: logging.error('get_exons_in_transcript({}, {}): unable to find dataset dbid'.format(dataset, transcript_id)) - return + return None try: transcript = (db.Transcript .select() @@ -201,7 +201,7 @@ def get_exons_in_transcript(dataset, transcript_id): .get()) except db.Transcript.DoesNotExist: logging.error('get_exons_in_transcript({}, {}): unable to retrieve transcript'.format(dataset, transcript_id)) - return + return None wanted_types = ('CDS', 'UTR', 'exon') return sorted(list(db.Feature.select().where((db.Feature.transcript == transcript) & (db.Feature.feature_type in wanted_types)).dicts()), @@ -229,24 +229,22 @@ def get_gene(dataset, gene_id): return {} -def get_gene_by_dbid(dataset, gene_dbid): +def get_gene_by_dbid(gene_dbid): """ Retrieve gene by gene database id Args: - dataset (str): short name of the dataset gene_dbid (str): the database id of the gene Returns: dict: values for the gene; empty if not found """ - ref_dbid = db.get_reference_dbid_dataset(dataset) - if not ref_dbid: - return {} try: return db.Gene.select().where(db.Gene.id == gene_dbid).dicts().get() except db.Gene.DoesNotExist: return {} + except ValueError: + return {} def get_gene_by_name(dataset, gene_name): @@ -292,16 +290,13 @@ def get_genes_in_region(dataset, chrom, start_pos, stop_pos): if not ref_dbid: return {} - try: - gene_query = db.Gene.select().where((db.Gene.reference_set == ref_dbid) & - ((((db.Gene.start >= start_pos) & - (db.Gene.start <= stop_pos)) | - ((db.Gene.stop >= start_pos) & - (db.Gene.stop <= stop_pos))) & - (db.Gene.chrom == chrom))).dicts() - return [gene for gene in gene_query] - except db.Gene.DoesNotExist: - logging.error('get_genes_in_region({}, {}, {}): no genes found'.format(chrom, start_pos, stop_pos)) + gene_query = db.Gene.select().where((db.Gene.reference_set == ref_dbid) & + ((((db.Gene.start >= start_pos) & + (db.Gene.start <= stop_pos)) | + ((db.Gene.stop >= start_pos) & + (db.Gene.stop <= stop_pos))) & + (db.Gene.chrom == chrom))).dicts() + return [gene for gene in gene_query] def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None): @@ -314,11 +309,11 @@ def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None ds_version (str): version of the dataset Returns: - dict: {filtered: nr_filtered, total: nr_total} + dict: {filtered: nr_filtered, total: nr_total}, None if error """ dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: - return + return None variants = get_variants_in_transcript(dataset, transcript_id) total = len(variants) @@ -344,7 +339,7 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: return - + try: return (db.Variant .select() @@ -356,8 +351,8 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): .dicts() .get()) except db.Variant.DoesNotExist: - logging.error(('get_raw_variant({}, {}, {}, {}, {}, {})'.format(dataset, pos, chrom, ref, alt, dataset_version.id) + - ': unable to retrieve variant')) + logging.error('get_raw_variant({}, {}, {}, {}, {}, {}): unable to retrieve variant' + .format(dataset, pos, chrom, ref, alt, dataset_version.id)) return {} @@ -404,11 +399,12 @@ def get_transcripts_in_gene(dataset, gene_id): try: gene = db.Gene.select().where((db.Gene.reference_set == ref_dbid) & (db.Gene.gene_id == gene_id)).dicts().get() - return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene['id']).dicts()] - except db.Gene.DoesNotExist or db.Transcript.DoesNotExist: - logging.error('get_transcripts_in_gene({}, {}): unable to retrieve gene or transcript'.format(dataset, gene_id)) + except db.Gene.DoesNotExist: + logging.error('get_transcripts_in_gene({}, {}): unable to retrieve gene'.format(dataset, gene_id)) return [] + return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene['id']).dicts()] + def get_transcripts_in_gene_by_dbid(gene_dbid): """ @@ -418,11 +414,7 @@ def get_transcripts_in_gene_by_dbid(gene_dbid): Returns: list: transcripts (dict) associated with the gene; empty if no hits """ - try: - return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene_dbid).dicts()] - except db.Gene.DoesNotExist or db.Transcript.DoesNotExist: - logging.error('get_transcripts_in_gene({}): no matching transcripts'.format(gene_dbid)) - return [] + return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene_dbid).dicts()] def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): @@ -441,18 +433,15 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): Returns: dict: values for the variant; None if not found """ - try: - variant = get_raw_variant(dataset, pos, chrom, ref, alt, ds_version) - if not variant or 'rsid' not in variant: - return variant - if variant['rsid'] == '.' or variant['rsid'] is None: - add_rsid_to_variant(dataset, variant) - else: - if not str(variant['rsid']).startswith('rs'): - variant['rsid'] = 'rs{}'.format(variant['rsid']) + variant = get_raw_variant(dataset, pos, chrom, ref, alt, ds_version) + if not variant or 'rsid' not in variant: + return variant + if variant['rsid'] == '.' or variant['rsid'] is None: + add_rsid_to_variant(dataset, variant) + else: + if not str(variant['rsid']).startswith('rs'): + variant['rsid'] = 'rs{}'.format(variant['rsid']) return variant - except db.Variant.DoesNotExist: - return def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): @@ -528,14 +517,12 @@ def get_variants_in_gene(dataset, gene_id): """ ref_dbid = db.get_reference_dbid_dataset(dataset) gene = get_gene(dataset, gene_id) - # temporary while waiting for db fix - logging.error('Found gene {}'.format(gene)) #### remove when db is fixed gene['stop'] = gene['start'] + 20000 #### variants = get_variants_in_region(dataset, gene['chrom'], gene['start'], gene['stop']) - + # variants = [variant for variant in db.Variant.select().where(db.Variant.genes.contains(transcript_id)).dicts()] # for variant in variants: @@ -579,7 +566,7 @@ def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): variant['hom_count'] = 0 variant['filter'] = variant['filter_string'] ##### - + utils.add_consequence_to_variants(variants) for variant in variants: if variant['rsid']: @@ -601,6 +588,8 @@ def get_variants_in_transcript(dataset, transcript_id): dict: values for the variant; empty if not found """ transcript = get_transcript(dataset, transcript_id) + if not transcript: + return {} # temporary while waiting for db fix variants = get_variants_in_region(dataset, transcript['chrom'], transcript['start'], transcript['stop']) # variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] diff --git a/backend/modules/browser/tests/.coveragerc b/backend/modules/browser/tests/.coveragerc new file mode 100644 index 000000000..fc2753f56 --- /dev/null +++ b/backend/modules/browser/tests/.coveragerc @@ -0,0 +1,5 @@ +[run] +omit = + # omit anything in a .local directory anywhere + */tests/* + */__init__.py \ No newline at end of file diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index 64e959ecb..ea559d809 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -104,29 +104,13 @@ def test_get_exons_in_transcript(caplog): """ Test get_exons_in_transcript() """ - result = lookups.get_exons_in_transcript('SweGen', 'ENST00000346817') - expected = [{'id': 326403, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202047893, 'stop': 202048032, 'strand': '+', 'feature_type': 'exon'}, - {'id': 326404, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202050495, 'stop': 202050848, 'strand': '+', 'feature_type': 'exon'}, - {'id': 326406, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202052430, 'stop': 202052523, 'strand': '+', 'feature_type': 'exon'}, - {'id': 326408, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202057708, 'stop': 202057843, 'strand': '+', 'feature_type': 'exon'}, - {'id': 326410, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202060566, 'stop': 202060672, 'strand': '+', 'feature_type': 'exon'}, - {'id': 326412, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202072799, 'stop': 202072907, 'strand': '+', 'feature_type': 'exon'}, - {'id': 326414, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202073794, 'stop': 202074286, 'strand': '+', 'feature_type': 'exon'}, - {'id': 326416, 'gene': 8600, 'transcript': 28186, 'chrom': '2', - 'start': 202082312, 'stop': 202084804, 'strand': '+', 'feature_type': 'exon'}] - assert result == expected + result = lookups.get_exons_in_transcript('SweGen', 'ENST00000215855') + assert len(result) == 14 # bad dataset - result = lookups.get_exons_in_transcript('NO_DATASET', 'ENST00000346817') + result = lookups.get_exons_in_transcript('NO_DATASET', 'ENST00000215855') assert not result - assert caplog.messages[0] == 'get_exons_in_transcript(NO_DATASET, ENST00000346817): unable to find dataset dbid' + assert caplog.messages[0] == 'get_exons_in_transcript(NO_DATASET, ENST00000215855): unable to find dataset dbid' # bad transcript result = lookups.get_exons_in_transcript('SweGen', 'BAD_TRANSCRIPT') @@ -142,23 +126,15 @@ def test_get_gene(): expected = {'id': 1, 'reference_set': 1, 'gene_id': 'ENSG00000223972', - 'gene_name': 'DDX11L1', + 'name': 'DDX11L1', 'full_name': 'DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1', 'canonical_transcript': 'ENST00000456328', 'chrom': '1', - 'start_pos': 11870, + 'start': 11870, 'strand': '+'} result = lookups.get_gene('SweGen', 'ENSG00000223972') - print(result) - assert result['id'] == expected['id'] - assert result['reference_set'] == expected['reference_set'] - assert result['gene_id'] == expected['gene_id'] - assert result['name'] == expected['gene_name'] - assert result['full_name'] == expected['full_name'] - assert result['canonical_transcript'] == expected['canonical_transcript'] - assert result['chrom'] == expected['chrom'] - assert result['start'] == expected['start_pos'] - assert result['strand'] == expected['strand'] + for val in expected: + assert result[val] == expected[val] # non-existing gene result = lookups.get_gene('SweGen', 'NOT_A_GENE') @@ -169,6 +145,30 @@ def test_get_gene(): assert not result +def test_get_gene_by_dbid(): + """ + Test get_gene_by_dbid() + """ + # normal entry + expected = {'id': 53626, + 'reference_set': 1, + 'gene_id': 'ENSG00000226444', + 'name': 'ACTR3BP6', + 'full_name': 'ACTR3B pseudogene 6', + 'canonical_transcript': 'ENST00000421366', + 'chrom': '22', + 'start': 16967411, + 'strand': '+'} + result = lookups.get_gene_by_dbid(53626) + for val in expected: + assert result[val] == expected[val] + + # non-existing genes + result = lookups.get_gene_by_dbid('NOT_A_GENE') + assert not result + result = lookups.get_gene_by_dbid(-1) + assert not result + def test_get_gene_by_name(caplog): """ Test get_gene_by_name() @@ -212,18 +212,34 @@ def test_get_genes_in_region(): """ Test get_genes_in_region() """ - res = lookups.get_genes_in_region('SweGen', '4', 99080000, 99210000) # stop_pos missing in db, so needs to be updated when available - # exp_names = - assert False + # normal + res = lookups.get_genes_in_region('SweGen', '22', 25595800, 25615800) + expected_names = set(['ENSG00000100053', 'ENSG00000236641', 'ENSG00000244752']) + names = set(gene['gene_id'] for gene in res) + assert names == expected_names + # bad dataset + res = lookups.get_genes_in_region('bad_dataset', '22', 25595800, 25615800) + # nothing found + res = lookups.get_genes_in_region('SweGen', '22', 25595800, 25595801) + assert not res def test_get_number_of_variants_in_transcript(): """ Test get_number_of_variants_in_transcripts() """ - assert False - lookups.get_number_of_variants_in_transcripts() + # normal + res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENST00000424770') + assert res == {'filtered': 1, 'total': 23} + + # bad transcript + res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENSTASDSADA') + assert res == {'filtered': 0, 'total': 0} + + # bad dataset + res = lookups.get_number_of_variants_in_transcript('bad_dataset', 'ENST00000424770') + assert res is None def test_get_transcript(): @@ -281,6 +297,9 @@ def test_get_transcripts_in_gene(): 'chrom': '1', 'start': 228320, 'stop': 228776, 'strand': '-'}] assert res == expected + assert not lookups.get_transcripts_in_gene('bad_dataset', 'ENSG00000241670') + assert not lookups.get_transcripts_in_gene('SweGen', 'ENSGASDFG') + def test_get_raw_variant(): """ @@ -290,8 +309,19 @@ def test_get_raw_variant(): assert result['genes'] == ['ENSG00000169174'] assert result['transcripts'] == ['ENST00000302118'] assert not lookups.get_raw_variant('SweGen', 55500281, '1', 'A', 'T') + assert not lookups.get_raw_variant('bad_dataset', 55500283, '1', 'A', 'T') +def test_get_transcripts_in_gene_by_dbid(): + """ + Test get_transcripts_in_gene_by_dbid() + """ + res = lookups.get_transcripts_in_gene_by_dbid(53626) + assert len(res) == 2 + res = lookups.get_transcripts_in_gene_by_dbid(-1) + assert not res + + def test_get_variant(): """ Test get_variant() @@ -327,9 +357,6 @@ def test_get_variants_by_rsid(caplog): assert result[0]['pos'] == 16080482 assert result[0]['genes'] == ['ENSG00000229286', 'ENSG00000235265'] assert result[0]['transcripts'] == ['ENST00000448070','ENST00000413156'] - print(type(result[0]['vep_annotations'])) - print(result[0]['vep_annotations']) - assert False # by position result = lookups.get_variants_by_rsid('SweGen', 'rs373706802', check_position=True) @@ -352,6 +379,16 @@ def test_get_variants_by_rsid(caplog): assert not lookups.get_variants_by_rsid('SweGen', 'rs1') +def test_get_variants_in_gene(): + """ + Test get_variants_in_gene() + """ + res = lookups.get_variants_in_gene('SweGen', 'ENSG00000198062') + assert len(res) == 1185 + assert not lookups.get_variants_in_gene('bad_dataset', 'ENSG00000198062') + assert not lookups.get_variants_in_gene('bad_dataset', 'ENSGASDFG') + + def test_get_variants_in_region(): """ Test get_variants_in_region() @@ -374,6 +411,5 @@ def test_get_variants_in_transcript(): """ Test get_variants_in_transcript() """ - # res = lookups.get_variants_in_transcript('ENST00000302118') - # assert len(res) == 426 - assert False + res = lookups.get_variants_in_transcript('SweGen', 'ENST00000452800') + assert len(res) == 1414 diff --git a/backend/run_pytest.sh b/backend/run_pytest.sh index cce3d98bf..27152697b 100755 --- a/backend/run_pytest.sh +++ b/backend/run_pytest.sh @@ -2,4 +2,5 @@ BROWSER=modules/browser/ -py.test . --cov=${BROWSER} +py.test . --cov=${BROWSER} --cov-config=modules/browser/tests/.coveragerc + From e4841f32dd98d929314e92c2c6e947e891532835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 29 Jan 2019 15:55:25 +0100 Subject: [PATCH 036/170] More tests for utils, including some fixing in utils --- backend/modules/browser/tests/test_utils.py | 23 ++++++++++++++++++--- backend/modules/browser/utils.py | 20 +++++++++++++++--- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/backend/modules/browser/tests/test_utils.py b/backend/modules/browser/tests/test_utils.py index df560c615..85001c26d 100644 --- a/backend/modules/browser/tests/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -48,6 +48,14 @@ def test_annotation_severity(): assert res == -26.9 +def test_data_structures(): + """ + Test the constants + """ + assert len(utils.CSQ_ORDER) == len(set(utils.CSQ_ORDER)) # No duplicates + assert all(csq == utils.REV_CSQ_ORDER_DICT[utils.CSQ_ORDER_DICT[csq]] for csq in utils.CSQ_ORDER) + + def test_get_flags_from_variant(): """ Test get_flags_from_variant() @@ -79,9 +87,18 @@ def test_get_protein_hgvs(): """ Test get_protein_hgvs() """ - annotation = {'MAX_AF_POPS': 'AA&gnomAD_AMR&gnomAD_ASJ&gnomAD_EAS&gnomAD_OTH&gnomAD_SAS&AFR&AMR&EAS&EUR&SAS', 'TSL': '', 'APPRIS': '', 'gnomAD_ASJ_AF': '1', 'AMR_AF': '1', 'SYMBOL': 'ADH6', 'AFR_AF': '1', 'Feature': 'ENST00000237653', 'Codons': 'Tgt/Agt', 'MOTIF_NAME': '', 'DOMAINS': 'hmmpanther:PTHR11695:SF307&hmmpanther:PTHR11695&Gene3D:3.90.180.10', 'SIFT': 'tolerated(1)', 'VARIANT_CLASS': 'SNV', 'EA_AF': '0.9995', 'CDS_position': '4', 'CCDS': 'CCDS3647.1', 'Allele': 'T', 'PolyPhen': 'benign(0)', 'AA_AF': '1', 'gnomAD_EAS_AF': '1', 'IMPACT': 'MODERATE', 'HGVSp': '', 'ENSP': 'ENSP00000237653', 'MAX_AF': '1', 'LoF': '', 'INTRON': '', 'gnomAD_FIN_AF': '0.9999', 'Existing_variation': 'rs4699735', 'HGVSc': '', 'SOURCE': 'Ensembl', 'LoF_filter': '', 'gnomAD_AF': '0.9998', 'gnomAD_AMR_AF': '1', 'GENE_PHENO': '', 'gnomAD_OTH_AF': '1', 'LoF_flags': '', 'MOTIF_SCORE_CHANGE': '', 'UNIPARC': 'UPI00001AE69C', 'cDNA_position': '389', 'ALLELE_NUM': '1', 'EAS_AF': '1', 'Feature_type': 'Transcript', 'AF': '1', 'gnomAD_AFR_AF': '0.9999', 'HGNC_ID': '255', 'SAS_AF': '1', 'LoF_info': '', 'SWISSPROT': 'P28332', 'FLAGS': '', 'miRNA': '', 'Consequence': 'missense_variant', 'Protein_position': '2', 'Gene': 'ENSG00000172955', 'HIGH_INF_POS': '', 'STRAND': '-1', 'gnomAD_NFE_AF': '0.9995', 'EUR_AF': '1', 'DISTANCE': '', 'CLIN_SIG': '', 'PHENO': '', 'SYMBOL_SOURCE': 'HGNC', 'Amino_acids': 'C/S', 'TREMBL': '', 'gnomAD_SAS_AF': '1', 'REFSEQ_MATCH': '', 'PUBMED': '', 'BIOTYPE': 'protein_coding', 'EXON': '1/8', 'SOMATIC': '', 'MOTIF_POS': '', 'CANONICAL': ''} - print(utils.get_protein_hgvs(annotation)) - assert False + annotation = {'HGVSc': 'ENST00000343518.6:c.35C>T', + 'HGVSp': 'ENSP00000340610.6:p.Ser12Phe'} + result = utils.get_protein_hgvs(annotation) + assert result == 'p.Ser12Phe' + annotation = {'HGVSc': 'ENST00000343518.6:c.27G>A', + 'HGVSp': 'ENST00000343518.6:c.27G>A(p.%3D)', + 'Protein_position': '9', + 'Amino_acids': 'P'} + result = utils.get_protein_hgvs(annotation) + assert result == 'p.Pro9Pro' + annotation['Amino_acids'] = 'Z' + assert not utils.get_protein_hgvs(annotation) def test_get_transcript_hgvs(): diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index 833d14023..8396ff643 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -48,11 +48,9 @@ "feature_truncation", "intergenic_variant", ""] -assert len(CSQ_ORDER) == len(set(CSQ_ORDER)) # No dupplicates CSQ_ORDER_DICT = {csq:i for i,csq in enumerate(CSQ_ORDER)} REV_CSQ_ORDER_DICT = dict(enumerate(CSQ_ORDER)) -assert all(csq == REV_CSQ_ORDER_DICT[CSQ_ORDER_DICT[csq]] for csq in CSQ_ORDER) METRICS = [ 'BaseQRankSum', @@ -165,6 +163,15 @@ def get_flags_from_variant(variant): def get_proper_hgvs(csq): + """ + Get HGVS for change, either at transcript or protein level + + Args: + annotation (dict): VEP annotation with HGVS information + + Returns: + str: variant effect at aa level in HGVS format (p.), None if parsing fails + """ # Needs major_consequence if csq['major_consequence'] in ('splice_donor_variant', 'splice_acceptor_variant', 'splice_region_variant'): return get_transcript_hgvs(csq) @@ -174,7 +181,13 @@ def get_proper_hgvs(csq): def get_protein_hgvs(annotation): """ - Takes consequence dictionary, returns proper variant formatting for synonymous variants + Aa changes in HGVS format + + Args: + annotation (dict): VEP annotation with HGVS information + + Returns: + str: variant effect at aa level in HGVS format (p.), None if parsing fails """ if '%3D' in annotation['HGVSp']: # "%3D" is "=" try: @@ -182,6 +195,7 @@ def get_protein_hgvs(annotation): return "p." + amino_acids + annotation['Protein_position'] + amino_acids except KeyError: logging.error("Could not fetch protein hgvs - unknown amino acid") + return None return annotation['HGVSp'].split(':')[-1] From 309bcdbd8a691d96cedd860243c3e5a7bab5f697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 30 Jan 2019 10:46:46 +0100 Subject: [PATCH 037/170] incorrect indentation of a return statement --- backend/modules/browser/lookups.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index abf691a87..f03f651fb 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -13,7 +13,6 @@ def add_rsid_to_variant(dataset, variant): """ Add rsid to a variant in the database based on position - Note that this may be inaccurate Args: dataset (str): short name of the dataset @@ -441,7 +440,7 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): else: if not str(variant['rsid']).startswith('rs'): variant['rsid'] = 'rs{}'.format(variant['rsid']) - return variant + return variant def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): From 2325da3431169dc41888e1d75fbead1fe2d9c4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 30 Jan 2019 10:47:11 +0100 Subject: [PATCH 038/170] 100% coverage, all passing in utils --- backend/modules/browser/tests/test_utils.py | 34 ++++++- backend/modules/browser/utils.py | 101 ++++++++++++++------ 2 files changed, 99 insertions(+), 36 deletions(-) diff --git a/backend/modules/browser/tests/test_utils.py b/backend/modules/browser/tests/test_utils.py index 85001c26d..fe5be6dbc 100644 --- a/backend/modules/browser/tests/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -80,8 +80,14 @@ def test_get_proper_hgvs(): """ Test get_proper_hgvs() """ - assert False - + annotation = {'HGVSc': 'ENST00000343518.6:c.35C>T', + 'HGVSp': 'ENSP00000340610.6:p.Ser12Phe', + 'major_consequence': 'splice_donor_variant'} + assert utils.get_proper_hgvs(annotation) == 'c.35C>T' + annotation['major_consequence'] = 'coding_sequence_variant' + assert utils.get_proper_hgvs(annotation) == 'p.Ser12Phe' + assert not utils.get_proper_hgvs(dict()) + def test_get_protein_hgvs(): """ @@ -99,27 +105,45 @@ def test_get_protein_hgvs(): assert result == 'p.Pro9Pro' annotation['Amino_acids'] = 'Z' assert not utils.get_protein_hgvs(annotation) + assert not utils.get_protein_hgvs(dict()) def test_get_transcript_hgvs(): """ Test get_transcript_hgvs() + """ - assert False + annotation = {'HGVSc': 'ENST00000343518.6:c.35C>T', + 'HGVSp': 'ENSP00000340610.6:p.Ser12Phe'} + assert utils.get_transcript_hgvs(annotation) == 'c.35C>T' + assert not utils.get_transcript_hgvs(dict()) def test_order_vep_by_csq(): """ Test order_vep_by_csq() """ - assert False + annotation = [{'Consequence': 'frameshift_variant'}, + {'Consequence': 'transcript_ablation'}, + {'Consequence': 'mature_miRNA_variant'}] + expected = [{'Consequence': 'transcript_ablation', + 'major_consequence': 'transcript_ablation'}, + {'Consequence': 'frameshift_variant', + 'major_consequence': 'frameshift_variant'}, + {'Consequence': 'mature_miRNA_variant', + 'major_consequence': 'mature_miRNA_variant'}] + result = utils.order_vep_by_csq(annotation) + assert result == expected + assert utils.order_vep_by_csq([dict()]) == [{'major_consequence': ''}] def test_remove_extraneous_vep_annotations(): """ Test remove_extraneous_vep_annotations() """ - assert False + annotation = [{'Consequence': 'frameshift_variant'}, + {'Consequence': 'feature_elongation&TF_binding_site_variant'}] + assert utils.remove_extraneous_vep_annotations(annotation) == [{'Consequence': 'frameshift_variant'}] def test_worst_csq_from_csq(): diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index 8396ff643..b2b896d09 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -122,7 +122,7 @@ def add_consequence_to_variant(variant): def annotation_severity(annotation): """ - Evaluate severity of the consequences; "bigger is more important" + Evaluate severity of the consequences; "bigger is more important". Args: annotation (dict): vep_annotation from a variant @@ -139,9 +139,9 @@ def annotation_severity(annotation): def get_flags_from_variant(variant): """ Get flags from variant. - checks for: - - MNP (identical length of reference and variant) - - LoF (loss of function) + Checks for: + - MNP (identical length of reference and variant) + - LoF (loss of function) Args: variant (dict): a variant @@ -162,9 +162,9 @@ def get_flags_from_variant(variant): return flags -def get_proper_hgvs(csq): +def get_proper_hgvs(annotation): """ - Get HGVS for change, either at transcript or protein level + Get HGVS for change, either at transcript or protein level. Args: annotation (dict): VEP annotation with HGVS information @@ -173,15 +173,19 @@ def get_proper_hgvs(csq): str: variant effect at aa level in HGVS format (p.), None if parsing fails """ # Needs major_consequence - if csq['major_consequence'] in ('splice_donor_variant', 'splice_acceptor_variant', 'splice_region_variant'): - return get_transcript_hgvs(csq) - - return get_protein_hgvs(csq) + try: + if annotation['major_consequence'] in ('splice_donor_variant', + 'splice_acceptor_variant', + 'splice_region_variant'): + return get_transcript_hgvs(annotation) + return get_protein_hgvs(annotation) + except KeyError: + return None def get_protein_hgvs(annotation): """ - Aa changes in HGVS format + Aa changes in HGVS format. Args: annotation (dict): VEP annotation with HGVS information @@ -189,39 +193,73 @@ def get_protein_hgvs(annotation): Returns: str: variant effect at aa level in HGVS format (p.), None if parsing fails """ - if '%3D' in annotation['HGVSp']: # "%3D" is "=" - try: - amino_acids = ''.join([PROTEIN_LETTERS_1TO3[x] for x in annotation['Amino_acids']]) - return "p." + amino_acids + annotation['Protein_position'] + amino_acids - except KeyError: - logging.error("Could not fetch protein hgvs - unknown amino acid") - return None - return annotation['HGVSp'].split(':')[-1] + try: + if '%3D' in annotation['HGVSp']: # "%3D" is "=" + amino_acids = ''.join([PROTEIN_LETTERS_1TO3[aa] for aa in annotation['Amino_acids']]) + return "p." + amino_acids + annotation['Protein_position'] + amino_acids + return annotation['HGVSp'].split(':')[-1] + except KeyError: + logging.error("Could not fetch protein hgvs") + return None -def get_transcript_hgvs(csq): - return csq['HGVSc'].split(':')[-1] +def get_transcript_hgvs(annotation): + """ + Nucleotide change in HGVS format. + Args: + annotation (dict): VEP annotation with HGVS information -def order_vep_by_csq(annotation_list): + Returns: + str: variant effect at nucleotide level in HGVS format (c.), None if parsing fails + """ + try: + return annotation['HGVSc'].split(':')[-1] + except KeyError: + return None + + +def order_vep_by_csq(annotation_list: list): """ - Adds "major_consequence" to each annotation. - Returns them ordered from most deleterious to least. + Adds "major_consequence" to each annotation, orders by severity. + + Args: + annotation_list (list): VEP annotations (as dict) + + Returns: + list: annotations ordered by major consequence severity """ for ann in annotation_list: - ann['major_consequence'] = worst_csq_from_csq(ann['Consequence']) + try: + ann['major_consequence'] = worst_csq_from_csq(ann['Consequence']) + except KeyError: + ann['major_consequence'] = '' return sorted(annotation_list, key=(lambda ann:CSQ_ORDER_DICT[ann['major_consequence']])) -def remove_extraneous_vep_annotations(annotation_list): - return [ann for ann in annotation_list if worst_csq_index(ann['Consequence'].split('&')) <= CSQ_ORDER_DICT['intron_variant']] +def remove_extraneous_vep_annotations(annotation_list: list): + """ + Remove annotations with low-impact consequences (less than intron variant) + + Args: + annotation_list (list): VEP annotations (as dict) + + Returns: + list: VEP annotations with higher impact + """ + return [ann for ann in annotation_list + if worst_csq_index(ann['Consequence'].split('&')) <= CSQ_ORDER_DICT['intron_variant']] def worst_csq_from_list(csq_list): """ - Input list of consequences (e.g. ['frameshift_variant', 'missense_variant']) - Return the worst consequence (In this case, 'frameshift_variant') - Works well with worst_csq_from_list('non_coding_exon_variant&nc_transcript_variant'.split('&')) + Choose the worst consequence + + Args: + csq_list (list): list of consequences + + Returns: + str: the worst consequence """ return REV_CSQ_ORDER_DICT[worst_csq_index(csq_list)] @@ -256,12 +294,13 @@ def worst_csq_index(csq_list): def worst_csq_with_vep(annotation_list): """ Choose the vep annotation with the most severe consequence + Adds a"major_consequence" field for that annotation Args: annotation_list (list): VEP annotations Returns: - dict: the annotation with the most severe consequence; also adds "major_consequence" for that annotation + dict: the annotation with the most severe consequence """ if not annotation_list: return None From b14ba703a3803e8bb4d2e52004de13312d0b18b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 31 Jan 2019 08:45:30 +0100 Subject: [PATCH 039/170] pylint fixes in lookups --- backend/modules/browser/lookups.py | 23 +++++++++---------- backend/modules/browser/tests/test_lookups.py | 13 +++++------ 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index f03f651fb..f162ed0a8 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -1,4 +1,3 @@ - import json # remove when db is fixed import logging import re @@ -333,11 +332,11 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): ds_version (str): dataset version Returns: - dict: values for the variant; empty if not found + dict: values for the variant; None if not found """ dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: - return + return None try: return (db.Variant @@ -352,7 +351,7 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): except db.Variant.DoesNotExist: logging.error('get_raw_variant({}, {}, {}, {}, {}, {}): unable to retrieve variant' .format(dataset, pos, chrom, ref, alt, dataset_version.id)) - return {} + return None def get_transcript(dataset, transcript_id): @@ -455,21 +454,21 @@ def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): ds_version (str): version of the dataset Returns: - list: variant dicts; no hits + list: variants as dict; no hits returns None """ dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: - return + return None if not rsid.startswith('rs'): logging.error('get_variants_by_rsid({}, {}): rsid not starting with rs'.format(dataset, rsid)) - return + return None try: rsid = int(rsid.lstrip('rs')) except ValueError: logging.error('get_variants_by_rsid({}, {}): not an integer after rs'.format(dataset, rsid)) - return + return None if check_position: refset = (db.Dataset .select(db.ReferenceSet) @@ -544,11 +543,11 @@ def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): ds_version (str): version of the dataset Returns: - list: variant dicts + list: variant dicts, None if no hits """ dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: - return + return None query = (db.Variant .select() .where((db.Variant.pos >= start_pos) & @@ -584,11 +583,11 @@ def get_variants_in_transcript(dataset, transcript_id): transcript_id (str): id of the transcript (ENST) Returns: - dict: values for the variant; empty if not found + dict: values for the variant; None if not found """ transcript = get_transcript(dataset, transcript_id) if not transcript: - return {} + return None # temporary while waiting for db fix variants = get_variants_in_region(dataset, transcript['chrom'], transcript['start'], transcript['stop']) # variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index ea559d809..c4d34c768 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -17,7 +17,7 @@ def test_add_rsid_to_variant(): assert variant['rsid'] == rsid # "non-existing" del variant['rsid'] - lookups.add_rsid_to_variant(variant) + lookups.add_rsid_to_variant('SweGen', variant) assert variant['rsid'] == rsid @@ -45,7 +45,7 @@ def test_get_awesomebar_result(): assert result == ('not_found', 'DOES_NOT_EXIST') -def test_get_coverage_for_bases(caplog): +def test_get_coverage_for_bases(): """ Test get_coverage_for_bases() """ @@ -206,7 +206,7 @@ def test_get_gene_by_name(caplog): # name in other_names result = lookups.get_gene_by_name('SweGen', 'NIR') assert result['gene_id'] == 'ENSG00000188976' - + def test_get_genes_in_region(): """ @@ -299,7 +299,7 @@ def test_get_transcripts_in_gene(): assert not lookups.get_transcripts_in_gene('bad_dataset', 'ENSG00000241670') assert not lookups.get_transcripts_in_gene('SweGen', 'ENSGASDFG') - + def test_get_raw_variant(): """ @@ -321,7 +321,7 @@ def test_get_transcripts_in_gene_by_dbid(): res = lookups.get_transcripts_in_gene_by_dbid(-1) assert not res - + def test_get_variant(): """ Test get_variant() @@ -342,10 +342,9 @@ def test_get_variant(): 'ENST00000491666', 'ENST00000472607', 'ENST00000475776'] assert result['rsid'] == 'rs75050571' - # TODO: add test for entry with missing rsid # incorrect position - assert not lookups.get_variant(-1, '1', 'A', 'T') + assert not lookups.get_variant('SweGen', -1, '1', 'A', 'T') def test_get_variants_by_rsid(caplog): From cc2fe67a774346d245b7e7b60167d93eb4da8bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 31 Jan 2019 08:50:03 +0100 Subject: [PATCH 040/170] pylint fixes in utils --- backend/modules/browser/tests/test_utils.py | 10 ++++----- backend/modules/browser/utils.py | 25 ++++++++++----------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/backend/modules/browser/tests/test_utils.py b/backend/modules/browser/tests/test_utils.py index fe5be6dbc..bb74f5693 100644 --- a/backend/modules/browser/tests/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -31,12 +31,12 @@ def test_add_consequence_to_variant(): variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed utils.add_consequence_to_variant(variant) assert variant['major_consequence'] == 'intron_variant' - + variant2 = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') variant2['vep_annotations'] = json.loads(variant2['vep_annotations']) # remove when db is fixed utils.add_consequence_to_variant(variant2) assert variant2['major_consequence'] == 'upstream_gene_variant' - + def test_annotation_severity(): """ @@ -46,7 +46,7 @@ def test_annotation_severity(): variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed res = utils.annotation_severity(variant['vep_annotations'][0]) assert res == -26.9 - + def test_data_structures(): """ @@ -55,7 +55,7 @@ def test_data_structures(): assert len(utils.CSQ_ORDER) == len(set(utils.CSQ_ORDER)) # No duplicates assert all(csq == utils.REV_CSQ_ORDER_DICT[utils.CSQ_ORDER_DICT[csq]] for csq in utils.CSQ_ORDER) - + def test_get_flags_from_variant(): """ Test get_flags_from_variant() @@ -87,7 +87,7 @@ def test_get_proper_hgvs(): annotation['major_consequence'] = 'coding_sequence_variant' assert utils.get_proper_hgvs(annotation) == 'p.Ser12Phe' assert not utils.get_proper_hgvs(dict()) - + def test_get_protein_hgvs(): """ diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index b2b896d09..c045f6536 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -1,5 +1,4 @@ import logging -from operator import itemgetter AF_BUCKETS = [0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1] @@ -139,7 +138,7 @@ def annotation_severity(annotation): def get_flags_from_variant(variant): """ Get flags from variant. - Checks for: + Checks for: - MNP (identical length of reference and variant) - LoF (loss of function) @@ -186,7 +185,7 @@ def get_proper_hgvs(annotation): def get_protein_hgvs(annotation): """ Aa changes in HGVS format. - + Args: annotation (dict): VEP annotation with HGVS information @@ -195,8 +194,8 @@ def get_protein_hgvs(annotation): """ try: if '%3D' in annotation['HGVSp']: # "%3D" is "=" - amino_acids = ''.join([PROTEIN_LETTERS_1TO3[aa] for aa in annotation['Amino_acids']]) - return "p." + amino_acids + annotation['Protein_position'] + amino_acids + amino_acids = ''.join([PROTEIN_LETTERS_1TO3[aa] for aa in annotation['Amino_acids']]) + return "p." + amino_acids + annotation['Protein_position'] + amino_acids return annotation['HGVSp'].split(':')[-1] except KeyError: logging.error("Could not fetch protein hgvs") @@ -212,7 +211,7 @@ def get_transcript_hgvs(annotation): Returns: str: variant effect at nucleotide level in HGVS format (c.), None if parsing fails - """ + """ try: return annotation['HGVSc'].split(':')[-1] except KeyError: @@ -222,7 +221,7 @@ def get_transcript_hgvs(annotation): def order_vep_by_csq(annotation_list: list): """ Adds "major_consequence" to each annotation, orders by severity. - + Args: annotation_list (list): VEP annotations (as dict) @@ -240,7 +239,7 @@ def order_vep_by_csq(annotation_list: list): def remove_extraneous_vep_annotations(annotation_list: list): """ Remove annotations with low-impact consequences (less than intron variant) - + Args: annotation_list (list): VEP annotations (as dict) @@ -254,7 +253,7 @@ def remove_extraneous_vep_annotations(annotation_list: list): def worst_csq_from_list(csq_list): """ Choose the worst consequence - + Args: csq_list (list): list of consequences @@ -266,11 +265,11 @@ def worst_csq_from_list(csq_list): def worst_csq_from_csq(csq): """ - Find worst consequence in a possibly &-filled consequence string + Find worst consequence in a possibly &-filled consequence string Args: csq (str): string of consequences, seperated with & (if multiple) - + Returns: str: the worst consequence """ @@ -297,8 +296,8 @@ def worst_csq_with_vep(annotation_list): Adds a"major_consequence" field for that annotation Args: - annotation_list (list): VEP annotations - + annotation_list (list): VEP annotations + Returns: dict: the annotation with the most severe consequence """ From baaf98274894378c379acacb79967bda69ecdd94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 31 Jan 2019 09:23:46 +0100 Subject: [PATCH 041/170] type hints added --- backend/modules/browser/lookups.py | 44 ++++++++++++++++-------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index f162ed0a8..6fd698d65 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -9,7 +9,7 @@ SEARCH_LIMIT = 10000 -def add_rsid_to_variant(dataset, variant): +def add_rsid_to_variant(dataset:str, variant:str): """ Add rsid to a variant in the database based on position @@ -41,7 +41,7 @@ def add_rsid_to_variant(dataset, variant): REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') -def get_awesomebar_result(dataset, query, ds_version=None): +def get_awesomebar_result(dataset:str, query:str, ds_version:str=None): """ Parse the search input @@ -121,7 +121,7 @@ def get_awesomebar_result(dataset, query, ds_version=None): return 'not_found', query -def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=None): +def get_coverage_for_bases(dataset:str, chrom:str, start_pos:int, end_pos:int=None, ds_version:str=None): """ Get the coverage for the list of bases given by start_pos->end_pos, inclusive @@ -150,7 +150,7 @@ def get_coverage_for_bases(dataset, chrom, start_pos, end_pos=None, ds_version=N .dicts())] -def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_version=None): +def get_coverage_for_transcript(dataset:str, chrom:str, start_pos:int, end_pos:int=None, ds_version:str=None): """ Get the coverage for the list of bases given by start_pos->end_pos, inclusive @@ -175,7 +175,7 @@ def get_coverage_for_transcript(dataset, chrom, start_pos, end_pos=None, ds_vers return covered -def get_exons_in_transcript(dataset, transcript_id): +def get_exons_in_transcript(dataset:str, transcript_id:str): """ Retrieve exons associated with the given transcript id @@ -206,7 +206,7 @@ def get_exons_in_transcript(dataset, transcript_id): key=lambda k: k['start']) -def get_gene(dataset, gene_id): +def get_gene(dataset:str, gene_id:str): """ Retrieve gene by gene id @@ -227,7 +227,7 @@ def get_gene(dataset, gene_id): return {} -def get_gene_by_dbid(gene_dbid): +def get_gene_by_dbid(gene_dbid:str): """ Retrieve gene by gene database id @@ -245,7 +245,7 @@ def get_gene_by_dbid(gene_dbid): return {} -def get_gene_by_name(dataset, gene_name): +def get_gene_by_name(dataset:str, gene_name:str): """ Retrieve gene by gene_name. First checks gene_name, then other_names. @@ -271,7 +271,7 @@ def get_gene_by_name(dataset, gene_name): return {} -def get_genes_in_region(dataset, chrom, start_pos, stop_pos): +def get_genes_in_region(dataset:str, chrom:str, start_pos:int, stop_pos:int): """ Retrieve genes located within a region @@ -297,7 +297,7 @@ def get_genes_in_region(dataset, chrom, start_pos, stop_pos): return [gene for gene in gene_query] -def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None): +def get_number_of_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=None): """ Get the total and filtered amount of variants in a transcript @@ -314,12 +314,14 @@ def get_number_of_variants_in_transcript(dataset, transcript_id, ds_version=None return None variants = get_variants_in_transcript(dataset, transcript_id) + if not variants: + return None total = len(variants) filtered = len(tuple(variant for variant in variants if variant['filter_string'] == 'PASS')) return {'filtered': filtered, 'total': total} -def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): +def get_raw_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_version:str=None): """ Retrieve variant by position and change @@ -354,7 +356,7 @@ def get_raw_variant(dataset, pos, chrom, ref, alt, ds_version=None): return None -def get_transcript(dataset, transcript_id): +def get_transcript(dataset:str, transcript_id:str): """ Retrieve transcript by transcript id Also includes exons as ['exons'] @@ -381,7 +383,7 @@ def get_transcript(dataset, transcript_id): return {} -def get_transcripts_in_gene(dataset, gene_id): +def get_transcripts_in_gene(dataset:str, gene_id:str): """ Get the transcripts associated with a gene Args: @@ -404,18 +406,18 @@ def get_transcripts_in_gene(dataset, gene_id): return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene['id']).dicts()] -def get_transcripts_in_gene_by_dbid(gene_dbid): +def get_transcripts_in_gene_by_dbid(gene_dbid:int): """ Get the transcripts associated with a gene Args: - gene_dbid (str): database id of the gene + gene_dbid (int): database id of the gene Returns: list: transcripts (dict) associated with the gene; empty if no hits """ return [transcript for transcript in db.Transcript.select().where(db.Transcript.gene == gene_dbid).dicts()] -def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): +def get_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_version:str=None): """ Retrieve variant by position and change Retrieves rsid from db (if available) if not present in variant @@ -442,7 +444,7 @@ def get_variant(dataset, pos, chrom, ref, alt, ds_version=None): return variant -def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): +def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_version:str=None): """ Retrieve variants by their associated rsid May also look up rsid and search for variants at the position @@ -502,12 +504,12 @@ def get_variants_by_rsid(dataset, rsid, check_position=False, ds_version=None): return variants -def get_variants_in_gene(dataset, gene_id): +def get_variants_in_gene(dataset:str, gene_id:str): """ Retrieve variants present inside a gene Args: - dataset: short name of the dataset + dataset (str): short name of the dataset gene_id (str): id of the gene Returns: @@ -531,7 +533,7 @@ def get_variants_in_gene(dataset, gene_id): return variants -def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): +def get_variants_in_region(dataset:str, chrom:str, start_pos:int, end_pos:int, ds_version:str=None): """ Variants that overlap a region @@ -574,7 +576,7 @@ def get_variants_in_region(dataset, chrom, start_pos, end_pos, ds_version=None): return variants -def get_variants_in_transcript(dataset, transcript_id): +def get_variants_in_transcript(dataset:str, transcript_id:str): """ Retrieve variants inside a transcript From 3c0f9042f65b24e8ee9fb8d57df1717714338031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 1 Feb 2019 13:50:16 +0100 Subject: [PATCH 042/170] migrate the download function --- backend/modules/browser/browser_handlers.py | 24 ++++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 4e1db8a6d..b518e5d88 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -1,7 +1,6 @@ import json # remove when db is fixed import logging -import db import handlers from . import lookups @@ -17,7 +16,7 @@ class Autocomplete(handlers.UnsafeHandler): def get(self, dataset, query): ret = {} - results = pgsql.get_autocomplete(dataset, query) + results = pgsql.get_autocomplete(query) ret = {'values': sorted(list(set(results)))[:20]} self.finish( ret ) @@ -42,12 +41,21 @@ def get(self, dataset, datatype, item): class Download(handlers.UnsafeHandler): - def get(self, dataset, datatype, item): + def get(self, dataset: str, datatype, item, ds_version=None): + """ + Download variants as csv + + Args: + dataset (str): dataset short name + datatype (str): type of data + item (str): query item + ds_version (str): dataset version + """ filename = "{}_{}_{}.csv".format(dataset, datatype, item) self.set_header('Content-Type','text/csv') self.set_header('content-Disposition','attachement; filename={}'.format(filename)) - data = mongodb.get_variant_list(dataset, datatype, item) + data = pgsql.get_variant_list(dataset, datatype, item) # Write header self.write(','.join([h[1] for h in data['headers']]) + '\n') @@ -111,7 +119,7 @@ class GetRegion(handlers.UnsafeHandler): def get(self, dataset, region): """ Request information about genes in a region - + Args: dataset (str): short name of the dataset region (str): the region in the format chr-startpos-endpos @@ -135,7 +143,7 @@ def get(self, dataset, region): self.send_error(status_code=400) self.set_user_msg('Unable to parse region', 'error') return - + if not start: start = 0 if not stop and start: @@ -207,7 +215,7 @@ def get(self, dataset, transcript): class GetVariant(handlers.UnsafeHandler): """ Request information about a gene - """ + """ def get(self, dataset, variant): """ Request information about a gene @@ -229,7 +237,7 @@ def get(self, dataset, variant): return orig_variant = variant variant = lookups.get_variant(dataset, v[1], v[0], v[2], v[3]) - + if not variant: logging.error('Variant not found ({})'.format(orig_variant)) self.send_error(status_code=404) From 6dbccd40b1d11fab77b7c78e0698c62da7148117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 1 Feb 2019 13:51:26 +0100 Subject: [PATCH 043/170] some fixes for tests, documentation additions --- backend/modules/browser/pgsql.py | 34 +++++++++++++------ backend/modules/browser/tests/test_lookups.py | 5 +-- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index b4c1cd53f..9a7be7f38 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -10,23 +10,20 @@ EXON_PADDING = 50 -def get_autocomplete(dataset, query): +def get_autocomplete(query:str): """ Provide autocomplete suggestions based on the query - NOTE: dataset is not used for sql Args: - dataset (str): name of the dataset query (str): the query to compare to the available gene names Returns: list: A list of genes names whose beginning matches the query """ genes = db.Gene.select(db.Gene.name).where(db.Gene.name.startswith(query)) gene_names = [str(gene.name) for gene in genes] - logging.error('Autocomplete: {}'.format(gene_names)) return gene_names -def get_coverage(dataset, datatype, item, ds_version=None): +def get_coverage(dataset:str, datatype:str, item:str, ds_version:str=None): """ Retrieve coverage for a gene/region/transcript @@ -35,6 +32,9 @@ def get_coverage(dataset, datatype, item, ds_version=None): datatype (str): type of "region" (gene/region/transcript) item (str): the datatype item to look up ds_version (str): the dataset version + + Returns: + dict: start, stop, coverage list """ ret = {'coverage':[]} @@ -58,7 +58,7 @@ def get_coverage(dataset, datatype, item, ds_version=None): return ret -def get_coverage_pos(dataset, datatype, item): +def get_coverage_pos(dataset:str, datatype:str, item:str): """ Retrieve coverage range @@ -66,7 +66,9 @@ def get_coverage_pos(dataset, datatype, item): dataset (str): short name of the dataset datatype (str): type of "region" (gene/region/transcript) item (str): the datatype item to look up - ds_version (str): the dataset version + + Returns: + dict: start, stop, chromosome """ ret = {'start':None, 'stop':None, 'chrom':None} @@ -90,9 +92,21 @@ def get_coverage_pos(dataset, datatype, item): ret['chrom'] = chrom return ret - -def get_variant_list(dataset, datatype, item): + +def get_variant_list(dataset:str, datatype:str, item:str, ds_version:str=None): + """ + Retrieve variants for a datatype + + Args: + dataset (str): dataset short name + datatype (str): type of data + item (str): query item + ds_version (str): dataset version + + Returns: + dict: {variants:list, headers:list} + """ headers = [['variant_id','Variant'], ['chrom','Chrom'], ['pos','Position'], ['HGVS','Consequence'], ['filter','Filter'], ['major_consequence','Annotation'], ['flags','Flags'], ['allele_count','Allele Count'], ['allele_num','Allele Number'], @@ -116,6 +130,6 @@ def format_variant(variant): # This is so an array values turns into a comma separated string instead return {k: ", ".join(v) if isinstance(v,list) else v for k, v in variant.items()} - + variants = list(map(format_variant, variants)) return {'variants': variants, 'headers': headers} diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index c4d34c768..043707a01 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -64,6 +64,7 @@ def test_get_coverage_for_bases(): 'pos': 55500320, 'mean': 39.69, 'median': 38.0, 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.961, 0.856, 0.117, 0.001]}] assert coverage == expected + assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 # no end_pos coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500290) @@ -231,11 +232,11 @@ def test_get_number_of_variants_in_transcript(): """ # normal res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENST00000424770') - assert res == {'filtered': 1, 'total': 23} + assert res == {'filtered': 243, 'total': 309} # bad transcript res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENSTASDSADA') - assert res == {'filtered': 0, 'total': 0} + assert res is None # bad dataset res = lookups.get_number_of_variants_in_transcript('bad_dataset', 'ENST00000424770') From 400bc177241f987de77fabad46f2a72a28c2f301 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 4 Feb 2019 09:08:32 +0100 Subject: [PATCH 044/170] Api routes updated to support dataset version as well. Thanks to @kusalananda for help with regex. --- backend/modules/browser/browser_handlers.py | 14 +++++++------- backend/modules/browser/route.py | 20 ++++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index b518e5d88..7ddc3c664 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -13,7 +13,7 @@ REGION_LIMIT = 100000 class Autocomplete(handlers.UnsafeHandler): - def get(self, dataset, query): + def get(self, dataset, query, ds_version=None): ret = {} results = pgsql.get_autocomplete(query) @@ -35,7 +35,7 @@ class GetCoveragePos(handlers.UnsafeHandler): """ Retrieve coverage range """ - def get(self, dataset, datatype, item): + def get(self, dataset, datatype, item, ds_version=None): ret = pgsql.get_coverage_pos(dataset, datatype, item) self.finish(ret) @@ -116,7 +116,7 @@ class GetRegion(handlers.UnsafeHandler): """ Request information about genes in a region """ - def get(self, dataset, region): + def get(self, dataset, region, ds_version=None): """ Request information about genes in a region @@ -173,7 +173,7 @@ class GetTranscript(handlers.UnsafeHandler): """ Request information about a transcript """ - def get(self, dataset, transcript): + def get(self, dataset, transcript, ds_version=None): """ Request information about a transcript @@ -216,7 +216,7 @@ class GetVariant(handlers.UnsafeHandler): """ Request information about a gene """ - def get(self, dataset, variant): + def get(self, dataset, variant, ds_version=None): """ Request information about a gene @@ -318,7 +318,7 @@ class GetVariants(handlers.UnsafeHandler): """ Retrieve variants """ - def get(self, dataset, datatype, item): + def get(self, dataset, datatype, item, ds_version=None): """ Retrieve variants @@ -343,7 +343,7 @@ class Search(handlers.UnsafeHandler): """ Perform a search for the wanted object """ - def get(self, dataset, query): + def get(self, dataset, query, ds_version=None): """ Perform a search for the wanted object diff --git a/backend/modules/browser/route.py b/backend/modules/browser/route.py index df6913617..350c74f30 100755 --- a/backend/modules/browser/route.py +++ b/backend/modules/browser/route.py @@ -1,14 +1,14 @@ from . import browser_handlers as handlers # Browser links -routes = [ (r"/api/datasets/(?P[^\/]+)/browser/gene/(?P[^\/]+)", handlers.GetGene), - (r"/api/datasets/(?P[^\/]+)/browser/region/(?P[^\/]+)", handlers.GetRegion), - (r"/api/datasets/(?P[^\/]+)/browser/transcript/(?P[^\/]+)", handlers.GetTranscript), - (r"/api/datasets/(?P[^\/]+)/browser/variant/(?P[^\/]+)", handlers.GetVariant), - (r"/api/datasets/(?P[^\/]+)/browser/variants/(?P[^\/]+)/(?P[^\/]+)", handlers.GetVariants), - (r"/api/datasets/(?P[^\/]+)/browser/coverage/(?P[^\/]+)/(?P[^\/]+)", handlers.GetCoverage), - (r"/api/datasets/(?P[^\/]+)/browser/coverage_pos/(?P[^\/]+)/(?P[^\/]+)", handlers.GetCoveragePos), - (r"/api/datasets/(?P[^\/]+)/browser/autocomplete/(?P[^\/]+)", handlers.Autocomplete), - (r"/api/datasets/(?P[^\/]+)/browser/search/(?P[^\/]+)", handlers.Search), - (r"/api/datasets/(?P[^\/]+)/browser/download/(?P[^\/]+)/(?P[^\/]+)", handlers.Download), +routes = [(r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/gene/(?P[^/]+)" , handlers.GetGene), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/region/(?P[^\/]+)", handlers.GetRegion), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/transcript/(?P[^/]+)", handlers.GetTranscript), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/variant/(?P[^/]+)", handlers.GetVariant), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/variants/(?P[^/]+)/(?P[^/]+)", handlers.GetVariants), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/coverage/(?P[^/]+)/(?P[^/]+)", handlers.GetCoverage), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/coverage_pos/(?P[^/]+)/(?P[^/]+)", handlers.GetCoveragePos), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/autocomplete/(?P[^/]+)", handlers.Autocomplete), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/search/(?P[^/]+)", handlers.Search), + (r"/api/datasets/(?P[^/]+)/(?:version/(?P[^/]+)/)?browser/download/(?P[^/]+)/(?P[^/]+)", handlers.Download), ] From 7bc68c20394b1e7d5686e991f0c6f504123d436d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 8 Feb 2019 14:47:24 +0100 Subject: [PATCH 045/170] updated get_gene_by_name for new schema --- backend/modules/browser/lookups.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 6fd698d65..900c76a9c 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -264,9 +264,9 @@ def get_gene_by_name(dataset:str, gene_name:str): (db.Gene.name==gene_name)).dicts().get() except db.Gene.DoesNotExist: try: - return db.Gene.select().where((db.Gene.reference_set == ref_dbid) & - (db.Gene.other_names.contains(gene_name))).dicts().get() - except db.Gene.DoesNotExist: + return db.GeneOtherNames.select().join(db.Gene).where((db.GeneOtherNames.name == gene_name) & + (db.Gene.reference_set == ref_dbid)).dicts().get() + except db.GeneOtherNames.DoesNotExist: logging.error('get_gene_by_name({}, {}): unable to retrieve gene'.format(dataset, gene_name)) return {} From c3ae8e4a8d4c7262d4920deb5461a5b028d537ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 8 Feb 2019 15:00:55 +0100 Subject: [PATCH 046/170] should fix get_variants_in_gene for new db schema --- backend/modules/browser/lookups.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 900c76a9c..ff56cc815 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -504,32 +504,32 @@ def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_ver return variants -def get_variants_in_gene(dataset:str, gene_id:str): +def get_variants_in_gene(dataset:str, gene_id:str, ds_version=None): """ Retrieve variants present inside a gene Args: dataset (str): short name of the dataset gene_id (str): id of the gene + ds_version (str): version of the dataset Returns: list: values for the variants """ ref_dbid = db.get_reference_dbid_dataset(dataset) - gene = get_gene(dataset, gene_id) - #### remove when db is fixed - gene['stop'] = gene['start'] + 20000 - #### + if not ref_dbid: + return None + dataset_version = db.get_dataset_version(dataset, ds_version) + if not dataset_version: + return None - variants = get_variants_in_region(dataset, gene['chrom'], gene['start'], gene['stop']) + gene = get_gene(dataset, gene_id) - # variants = [variant for variant in db.Variant.select().where(db.Variant.genes.contains(transcript_id)).dicts()] + variants = [variant for variant in db.Variant.select() + .join(VariantGenes) + .where((db.VariantGenes.name == gene_id) & + (db.Variant.dataset_version == dataset_version)).dicts()] -# for variant in variants: -# variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Gene'] == gene_id] -# add_consequence_to_variant(variant) -# remove_extraneous_information(variant) -# variants.append(variant) return variants From 40ffcf177b7503112a490edb1f5dec8aeb5b4feb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 8 Feb 2019 15:15:13 +0100 Subject: [PATCH 047/170] should fix get_variants_in_transcripts for new db schema, also some other fixes in the get_variants_in functions --- backend/modules/browser/lookups.py | 43 +++++++++++++++++++----------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index ff56cc815..24754640a 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -504,7 +504,7 @@ def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_ver return variants -def get_variants_in_gene(dataset:str, gene_id:str, ds_version=None): +def get_variants_in_gene(dataset:str, gene_id:str, ds_version:str=None): """ Retrieve variants present inside a gene @@ -527,9 +527,13 @@ def get_variants_in_gene(dataset:str, gene_id:str, ds_version=None): variants = [variant for variant in db.Variant.select() .join(VariantGenes) - .where((db.VariantGenes.name == gene_id) & + .where((db.VariantGenes.gene == gene['id']) & (db.Variant.dataset_version == dataset_version)).dicts()] + utils.add_consequence_to_variants(variants) + for variant in variants: + add_rsid_to_variant(dataset, variant) + remove_extraneous_information(variant) return variants @@ -571,33 +575,42 @@ def get_variants_in_region(dataset:str, chrom:str, start_pos:int, end_pos:int, d for variant in variants: if variant['rsid']: variant['rsid'] = 'rs{}'.format(variant['rsid']) - # add_rsid_to_variant(dataset, variant) + add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants -def get_variants_in_transcript(dataset:str, transcript_id:str): +def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=None): """ Retrieve variants inside a transcript Args: dataset (str): short name of the dataset transcript_id (str): id of the transcript (ENST) + ds_version (str): version of the dataset Returns: dict: values for the variant; None if not found """ - transcript = get_transcript(dataset, transcript_id) - if not transcript: - return None - # temporary while waiting for db fix - variants = get_variants_in_region(dataset, transcript['chrom'], transcript['start'], transcript['stop']) - # variants = [variant for variant in db.Variant.select().where(db.Variant.transcripts.contains(transcript_id)).dicts()] - -# for variant in variants: -# variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] -# add_consequence_to_variant(variant) -# remove_extraneous_information(variant) + ref_dbid = db.get_reference_dbid_dataset(dataset) + if not ref_dbid: + return None + dataset_version = db.get_dataset_version(dataset, ds_version) + if not dataset_version: + return None + + transcript = get_transcript(dataset, gene_id) + + variants = [variant for variant in db.Variant.select() + .join(VariantTranscripts) + .where((db.VariantTranscripts.transcript == transcript['id']) & + (db.Variant.dataset_version == dataset_version)).dicts()] + + utils.add_consequence_to_variants(variants) + for variant in variants: + variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] + add_rsid_to_variant(dataset, variant) + remove_extraneous_information(variant) return variants From 2d7c4f5a21b7b082ae014aea45da35948b51e06d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 11 Feb 2019 12:46:59 +0100 Subject: [PATCH 048/170] a couple of small fixes --- backend/modules/browser/lookups.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 24754640a..4ed40f8f7 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -526,7 +526,7 @@ def get_variants_in_gene(dataset:str, gene_id:str, ds_version:str=None): gene = get_gene(dataset, gene_id) variants = [variant for variant in db.Variant.select() - .join(VariantGenes) + .join(db.VariantGenes) .where((db.VariantGenes.gene == gene['id']) & (db.Variant.dataset_version == dataset_version)).dicts()] @@ -599,10 +599,10 @@ def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=No if not dataset_version: return None - transcript = get_transcript(dataset, gene_id) + transcript = get_transcript(dataset, transcript_id) variants = [variant for variant in db.Variant.select() - .join(VariantTranscripts) + .join(db.VariantTranscripts) .where((db.VariantTranscripts.transcript == transcript['id']) & (db.Variant.dataset_version == dataset_version)).dicts()] From c407e9feeac1d36bf7045a7425914e5caad7902d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 12 Feb 2019 10:39:13 +0100 Subject: [PATCH 049/170] remove when db is fixed --- backend/modules/browser/browser_handlers.py | 6 ------ backend/modules/browser/tests/test_utils.py | 8 -------- 2 files changed, 14 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 7ddc3c664..00135db70 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -1,4 +1,3 @@ -import json # remove when db is fixed import logging import handlers @@ -82,9 +81,6 @@ def get(self, dataset, gene, ds_version=None): # Gene gene = lookups.get_gene(dataset, gene_id) - #### Remove when db is fixed - gene['stop'] = gene['start'] + 20000 - #### if gene: ret['gene'] = gene @@ -245,14 +241,12 @@ def get(self, dataset, variant, ds_version=None): return # Just get the information we need - variant['quality_metrics'] = json.loads(variant['quality_metrics']) # remove when db is fixed for item in ["variant_id", "chrom", "pos", "ref", "alt", "rsid", "allele_num", "allele_freq", "allele_count", "orig_alt_alleles", "site_quality", "quality_metrics", "transcripts", "genes"]: ret['variant'][item] = variant[item] ret['variant']['filter'] = variant['filter_string'] - variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed # Variant Effect Predictor (VEP) annotations # https://www.ensembl.org/info/docs/tools/vep/vep_formats.html ret['variant']['consequences'] = [] diff --git a/backend/modules/browser/tests/test_utils.py b/backend/modules/browser/tests/test_utils.py index bb74f5693..96c80db98 100644 --- a/backend/modules/browser/tests/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -5,8 +5,6 @@ from .. import lookups from .. import utils -import json - def test_add_consequence_to_variants(): """ @@ -15,8 +13,6 @@ def test_add_consequence_to_variants(): variants = [] variants.append(lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T')) variants.append(lookups.get_variant('SweGen', 55500283, '1', 'A', 'T')) - variants[0]['vep_annotations'] = json.loads(variants[0]['vep_annotations']) # remove when db is fixed - variants[1]['vep_annotations'] = json.loads(variants[1]['vep_annotations']) # remove when db is fixed utils.add_consequence_to_variants(variants) assert variants[0]['major_consequence'] == 'intron_variant' @@ -28,12 +24,10 @@ def test_add_consequence_to_variant(): Test add_consequence_to_variant() """ variant = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') - variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed utils.add_consequence_to_variant(variant) assert variant['major_consequence'] == 'intron_variant' variant2 = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - variant2['vep_annotations'] = json.loads(variant2['vep_annotations']) # remove when db is fixed utils.add_consequence_to_variant(variant2) assert variant2['major_consequence'] == 'upstream_gene_variant' @@ -43,7 +37,6 @@ def test_annotation_severity(): Test annotation_severity() """ variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed res = utils.annotation_severity(variant['vep_annotations'][0]) assert res == -26.9 @@ -151,7 +144,6 @@ def test_worst_csq_from_csq(): Test worst_csq_from_csq() """ variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - variant['vep_annotations'] = json.loads(variant['vep_annotations']) # remove when db is fixed res = utils.worst_csq_from_csq(variant['vep_annotations'][0]['Consequence']) assert res == 'upstream_gene_variant' res = utils.worst_csq_from_csq('non_coding_exon_variant&nc_transcript_variant') From e9faf47ebe67a8d5c9f530a4296e63185b6a7700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 12 Feb 2019 13:06:01 +0100 Subject: [PATCH 050/170] adding genes and transcripts to variants after db schema update --- backend/modules/browser/lookups.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 4ed40f8f7..a531ef1ef 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -341,15 +341,23 @@ def get_raw_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_versio return None try: - return (db.Variant - .select() - .where((db.Variant.pos == pos) & - (db.Variant.ref == ref) & - (db.Variant.alt == alt) & - (db.Variant.chrom == chrom) & - (db.Variant.dataset_version == dataset_version.id)) - .dicts() - .get()) + variant = (db.Variant + .select() + .where((db.Variant.pos == pos) & + (db.Variant.ref == ref) & + (db.Variant.alt == alt) & + (db.Variant.chrom == chrom) & + (db.Variant.dataset_version == dataset_version.id)) + .dicts() + .get()) + variant['genes'] = [gene for gene in + db.VariantGenes.select(db.VariantGenes.gene) + .where(db.VariantGenes.variant == variant['id']) + .dicts()] + variant['transcripts'] = [transcript for transcript in + db.VariantTranscripts.select(db.VariantTranscripts.transcript) + .where(db.VariantTranscripts.variant == variant['id']) + .dicts()] except db.Variant.DoesNotExist: logging.error('get_raw_variant({}, {}, {}, {}, {}, {}): unable to retrieve variant' .format(dataset, pos, chrom, ref, alt, dataset_version.id)) From e48d54cca1c54381f156d68a80dffd891cb20480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 10:54:56 +0100 Subject: [PATCH 051/170] seems to work with new db schema, but speed is not optimal --- backend/modules/browser/lookups.py | 41 ++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index a531ef1ef..34f2c00a0 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -358,6 +358,7 @@ def get_raw_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_versio db.VariantTranscripts.select(db.VariantTranscripts.transcript) .where(db.VariantTranscripts.variant == variant['id']) .dicts()] + return variant except db.Variant.DoesNotExist: logging.error('get_raw_variant({}, {}, {}, {}, {}, {}): unable to retrieve variant' .format(dataset, pos, chrom, ref, alt, dataset_version.id)) @@ -537,10 +538,18 @@ def get_variants_in_gene(dataset:str, gene_id:str, ds_version:str=None): .join(db.VariantGenes) .where((db.VariantGenes.gene == gene['id']) & (db.Variant.dataset_version == dataset_version)).dicts()] + ##### remove when db is fixed + for variant in variants: + variant['hom_count'] = 0 + variant['filter'] = variant['filter_string'] + ##### utils.add_consequence_to_variants(variants) for variant in variants: - add_rsid_to_variant(dataset, variant) + if variant['rsid'] and variant['rsid'] != '.': + variant['rsid'] = 'rs{}'.format(variant['rsid']) + else: + add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants @@ -569,21 +578,21 @@ def get_variants_in_region(dataset:str, chrom:str, start_pos:int, end_pos:int, d (db.Variant.chrom == chrom) & (db.Variant.dataset_version == dataset_version)) .dicts()) + variants = [variant for variant in query] ##### remove when db is fixed for variant in variants: - variant['quality_metrics'] = json.loads(variant['quality_metrics']) - variant['vep_annotations'] = json.loads(variant['vep_annotations']) variant['hom_count'] = 0 variant['filter'] = variant['filter_string'] ##### utils.add_consequence_to_variants(variants) for variant in variants: - if variant['rsid']: + if variant['rsid'] and variant['rsid'] != '.': variant['rsid'] = 'rs{}'.format(variant['rsid']) - add_rsid_to_variant(dataset, variant) + else: + add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants @@ -610,14 +619,24 @@ def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=No transcript = get_transcript(dataset, transcript_id) variants = [variant for variant in db.Variant.select() - .join(db.VariantTranscripts) - .where((db.VariantTranscripts.transcript == transcript['id']) & - (db.Variant.dataset_version == dataset_version)).dicts()] + .join(db.VariantTranscripts) + .where((db.VariantTranscripts.transcript == transcript['id']) & + (db.Variant.dataset_version == dataset_version)) + .dicts()] + + ##### remove when db is fixed + for variant in variants: + variant['hom_count'] = 0 + variant['filter'] = variant['filter_string'] + ##### utils.add_consequence_to_variants(variants) for variant in variants: variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] - add_rsid_to_variant(dataset, variant) + if variant['rsid'] and variant['rsid'] != '.': + variant['rsid'] = 'rs{}'.format(variant['rsid']) + else: + add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants @@ -625,8 +644,8 @@ def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=No def remove_extraneous_information(variant): #del variant['genotype_depths'] #del variant['genotype_qualities'] - del variant['transcripts'] - del variant['genes'] +# del variant['transcripts'] +# del variant['genes'] del variant['orig_alt_alleles'] del variant['site_quality'] del variant['vep_annotations'] From 19f16988487179f8b002649ab9f9026481bf99f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 13:43:56 +0100 Subject: [PATCH 052/170] decreased logging --- backend/modules/browser/browser_handlers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 00135db70..7b5b2b17f 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -328,9 +328,7 @@ def get(self, dataset, datatype, item, ds_version=None): n = a[0] + "".join([b[0].upper() + b[1:] for b in a.split("_")])[1:] headers += [[n, h]] ret['headers'] = headers - logging.error('Variant request {} items'.format(len(ret))) - logging.error('Variant request {} items'.format(ret)) - self.finish( ret ) + self.finish(ret) class Search(handlers.UnsafeHandler): From 85644cb2c49e242f86e96f031297a1c35f2c03cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 13:44:48 +0100 Subject: [PATCH 053/170] rsid fixing; they can't be '.' with new db --- backend/modules/browser/lookups.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 34f2c00a0..0cfd57d2e 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -25,7 +25,7 @@ def add_rsid_to_variant(dataset:str, variant:str): .get()) dbsnp_version = refset['dbsnp_version'] - if variant['rsid'] == '.' or variant['rsid'] is None: + if not variant['rsid']: try: rsid = (db.DbSNP .select() @@ -36,7 +36,8 @@ def add_rsid_to_variant(dataset:str, variant:str): .get()) variant['rsid'] = 'rs{}'.format(rsid['rsid']) except db.DbSNP.DoesNotExist: - logging.error('add_rsid_to_variant({}, variant[dbid: {}]): unable to retrieve rsid'.format(dataset, variant['id'])) + pass + # logging.error('add_rsid_to_variant({}, variant[dbid: {}]): unable to retrieve rsid'.format(dataset, variant['id'])) REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') @@ -546,7 +547,7 @@ def get_variants_in_gene(dataset:str, gene_id:str, ds_version:str=None): utils.add_consequence_to_variants(variants) for variant in variants: - if variant['rsid'] and variant['rsid'] != '.': + if variant['rsid']: variant['rsid'] = 'rs{}'.format(variant['rsid']) else: add_rsid_to_variant(dataset, variant) @@ -589,7 +590,7 @@ def get_variants_in_region(dataset:str, chrom:str, start_pos:int, end_pos:int, d utils.add_consequence_to_variants(variants) for variant in variants: - if variant['rsid'] and variant['rsid'] != '.': + if variant['rsid']: variant['rsid'] = 'rs{}'.format(variant['rsid']) else: add_rsid_to_variant(dataset, variant) @@ -633,7 +634,7 @@ def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=No utils.add_consequence_to_variants(variants) for variant in variants: variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] - if variant['rsid'] and variant['rsid'] != '.': + if variant['rsid']: variant['rsid'] = 'rs{}'.format(variant['rsid']) else: add_rsid_to_variant(dataset, variant) From 428b12e3b9a5370197458f014e889b0a80336851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 13:45:23 +0100 Subject: [PATCH 054/170] converting tests to chromosome 22 --- backend/modules/browser/tests/test_lookups.py | 92 +++++++++---------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index 043707a01..3160418da 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -9,16 +9,14 @@ def test_add_rsid_to_variant(): """ Test add_rsid_to_variant() """ - # "with ." - variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - rsid = variant['rsid'] - variant['rsid'] = '.' + variant = lookups.get_variant('SweGen', 34730985, '22', 'G', 'A') lookups.add_rsid_to_variant('SweGen', variant) - assert variant['rsid'] == rsid - # "non-existing" - del variant['rsid'] + assert variant['rsid'] == 'rs924645261' + variant = lookups.get_variant('SweGen', 16113980, '22', 'C', 'T') + rsid = variant['rsid'] + variant['rsid'] = '' lookups.add_rsid_to_variant('SweGen', variant) - assert variant['rsid'] == rsid + assert variant['rsid'] == 'rs9680543' def test_get_awesomebar_result(): @@ -50,27 +48,17 @@ def test_get_coverage_for_bases(): Test get_coverage_for_bases() """ # normal - coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500320) - expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500290, 'mean': 40.66, 'median': 39.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, - {'id': 5474063, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500300, 'mean': 40.7, 'median': 39.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.971, 0.878, 0.132, 0.001]}, - {'id': 5474064, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500310, 'mean': 40.35, 'median': 39.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.995, 0.974, 0.859, 0.138, 0.001]}, - {'id': 5474065, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500320, 'mean': 39.69, 'median': 38.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.961, 0.856, 0.117, 0.001]}] - assert coverage == expected + coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546423, 46549652) + assert len(coverage) == 323 + assert coverage[0] == {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 # no end_pos - coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500290) - expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500290, 'mean': 40.66, 'median': 39.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}] + coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546430) + assert coverage == [{'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430}] + assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 # no hits coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500285) @@ -84,21 +72,25 @@ def test_get_coverage_for_transcript(): """ Test get_coverage_for_transcript() """ - coverage = lookups.get_coverage_for_transcript('SweGen', '1', 55500283, 55500320) - expected = [{'id': 5474062, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500290, 'mean': 40.66, 'median': 39.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.97, 0.867, 0.127, 0.001]}, - {'id': 5474063, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500300, 'mean': 40.7, 'median': 39.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.971, 0.878, 0.132, 0.001]}, - {'id': 5474064, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500310, 'mean': 40.35, 'median': 39.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.995, 0.974, 0.859, 0.138, 0.001]}, - {'id': 5474065, 'dataset_version': 4, 'chrom': '1', - 'pos': 55500320, 'mean': 39.69, 'median': 38.0, - 'coverage': [1.0, 1.0, 1.0, 1.0, 0.996, 0.961, 0.856, 0.117, 0.001]}] - assert coverage == expected - assert not lookups.get_coverage_for_transcript('BAD_DATASET', '1', 55500283, 55500320) + # normal + coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546423, 46549652) + assert len(coverage) == 323 + assert coverage[0] == {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} + assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 + + # no end_pos + coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546430) + assert coverage == [{'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430}] + assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 + + # no hits + coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500285) + assert not coverage + + # incorrect dataset + assert not lookups.get_coverage_for_bases('BAD_DATASET', '1', 55500283, 55500320) def test_get_exons_in_transcript(caplog): @@ -124,16 +116,16 @@ def test_get_gene(): Test get_gene() """ # normal entry - expected = {'id': 1, - 'reference_set': 1, - 'gene_id': 'ENSG00000223972', - 'name': 'DDX11L1', - 'full_name': 'DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1', - 'canonical_transcript': 'ENST00000456328', - 'chrom': '1', - 'start': 11870, + expected = {'gene_id': 'ENSG00000223972', + 'name': 'SNORA15', + 'full_name': '', + 'canonical_transcript': 'ENST00000516131', + 'chrom': '22', + 'start': 19237396, + 'stop': 19237489, 'strand': '+'} - result = lookups.get_gene('SweGen', 'ENSG00000223972') + + result = lookups.get_gene('SweGen', 'ENSG00000251940') for val in expected: assert result[val] == expected[val] From 314eb20753faf9478af3f7d308858788691a058c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 14:03:34 +0100 Subject: [PATCH 055/170] fix for finding other names; return the db.Gene part --- backend/modules/browser/lookups.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 0cfd57d2e..011737ff6 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -261,12 +261,19 @@ def get_gene_by_name(dataset:str, gene_name:str): if not ref_dbid: return {} try: - return db.Gene.select().where((db.Gene.reference_set == ref_dbid) & - (db.Gene.name==gene_name)).dicts().get() + return (db.Gene.select() + .where((db.Gene.reference_set == ref_dbid) & + (db.Gene.name==gene_name)) + .dicts() + .get()) except db.Gene.DoesNotExist: try: - return db.GeneOtherNames.select().join(db.Gene).where((db.GeneOtherNames.name == gene_name) & - (db.Gene.reference_set == ref_dbid)).dicts().get() + return (db.GeneOtherNames.select(db.Gene) + .join(db.Gene) + .where((db.GeneOtherNames.name == gene_name) & + (db.Gene.reference_set == ref_dbid)) + .dicts() + .get()) except db.GeneOtherNames.DoesNotExist: logging.error('get_gene_by_name({}, {}): unable to retrieve gene'.format(dataset, gene_name)) return {} From dccc240cd5c3128c0b7d028444bf32c9bdea94a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 14:42:35 +0100 Subject: [PATCH 056/170] handling a few cases of missing data observed during testing --- backend/modules/browser/lookups.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 011737ff6..243c46b0c 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -325,6 +325,7 @@ def get_number_of_variants_in_transcript(dataset:str, transcript_id:str, ds_vers if not variants: return None total = len(variants) + filtered = len(tuple(variant for variant in variants if variant['filter_string'] == 'PASS')) return {'filtered': filtered, 'total': total} @@ -383,9 +384,11 @@ def get_transcript(dataset:str, transcript_id:str): transcript_id (str): the id of the transcript Returns: - dict: values for the transcript, including exons; empty if not found + dict: values for the transcript, including exons; None if not found """ ref_dbid = db.get_reference_dbid_dataset(dataset) + if not ref_dbid: + return None try: transcript = (db.Transcript .select() @@ -397,7 +400,7 @@ def get_transcript(dataset:str, transcript_id:str): transcript['exons'] = get_exons_in_transcript(dataset, transcript_id) return transcript except db.Transcript.DoesNotExist: - return {} + return None def get_transcripts_in_gene(dataset:str, gene_id:str): @@ -625,6 +628,8 @@ def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=No return None transcript = get_transcript(dataset, transcript_id) + if not transcript: + return None variants = [variant for variant in db.Variant.select() .join(db.VariantTranscripts) From 887baed71cb8f5ca7a298d2cd7a03927c8a3a052 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 15:03:34 +0100 Subject: [PATCH 057/170] fix incorrect naming of returned genes/transcripts --- backend/modules/browser/lookups.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 243c46b0c..2d94255e1 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -359,12 +359,14 @@ def get_raw_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_versio (db.Variant.dataset_version == dataset_version.id)) .dicts() .get()) - variant['genes'] = [gene for gene in - db.VariantGenes.select(db.VariantGenes.gene) + variant['genes'] = [gene['gene_id'] for gene in + db.VariantGenes.select(db.Gene.gene_id) + .join(db.Gene) .where(db.VariantGenes.variant == variant['id']) .dicts()] - variant['transcripts'] = [transcript for transcript in - db.VariantTranscripts.select(db.VariantTranscripts.transcript) + variant['transcripts'] = [transcript['transcript_id'] for transcript in + db.VariantTranscripts.select(db.Transcript.transcript_id) + .join(db.Transcript) .where(db.VariantTranscripts.variant == variant['id']) .dicts()] return variant From f8337395f0691ad2719d7a8155666aba31398bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 13 Feb 2019 15:04:14 +0100 Subject: [PATCH 058/170] further migration of test to chromosome 22 --- backend/modules/browser/tests/test_lookups.py | 113 ++++++------------ 1 file changed, 38 insertions(+), 75 deletions(-) diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index 3160418da..82d077eea 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -116,9 +116,9 @@ def test_get_gene(): Test get_gene() """ # normal entry - expected = {'gene_id': 'ENSG00000223972', + expected = {'gene_id': 'ENSG00000251940', 'name': 'SNORA15', - 'full_name': '', + 'full_name': None, 'canonical_transcript': 'ENST00000516131', 'chrom': '22', 'start': 19237396, @@ -143,14 +143,13 @@ def test_get_gene_by_dbid(): Test get_gene_by_dbid() """ # normal entry - expected = {'id': 53626, - 'reference_set': 1, - 'gene_id': 'ENSG00000226444', + expected = {'gene_id': 'ENSG00000226444', 'name': 'ACTR3BP6', 'full_name': 'ACTR3B pseudogene 6', 'canonical_transcript': 'ENST00000421366', 'chrom': '22', - 'start': 16967411, + 'start': 16967410, + 'stop': 16969212, 'strand': '+'} result = lookups.get_gene_by_dbid(53626) for val in expected: @@ -162,30 +161,23 @@ def test_get_gene_by_dbid(): result = lookups.get_gene_by_dbid(-1) assert not result + def test_get_gene_by_name(caplog): """ Test get_gene_by_name() """ # normal entry - expected = {'id': 1, - 'reference_set': 1, - 'gene_id': 'ENSG00000223972', - 'gene_name': 'DDX11L1', - 'full_name': 'DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1', - 'canonical_transcript': 'ENST00000456328', - 'chrom': '1', - 'start_pos': 11870, + expected = {'gene_id': 'ENSG00000226444', + 'name': 'ACTR3BP6', + 'full_name': 'ACTR3B pseudogene 6', + 'canonical_transcript': 'ENST00000421366', + 'chrom': '22', + 'start': 16967410, + 'stop': 16969212, 'strand': '+'} - result = lookups.get_gene_by_name('SweGen', 'DDX11L1') - assert result['id'] == expected['id'] - assert result['reference_set'] == expected['reference_set'] - assert result['gene_id'] == expected['gene_id'] - assert result['name'] == expected['gene_name'] - assert result['full_name'] == expected['full_name'] - assert result['canonical_transcript'] == expected['canonical_transcript'] - assert result['chrom'] == expected['chrom'] - assert result['start'] == expected['start_pos'] - assert result['strand'] == expected['strand'] + result = lookups.get_gene_by_name('SweGen', 'ACTR3BP6') + for val in expected: + assert result[val] == expected[val] # non-existing gene result = lookups.get_gene_by_name('SweGen', 'NOT_A_GENE') @@ -197,8 +189,9 @@ def test_get_gene_by_name(caplog): assert not result # name in other_names - result = lookups.get_gene_by_name('SweGen', 'NIR') - assert result['gene_id'] == 'ENSG00000188976' + result = lookups.get_gene_by_name('SweGen', 'BCL8C') + print(result) + assert result['gene_id'] == 'ENSG00000223875' def test_get_genes_in_region(): @@ -224,7 +217,7 @@ def test_get_number_of_variants_in_transcript(): """ # normal res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENST00000424770') - assert res == {'filtered': 243, 'total': 309} + assert res == {'filtered': 66, 'total': 309} # bad transcript res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENSTASDSADA') @@ -240,38 +233,16 @@ def test_get_transcript(): Test get_transcript() """ # normal entry - expected = {'id': 5, - 'transcript_id': 'ENST00000438504', - 'gene': '2', - 'mim_annotation': 'Was protein family homolog 1; wash1', - 'chrom': '1', - 'mim_gene_accession': 613632, - 'start_pos': 14364, - 'stop_pos': 29371, - 'strand': '-'} - exp_exon = [{'id': 28, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14364, 'stop': 14830, 'strand': '-', 'feature_type': 'exon'}, - {'id': 27, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 14971, 'stop': 15039, 'strand': '-', 'feature_type': 'exon'}, - {'id': 26, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15797, 'stop': 15902, 'strand': '-', 'feature_type': 'exon'}, - {'id': 25, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 15905, 'stop': 15948, 'strand': '-', 'feature_type': 'exon'}, - {'id': 24, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16608, 'stop': 16766, 'strand': '-', 'feature_type': 'exon'}, - {'id': 23, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 16855, 'stop': 17056, 'strand': '-', 'feature_type': 'exon'}, - {'id': 22, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17234, 'stop': 17365, 'strand': '-', 'feature_type': 'exon'}, - {'id': 21, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17603, 'stop': 17743, 'strand': '-', 'feature_type': 'exon'}, - {'id': 20, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 17916, 'stop': 18062, 'strand': '-', 'feature_type': 'exon'}, - {'id': 19, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 18269, 'stop': 18380, 'strand': '-', 'feature_type': 'exon'}, - {'id': 18, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 24739, 'stop': 24892, 'strand': '-', 'feature_type': 'exon'}, - {'id': 17, 'gene': 2, 'transcript': 5, 'chrom': '1', 'start': 29322, 'stop': 29371, 'strand': '-', 'feature_type': 'exon'}] - - result = lookups.get_transcript('SweGen', 'ENST00000438504') - assert result['id'] == expected['id'] - assert result['mim_annotation'] == expected['mim_annotation'] - assert result['transcript_id'] == expected['transcript_id'] - assert result['mim_gene_accession'] == expected['mim_gene_accession'] - assert result['chrom'] == expected['chrom'] - assert result['start'] == expected['start_pos'] - assert result['stop'] == expected['stop_pos'] - assert result['strand'] == expected['strand'] - assert result['exons'] == exp_exon + expected = {'transcript_id': 'ENST00000398242', + 'chrom': '22', + 'start': 16122720, + 'stop': 16123768, + 'strand': '+'} + + result = lookups.get_transcript('SweGen', 'ENST00000398242') + for val in expected: + assert result[val] == expected[val] + assert len(result['exons']) == 1 # non-existing assert not lookups.get_transcript('SweGen', 'INCORRECT') @@ -281,14 +252,8 @@ def test_get_transcripts_in_gene(): """ Test get_transcripts_in_gene() """ - res = lookups.get_transcripts_in_gene('SweGen', 'ENSG00000241670') - expected = [{'id': 39, 'transcript_id': 'ENST00000424429', 'gene': 19, - 'mim_gene_accession': None, 'mim_annotation': None, - 'chrom': '1', 'start': 228293, 'stop': 228655, 'strand': '-'}, - {'id': 40, 'transcript_id': 'ENST00000450734', 'gene': 19, - 'mim_gene_accession': None, 'mim_annotation': None, - 'chrom': '1', 'start': 228320, 'stop': 228776, 'strand': '-'}] - assert res == expected + res = lookups.get_transcripts_in_gene('SweGen', 'ENSG00000103197') + assert len(res) == 27 assert not lookups.get_transcripts_in_gene('bad_dataset', 'ENSG00000241670') assert not lookups.get_transcripts_in_gene('SweGen', 'ENSGASDFG') @@ -298,9 +263,9 @@ def test_get_raw_variant(): """ Test get_raw_variant """ - result = lookups.get_raw_variant('SweGen', 55500283, '1', 'A', 'T') - assert result['genes'] == ['ENSG00000169174'] - assert result['transcripts'] == ['ENST00000302118'] + result = lookups.get_variant('SweGen', 16057464, '22', 'G', 'A') + assert result['genes'] == ['ENSG00000233866'] + assert result['transcripts'] == ['ENST00000424770'] assert not lookups.get_raw_variant('SweGen', 55500281, '1', 'A', 'T') assert not lookups.get_raw_variant('bad_dataset', 55500283, '1', 'A', 'T') @@ -319,13 +284,11 @@ def test_get_variant(): """ Test get_variant() """ - result = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - assert result['genes'] == ['ENSG00000169174'] - assert result['transcripts'] == ['ENST00000302118'] - assert result['rsid'] == 'rs75050571' + result = lookups.get_variant('SweGen', 16057464, '22', 'G', 'A') + assert result['genes'] == ['ENSG00000233866'] + assert result['transcripts'] == ['ENST00000424770'] # missing rsid in result, multiple transcripts - # slow, need to fix db result = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') assert result['genes'] == ['ENSG00000160298'] assert result['transcripts'] == ['ENST00000417060', 'ENST00000397682', From 862f0c81dc8322de3dd28c167773aeca62ac3661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 13:21:07 +0100 Subject: [PATCH 059/170] lookups updated for testing only chr22 --- backend/modules/browser/lookups.py | 18 +++++++++-- backend/modules/browser/tests/test_lookups.py | 31 +++++++------------ 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 2d94255e1..e23cc2be8 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -500,12 +500,14 @@ def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_ver .where(db.Dataset.short_name == dataset) .dicts() .get()) + if not refset: + return [] dbsnp_version = refset['dbsnp_version'] rsid_dbsnp = (db.DbSNP .select() .where((db.DbSNP.rsid == rsid) & - (db.DbSNP.version_id == dbsnp_version) ) + (db.DbSNP.version_id == dbsnp_version)) .dicts() .get()) query = (db.Variant @@ -522,7 +524,19 @@ def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_ver .dicts()) variants = [variant for variant in query] - # add_consequence_to_variants(variants) + for variant in variants: + variant['genes'] = [gene['gene_id'] for gene in + db.VariantGenes.select(db.Gene.gene_id) + .join(db.Gene) + .where(db.VariantGenes.variant == variant['id']) + .dicts()] + variant['transcripts'] = [transcript['transcript_id'] for transcript in + db.VariantTranscripts.select(db.Transcript.transcript_id) + .join(db.Transcript) + .where(db.VariantTranscripts.variant == variant['id']) + .dicts()] + + utils.add_consequence_to_variants(variants) return variants diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index 82d077eea..4ab9674d5 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -288,17 +288,6 @@ def test_get_variant(): assert result['genes'] == ['ENSG00000233866'] assert result['transcripts'] == ['ENST00000424770'] - # missing rsid in result, multiple transcripts - result = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') - assert result['genes'] == ['ENSG00000160298'] - assert result['transcripts'] == ['ENST00000417060', 'ENST00000397682', - 'ENST00000397683', 'ENST00000397680', - 'ENST00000397685', 'ENST00000397679', - 'ENST00000291691', 'ENST00000445935', - 'ENST00000491666', 'ENST00000472607', - 'ENST00000475776'] - assert result['rsid'] == 'rs75050571' - # incorrect position assert not lookups.get_variant('SweGen', -1, '1', 'A', 'T') @@ -308,16 +297,18 @@ def test_get_variants_by_rsid(caplog): Test get_variants_by_rsid() ''' # normal - result = lookups.get_variants_by_rsid('SweGen', 'rs373706802') - assert result[0]['pos'] == 16080482 - assert result[0]['genes'] == ['ENSG00000229286', 'ENSG00000235265'] - assert result[0]['transcripts'] == ['ENST00000448070','ENST00000413156'] + result = lookups.get_variants_by_rsid('SweGen', 'rs185758992') + assert result[0]['pos'] == 38481311 + assert set(result[0]['genes']) == set(['ENSG00000100156', 'ENSG00000128298', 'ENSG00000272720']) + assert len(result[0]['genes']) == 3 + assert len(result[0]['transcripts']) == 6 # by position - result = lookups.get_variants_by_rsid('SweGen', 'rs373706802', check_position=True) - assert result[0]['pos'] == 16080482 - assert result[0]['genes'] == ['ENSG00000229286', 'ENSG00000235265'] - assert result[0]['transcripts'] == ['ENST00000448070','ENST00000413156'] + result = lookups.get_variants_by_rsid('SweGen', 'rs185758992', check_position=True) + assert result[0]['pos'] == 38481311 + assert set(result[0]['genes']) == set(['ENSG00000100156', 'ENSG00000128298', 'ENSG00000272720']) + assert len(result[0]['genes']) == 3 + assert len(result[0]['transcripts']) == 6 # errors assert lookups.get_variants_by_rsid('incorrect_name', 'rs373706802') is None @@ -367,4 +358,4 @@ def test_get_variants_in_transcript(): Test get_variants_in_transcript() """ res = lookups.get_variants_in_transcript('SweGen', 'ENST00000452800') - assert len(res) == 1414 + assert len(res) == 1174 From 667a8a5f728295ff42059ead10cfe823b32989fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 13:21:30 +0100 Subject: [PATCH 060/170] pgsql updated for testing only chr22 --- backend/modules/browser/pgsql.py | 45 ++++++++------- backend/modules/browser/tests/test_pgsql.py | 64 +++++++++++++++++++++ 2 files changed, 87 insertions(+), 22 deletions(-) create mode 100644 backend/modules/browser/tests/test_pgsql.py diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index 9a7be7f38..2289ad22d 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -40,10 +40,12 @@ def get_coverage(dataset:str, datatype:str, item:str, ds_version:str=None): if datatype == 'gene': gene = lookups.get_gene(dataset, item) - transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) - start = transcript['start'] - EXON_PADDING - stop = transcript['stop'] + EXON_PADDING - ret['coverage'] = lookups.get_coverage_for_transcript(dataset, transcript['chrom'], start, stop, ds_version) + if gene: + transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) + if transcript: + start = transcript['start'] - EXON_PADDING + stop = transcript['stop'] + EXON_PADDING + ret['coverage'] = lookups.get_coverage_for_transcript(dataset, transcript['chrom'], start, stop, ds_version) elif datatype == 'region': chrom, start, stop = item.split('-') start = int(start) @@ -51,9 +53,10 @@ def get_coverage(dataset:str, datatype:str, item:str, ds_version:str=None): ret['coverage'] = lookups.get_coverage_for_bases(dataset, chrom, start, stop, ds_version) elif datatype == 'transcript': transcript = lookups.get_transcript(dataset, item) - start = transcript['start'] - EXON_PADDING - stop = transcript['stop'] + EXON_PADDING - ret['coverage'] = lookups.get_coverage_for_transcript(dataset, transcript['chrom'], start, stop, ds_version) + if transcript: + start = transcript['start'] - EXON_PADDING + stop = transcript['stop'] + EXON_PADDING + ret['coverage'] = lookups.get_coverage_for_transcript(dataset, transcript['chrom'], start, stop, ds_version) return ret @@ -72,24 +75,22 @@ def get_coverage_pos(dataset:str, datatype:str, item:str): """ ret = {'start':None, 'stop':None, 'chrom':None} - if datatype == 'gene': - gene = lookups.get_gene(dataset, item) - transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) - elif datatype == 'transcript': - transcript = lookups.get_transcript(dataset, item) - if datatype == 'region': chrom, start, stop = item.split('-') - start = int(start) - stop = int(stop) + if start and stop and chrom: + ret['start'] = int(start) + ret['stop'] = int(stop) + ret['chrom'] = chrom else: - start = transcript['start'] - EXON_PADDING - stop = transcript['stop'] + EXON_PADDING - chrom = transcript['chrom'] - - ret['start'] = start - ret['stop'] = stop - ret['chrom'] = chrom + if datatype == 'gene': + gene = lookups.get_gene(dataset, item) + transcript = lookups.get_transcript(dataset, gene['canonical_transcript']) + elif datatype == 'transcript': + transcript = lookups.get_transcript(dataset, item) + if transcript: + ret['start'] = transcript['start'] - EXON_PADDING + ret['stop'] = transcript['stop'] + EXON_PADDING + ret['chrom'] = transcript['chrom'] return ret diff --git a/backend/modules/browser/tests/test_pgsql.py b/backend/modules/browser/tests/test_pgsql.py new file mode 100644 index 000000000..ac0014602 --- /dev/null +++ b/backend/modules/browser/tests/test_pgsql.py @@ -0,0 +1,64 @@ +""" +Tests for the functions available in pgsql.py +""" + +from .. import pgsql + + +def test_get_autocomplete(): + """ + Test get_autocomplete() + """ + res = pgsql.get_autocomplete('ADH') + expected = set(['ADH1A', 'ADH1B', 'ADH1C', 'ADH4', + 'ADH5', 'ADH6', 'ADH7', 'ADH5P2', + 'ADH5P3', 'ADH5P4', 'ADHFE1']) + assert set(res) == expected + + +def test_get_coverage(): + """ + Test get_coverage() + """ + res = pgsql.get_coverage('SweGen', 'gene', 'ENSG00000231565') + assert len(res['coverage']) == 144 + res = pgsql.get_coverage('SweGen', 'region', '22-46615715-46615880') + assert len(res['coverage']) == 17 + res = pgsql.get_coverage('SweGen', 'transcript', 'ENST00000438441') + assert len(res['coverage']) == 144 + + assert not pgsql.get_coverage('BAD_SET', 'transcript', 'ENST00000438441')['coverage'] + + +def test_get_coverage_pos(): + """ + Test get_coverage_pos() + """ + res = pgsql.get_coverage_pos('SweGen', 'gene', 'ENSG00000231565') + assert res['chrom'] == '22' + assert res['start'] == 16364817 + assert res['stop'] == 16366254 + res = pgsql.get_coverage_pos('SweGen', 'region', '22-46615715-46615880') + assert res['chrom'] == '22' + assert res['start'] == 46615715 + assert res['stop'] == 46615880 + res = pgsql.get_coverage_pos('SweGen', 'transcript', 'ENST00000438441') + assert res['chrom'] == '22' + assert res['start'] == 16364817 + assert res['stop'] == 16366254 + + res = pgsql.get_coverage_pos('BAD_SET', 'transcript', 'ENST00000438441') + for value in res.values(): + assert not value + + +def test_get_variant_list(): + """ + Test get_variant_list() + """ + res = pgsql.get_variant_list('SweGen', 'gene', 'ENSG00000231565') + assert len(res['variants']) == 405 + res = pgsql.get_variant_list('SweGen', 'region', '22-46615715-46615880') + assert len(res['variants']) == 3 + res = pgsql.get_variant_list('SweGen', 'transcript', 'ENST00000438441') + assert len(res['variants']) == 405 From 133b6d6404fec984590aa80e7a4ce52b7e0da1ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 13:30:49 +0100 Subject: [PATCH 061/170] utils updated to test chr 22 --- backend/modules/browser/tests/test_utils.py | 24 ++++++++++++--------- backend/modules/browser/utils.py | 2 ++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/backend/modules/browser/tests/test_utils.py b/backend/modules/browser/tests/test_utils.py index 96c80db98..aabfe1685 100644 --- a/backend/modules/browser/tests/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -11,11 +11,11 @@ def test_add_consequence_to_variants(): Test add_consequence_to_variants() """ variants = [] - variants.append(lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T')) - variants.append(lookups.get_variant('SweGen', 55500283, '1', 'A', 'T')) + variants.append(lookups.get_variant('SweGen', 38481311, '22', 'C', 'T')) + variants.append(lookups.get_variant('SweGen', 38480546, '22', 'TG', 'TGG')) utils.add_consequence_to_variants(variants) - assert variants[0]['major_consequence'] == 'intron_variant' + assert variants[0]['major_consequence'] == 'missense_variant' assert variants[1]['major_consequence'] == 'upstream_gene_variant' @@ -23,20 +23,24 @@ def test_add_consequence_to_variant(): """ Test add_consequence_to_variant() """ - variant = lookups.get_variant('SweGen', 47730411, '21', 'TA', 'T') + variant = lookups.get_variant('SweGen', 38481311, '22', 'C', 'T') utils.add_consequence_to_variant(variant) - assert variant['major_consequence'] == 'intron_variant' + assert variant['major_consequence'] == 'missense_variant' - variant2 = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') - utils.add_consequence_to_variant(variant2) - assert variant2['major_consequence'] == 'upstream_gene_variant' + variant = lookups.get_variant('SweGen', 38480546, '22', 'TG', 'TGG') + utils.add_consequence_to_variant(variant) + assert variant['major_consequence'] == 'upstream_gene_variant' + + # bad variant + variant = lookups.get_variant('SweGen', 38481311, '444', 'C', 'T') + assert not variant def test_annotation_severity(): """ Test annotation_severity() """ - variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') + variant = lookups.get_variant('SweGen', 38481311, '22', 'C', 'T') res = utils.annotation_severity(variant['vep_annotations'][0]) assert res == -26.9 @@ -143,7 +147,7 @@ def test_worst_csq_from_csq(): """ Test worst_csq_from_csq() """ - variant = lookups.get_variant('SweGen', 55500283, '1', 'A', 'T') + variant = lookups.get_variant('SweGen', 38481311, '22', 'C', 'T') res = utils.worst_csq_from_csq(variant['vep_annotations'][0]['Consequence']) assert res == 'upstream_gene_variant' res = utils.worst_csq_from_csq('non_coding_exon_variant&nc_transcript_variant') diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index c045f6536..79f63a180 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -92,6 +92,8 @@ def add_consequence_to_variant(variant): Args: variant (dict): variant information """ + if not variant: + return dict() worst_csq = worst_csq_with_vep(variant['vep_annotations']) variant['major_consequence'] = '' if worst_csq is None: From 816f84271c936656bfeb2bf409092bcad8385b7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 14:06:21 +0100 Subject: [PATCH 062/170] rsid != '.' in postgres --- backend/modules/browser/pgsql.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index 2289ad22d..f339b3687 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -123,8 +123,6 @@ def get_variant_list(dataset:str, datatype:str, item:str, ds_version:str=None): # Format output def format_variant(variant): - if variant['rsid'] == '.': - variant['rsid'] = '' variant['major_consequence'] = (variant['major_consequence'].replace('_variant','') .replace('_prime_', '\'') .replace('_', ' ')) From 9a9e5187977fb6bf07e04d936580887bdd87b0a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 14:13:19 +0100 Subject: [PATCH 063/170] increased coverage --- backend/modules/browser/lookups.py | 14 ++++---------- backend/modules/browser/tests/test_lookups.py | 17 ++++++++--------- backend/modules/browser/tests/test_utils.py | 1 + 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index e23cc2be8..37f3be895 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -138,7 +138,7 @@ def get_coverage_for_bases(dataset:str, chrom:str, start_pos:int, end_pos:int=No """ dataset_version = db.get_dataset_version(dataset, ds_version) if not dataset_version: - return None + return [] if end_pos is None: end_pos = start_pos @@ -171,7 +171,7 @@ def get_coverage_for_transcript(dataset:str, chrom:str, start_pos:int, end_pos:i # only return coverages that have coverage (if that makes any sense?) # return coverage_array if not coverage_array: - return None + return [] covered = [c for c in coverage_array if c['mean']] return covered @@ -189,7 +189,7 @@ def get_exons_in_transcript(dataset:str, transcript_id:str): """ ref_dbid = db.get_reference_dbid_dataset(dataset) if not ref_dbid: - logging.error('get_exons_in_transcript({}, {}): unable to find dataset dbid'.format(dataset, transcript_id)) + logging.info('get_exons_in_transcript({}, {}): unable to find dataset dbid'.format(dataset, transcript_id)) return None try: transcript = (db.Transcript @@ -199,7 +199,7 @@ def get_exons_in_transcript(dataset:str, transcript_id:str): (db.Gene.reference_set == ref_dbid)) .get()) except db.Transcript.DoesNotExist: - logging.error('get_exons_in_transcript({}, {}): unable to retrieve transcript'.format(dataset, transcript_id)) + logging.info('get_exons_in_transcript({}, {}): unable to retrieve transcript'.format(dataset, transcript_id)) return None wanted_types = ('CDS', 'UTR', 'exon') return sorted(list(db.Feature.select().where((db.Feature.transcript == transcript) & @@ -500,8 +500,6 @@ def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_ver .where(db.Dataset.short_name == dataset) .dicts() .get()) - if not refset: - return [] dbsnp_version = refset['dbsnp_version'] rsid_dbsnp = (db.DbSNP @@ -556,8 +554,6 @@ def get_variants_in_gene(dataset:str, gene_id:str, ds_version:str=None): if not ref_dbid: return None dataset_version = db.get_dataset_version(dataset, ds_version) - if not dataset_version: - return None gene = get_gene(dataset, gene_id) @@ -640,8 +636,6 @@ def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=No if not ref_dbid: return None dataset_version = db.get_dataset_version(dataset, ds_version) - if not dataset_version: - return None transcript = get_transcript(dataset, transcript_id) if not transcript: diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index 4ab9674d5..154729805 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -73,27 +73,27 @@ def test_get_coverage_for_transcript(): Test get_coverage_for_transcript() """ # normal - coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546423, 46549652) + coverage = lookups.get_coverage_for_transcript('SweGen', '22', 46546423, 46549652) assert len(coverage) == 323 assert coverage[0] == {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} - assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 + assert len(lookups.get_coverage_for_transcript('SweGen', '22', 46615715, 46615880)) == 17 # no end_pos - coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546430) + coverage = lookups.get_coverage_for_transcript('SweGen', '22', 46546430) assert coverage == [{'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430}] - assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 + assert len(lookups.get_coverage_for_transcript('SweGen', '22', 46615715, 46615880)) == 17 # no hits - coverage = lookups.get_coverage_for_bases('SweGen', '1', 55500283, 55500285) + coverage = lookups.get_coverage_for_transcript('SweGen', '1', 55500283, 55500285) assert not coverage # incorrect dataset - assert not lookups.get_coverage_for_bases('BAD_DATASET', '1', 55500283, 55500320) + assert not lookups.get_coverage_for_transcript('BAD_DATASET', '1', 55500283, 55500320) -def test_get_exons_in_transcript(caplog): +def test_get_exons_in_transcript(): """ Test get_exons_in_transcript() """ @@ -103,12 +103,10 @@ def test_get_exons_in_transcript(caplog): # bad dataset result = lookups.get_exons_in_transcript('NO_DATASET', 'ENST00000215855') assert not result - assert caplog.messages[0] == 'get_exons_in_transcript(NO_DATASET, ENST00000215855): unable to find dataset dbid' # bad transcript result = lookups.get_exons_in_transcript('SweGen', 'BAD_TRANSCRIPT') assert not result - assert caplog.messages[1] == 'get_exons_in_transcript(SweGen, BAD_TRANSCRIPT): unable to retrieve transcript' def test_get_gene(): @@ -359,3 +357,4 @@ def test_get_variants_in_transcript(): """ res = lookups.get_variants_in_transcript('SweGen', 'ENST00000452800') assert len(res) == 1174 + res = lookups.get_variants_in_transcript('BAD_DATASET', 'ENST00000452800') diff --git a/backend/modules/browser/tests/test_utils.py b/backend/modules/browser/tests/test_utils.py index aabfe1685..5e704505a 100644 --- a/backend/modules/browser/tests/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -33,6 +33,7 @@ def test_add_consequence_to_variant(): # bad variant variant = lookups.get_variant('SweGen', 38481311, '444', 'C', 'T') + utils.add_consequence_to_variant(variant) assert not variant From aeef79523fc305957b98ef223e738bb919c4b3d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 14:14:08 +0100 Subject: [PATCH 064/170] no longer needs json --- backend/modules/browser/lookups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 37f3be895..9555f77fa 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -1,4 +1,3 @@ -import json # remove when db is fixed import logging import re From ecec24c14572ad2ee49a7d0ea9ab49bfbb191d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 14:18:29 +0100 Subject: [PATCH 065/170] pylint fixes --- backend/modules/browser/pgsql.py | 3 --- backend/modules/browser/utils.py | 26 +++++++++++++------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/backend/modules/browser/pgsql.py b/backend/modules/browser/pgsql.py index f339b3687..94aa9c3ad 100644 --- a/backend/modules/browser/pgsql.py +++ b/backend/modules/browser/pgsql.py @@ -1,9 +1,6 @@ """ Replaces mongodb.py """ - -import logging - import db from . import lookups diff --git a/backend/modules/browser/utils.py b/backend/modules/browser/utils.py index 79f63a180..73e5e7a5a 100644 --- a/backend/modules/browser/utils.py +++ b/backend/modules/browser/utils.py @@ -74,7 +74,7 @@ } -def add_consequence_to_variants(variant_list): +def add_consequence_to_variants(variant_list:list): """ Add information about variant consequence to multiple variants @@ -85,7 +85,7 @@ def add_consequence_to_variants(variant_list): add_consequence_to_variant(variant) -def add_consequence_to_variant(variant): +def add_consequence_to_variant(variant:dict): """ Add information about variant consequence to a variant @@ -93,7 +93,7 @@ def add_consequence_to_variant(variant): variant (dict): variant information """ if not variant: - return dict() + return worst_csq = worst_csq_with_vep(variant['vep_annotations']) variant['major_consequence'] = '' if worst_csq is None: @@ -121,7 +121,7 @@ def add_consequence_to_variant(variant): variant['flags'] = get_flags_from_variant(variant) -def annotation_severity(annotation): +def annotation_severity(annotation:dict): """ Evaluate severity of the consequences; "bigger is more important". @@ -137,7 +137,7 @@ def annotation_severity(annotation): return rv -def get_flags_from_variant(variant): +def get_flags_from_variant(variant:dict): """ Get flags from variant. Checks for: @@ -163,7 +163,7 @@ def get_flags_from_variant(variant): return flags -def get_proper_hgvs(annotation): +def get_proper_hgvs(annotation:dict): """ Get HGVS for change, either at transcript or protein level. @@ -204,7 +204,7 @@ def get_protein_hgvs(annotation): return None -def get_transcript_hgvs(annotation): +def get_transcript_hgvs(annotation:dict): """ Nucleotide change in HGVS format. @@ -220,7 +220,7 @@ def get_transcript_hgvs(annotation): return None -def order_vep_by_csq(annotation_list: list): +def order_vep_by_csq(annotation_list:list): """ Adds "major_consequence" to each annotation, orders by severity. @@ -238,7 +238,7 @@ def order_vep_by_csq(annotation_list: list): return sorted(annotation_list, key=(lambda ann:CSQ_ORDER_DICT[ann['major_consequence']])) -def remove_extraneous_vep_annotations(annotation_list: list): +def remove_extraneous_vep_annotations(annotation_list:list): """ Remove annotations with low-impact consequences (less than intron variant) @@ -252,7 +252,7 @@ def remove_extraneous_vep_annotations(annotation_list: list): if worst_csq_index(ann['Consequence'].split('&')) <= CSQ_ORDER_DICT['intron_variant']] -def worst_csq_from_list(csq_list): +def worst_csq_from_list(csq_list:list): """ Choose the worst consequence @@ -265,7 +265,7 @@ def worst_csq_from_list(csq_list): return REV_CSQ_ORDER_DICT[worst_csq_index(csq_list)] -def worst_csq_from_csq(csq): +def worst_csq_from_csq(csq:str): """ Find worst consequence in a possibly &-filled consequence string @@ -278,7 +278,7 @@ def worst_csq_from_csq(csq): return REV_CSQ_ORDER_DICT[worst_csq_index(csq.split('&'))] -def worst_csq_index(csq_list): +def worst_csq_index(csq_list:list): """ Find the index of the worst consequence. Corresponds to the lowest value (index) from CSQ_ORDER_DICT @@ -292,7 +292,7 @@ def worst_csq_index(csq_list): return min([CSQ_ORDER_DICT[csq] for csq in csq_list]) -def worst_csq_with_vep(annotation_list): +def worst_csq_with_vep(annotation_list:list): """ Choose the vep annotation with the most severe consequence Adds a"major_consequence" field for that annotation From 0d202c20ab6b59025569af01b5eda6f8721a3152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 19 Feb 2019 14:21:11 +0100 Subject: [PATCH 066/170] pylint fixes for tests --- backend/modules/browser/tests/test_lookups.py | 3 +-- backend/modules/browser/tests/test_pgsql.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index 154729805..4b1d1ef9f 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -13,7 +13,6 @@ def test_add_rsid_to_variant(): lookups.add_rsid_to_variant('SweGen', variant) assert variant['rsid'] == 'rs924645261' variant = lookups.get_variant('SweGen', 16113980, '22', 'C', 'T') - rsid = variant['rsid'] variant['rsid'] = '' lookups.add_rsid_to_variant('SweGen', variant) assert variant['rsid'] == 'rs9680543' @@ -122,7 +121,7 @@ def test_get_gene(): 'start': 19237396, 'stop': 19237489, 'strand': '+'} - + result = lookups.get_gene('SweGen', 'ENSG00000251940') for val in expected: assert result[val] == expected[val] diff --git a/backend/modules/browser/tests/test_pgsql.py b/backend/modules/browser/tests/test_pgsql.py index ac0014602..34e41baf4 100644 --- a/backend/modules/browser/tests/test_pgsql.py +++ b/backend/modules/browser/tests/test_pgsql.py @@ -14,7 +14,7 @@ def test_get_autocomplete(): 'ADH5', 'ADH6', 'ADH7', 'ADH5P2', 'ADH5P3', 'ADH5P4', 'ADHFE1']) assert set(res) == expected - + def test_get_coverage(): """ @@ -37,7 +37,7 @@ def test_get_coverage_pos(): res = pgsql.get_coverage_pos('SweGen', 'gene', 'ENSG00000231565') assert res['chrom'] == '22' assert res['start'] == 16364817 - assert res['stop'] == 16366254 + assert res['stop'] == 16366254 res = pgsql.get_coverage_pos('SweGen', 'region', '22-46615715-46615880') assert res['chrom'] == '22' assert res['start'] == 46615715 From 8d9a45e587e40e7528c99272e30086f33e7148da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 21 Feb 2019 14:10:44 +0100 Subject: [PATCH 067/170] start of api testing for the handlers using requests --- .../browser/tests/test_browser_handlers.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 backend/modules/browser/tests/test_browser_handlers.py diff --git a/backend/modules/browser/tests/test_browser_handlers.py b/backend/modules/browser/tests/test_browser_handlers.py new file mode 100644 index 000000000..4bfba7ed2 --- /dev/null +++ b/backend/modules/browser/tests/test_browser_handlers.py @@ -0,0 +1,18 @@ +""" +Test the browser handlers +""" + +import requests + +BASE_URL="http://localhost:4000" + +def test_get_gene(): + """ + Test GetGene.get() + """ + dataset = 'SweGen' + gene = 'ENSG00000172955' + response = requests.get('{}/api/datasets/{}/browser/gene/{}'.format(BASE_URL, dataset, gene)) + expected = '{"gene": {"id": 13918, "referenceSet": 1, "geneId": "ENSG00000172955", "name": "ADH6", "fullName": "alcohol dehydrogenase 6 (class V)", "canonicalTranscript": "ENST00000394899", "chrom": "4", "start": 100123795, "stop": 100140694, "strand": "-", "variants": null, "geneName": "ADH6", "fullGeneName": "alcohol dehydrogenase 6 (class V)"}, "exons": [{"start": 100123796, "stop": 100125400, "type": "exon"}, {"start": 100123796, "stop": 100125378, "type": "UTR"}, {"start": 100125379, "stop": 100125400, "type": "CDS"}, {"start": 100126082, "stop": 100126220, "type": "exon"}, {"start": 100126082, "stop": 100126220, "type": "CDS"}, {"start": 100128603, "stop": 100128738, "type": "exon"}, {"start": 100128603, "stop": 100128738, "type": "CDS"}, {"start": 100129825, "stop": 100130085, "type": "exon"}, {"start": 100129825, "stop": 100130085, "type": "CDS"}, {"start": 100131239, "stop": 100131455, "type": "exon"}, {"start": 100131239, "stop": 100131455, "type": "CDS"}, {"start": 100131572, "stop": 100131659, "type": "exon"}, {"start": 100131572, "stop": 100131659, "type": "CDS"}, {"start": 100134763, "stop": 100134904, "type": "exon"}, {"start": 100134763, "stop": 100134904, "type": "CDS"}, {"start": 100137318, "stop": 100137419, "type": "exon"}, {"start": 100137318, "stop": 100137419, "type": "CDS"}, {"start": 100140292, "stop": 100140403, "type": "exon"}, {"start": 100140292, "stop": 100140309, "type": "CDS"}, {"start": 100140310, "stop": 100140403, "type": "UTR"}], "transcripts": [{"transcriptId": "ENST00000394897"}, {"transcriptId": "ENST00000394899"}, {"transcriptId": "ENST00000512708"}, {"transcriptId": "ENST00000507484"}, {"transcriptId": "ENST00000407820"}, {"transcriptId": "ENST00000237653"}, {"transcriptId": "ENST00000508558"}, {"transcriptId": "ENST00000504257"}, {"transcriptId": "ENST00000513262"}]}' + assert response.text == expected + From 3c0f69503a223c7578a35597ad8390acdbf3591f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 21 Feb 2019 14:56:44 +0100 Subject: [PATCH 068/170] skeleton for all tests, initial test for region and transcript --- .../browser/tests/test_browser_handlers.py | 86 ++++++++++++++++++- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/backend/modules/browser/tests/test_browser_handlers.py b/backend/modules/browser/tests/test_browser_handlers.py index 4bfba7ed2..f25a81f62 100644 --- a/backend/modules/browser/tests/test_browser_handlers.py +++ b/backend/modules/browser/tests/test_browser_handlers.py @@ -3,16 +3,94 @@ """ import requests +import json BASE_URL="http://localhost:4000" +def test_get_autocomplete(): + """ + Test GetAutocomplete.get() + """ + assert False + + +def test_download(): + """ + Test GetCoveragePos.get() + """ + assert False + + +def test_get_coverage(): + """ + Test GetCoverage.get() + """ + assert False + + +def test_get_coverage_pos(): + """ + Test GetCoveragePos.get() + """ + assert False + + def test_get_gene(): """ Test GetGene.get() """ dataset = 'SweGen' - gene = 'ENSG00000172955' - response = requests.get('{}/api/datasets/{}/browser/gene/{}'.format(BASE_URL, dataset, gene)) - expected = '{"gene": {"id": 13918, "referenceSet": 1, "geneId": "ENSG00000172955", "name": "ADH6", "fullName": "alcohol dehydrogenase 6 (class V)", "canonicalTranscript": "ENST00000394899", "chrom": "4", "start": 100123795, "stop": 100140694, "strand": "-", "variants": null, "geneName": "ADH6", "fullGeneName": "alcohol dehydrogenase 6 (class V)"}, "exons": [{"start": 100123796, "stop": 100125400, "type": "exon"}, {"start": 100123796, "stop": 100125378, "type": "UTR"}, {"start": 100125379, "stop": 100125400, "type": "CDS"}, {"start": 100126082, "stop": 100126220, "type": "exon"}, {"start": 100126082, "stop": 100126220, "type": "CDS"}, {"start": 100128603, "stop": 100128738, "type": "exon"}, {"start": 100128603, "stop": 100128738, "type": "CDS"}, {"start": 100129825, "stop": 100130085, "type": "exon"}, {"start": 100129825, "stop": 100130085, "type": "CDS"}, {"start": 100131239, "stop": 100131455, "type": "exon"}, {"start": 100131239, "stop": 100131455, "type": "CDS"}, {"start": 100131572, "stop": 100131659, "type": "exon"}, {"start": 100131572, "stop": 100131659, "type": "CDS"}, {"start": 100134763, "stop": 100134904, "type": "exon"}, {"start": 100134763, "stop": 100134904, "type": "CDS"}, {"start": 100137318, "stop": 100137419, "type": "exon"}, {"start": 100137318, "stop": 100137419, "type": "CDS"}, {"start": 100140292, "stop": 100140403, "type": "exon"}, {"start": 100140292, "stop": 100140309, "type": "CDS"}, {"start": 100140310, "stop": 100140403, "type": "UTR"}], "transcripts": [{"transcriptId": "ENST00000394897"}, {"transcriptId": "ENST00000394899"}, {"transcriptId": "ENST00000512708"}, {"transcriptId": "ENST00000507484"}, {"transcriptId": "ENST00000407820"}, {"transcriptId": "ENST00000237653"}, {"transcriptId": "ENST00000508558"}, {"transcriptId": "ENST00000504257"}, {"transcriptId": "ENST00000513262"}]}' - assert response.text == expected + gene_id = 'ENSG00000015475' + response = requests.get('{}/api/datasets/{}/browser/gene/{}'.format(BASE_URL, dataset, gene_id)) + expected = {"name": "BID", "canonicalTranscript": "ENST00000317361", "chrom": "22", "strand": "-", "geneName": "BID"} + gene = json.loads(response.text) + + for value in expected: + assert gene['gene'][value] == expected[value] + assert len(gene['exons']) == 14 + assert len(gene['transcripts']) == 10 + +def test_get_region(): + """ + Test GetRegion.get() + """ + dataset = 'SweGen' + region_def = '22-46615715-46615880' + response = requests.get('{}/api/datasets/{}/browser/region/{}'.format(BASE_URL, dataset, region_def)) + region = json.loads(response.text) + assert region == {'region': {'chrom': '22', 'start': 46615715, 'stop': 46615880, 'limit': 100000}} + + +def test_get_transcript(): + """ + Test GetTranscript.get() + """ + dataset = 'SweGen' + transcript_id = 'ENST00000317361' + response = requests.get('{}/api/datasets/{}/browser/transcript/{}'.format(BASE_URL, dataset, transcript_id)) + transcript = json.loads(response.text) + + assert transcript['gene']['id'] == 'ENSG00000015475' + assert len(transcript['exons']) == 14 + + +def test_get_variant(): + """ + Test GetVariant.get() + """ + assert False + + +def test_get_variants(): + """ + Test GetVariants.get() + """ + assert False + + +def test_searhc(): + """ + Test Search.get() + """ + assert False From cf3b5f45f4e58a1a4b871fa1fdfd9123d6ce0f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 08:50:23 +0100 Subject: [PATCH 069/170] added version to browser requests in frontend --- .../src/js/controller.browserController.js | 20 ++++---- frontend/src/js/factory.browser.js | 47 +++++++++++-------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/frontend/src/js/controller.browserController.js b/frontend/src/js/controller.browserController.js index 6537e6c4c..95986c814 100644 --- a/frontend/src/js/controller.browserController.js +++ b/frontend/src/js/controller.browserController.js @@ -54,7 +54,7 @@ if ($routeParams.transcript) { localThis.itemType = "transcript"; localThis.item = $routeParams.transcript; - Browser.getTranscript($routeParams.dataset, $routeParams.transcript).then( function(data) { + Browser.getTranscript($routeParams.dataset, $routeParams.version, $routeParams.transcript).then( function(data) { localThis.transcript = data.transcript; localThis.gene = data.gene; localThis.coverage.region.exons = data.exons; @@ -63,21 +63,21 @@ if ($routeParams.region) { localThis.itemType = "region"; localThis.item = $routeParams.region; - Browser.getRegion($routeParams.dataset, $routeParams.region).then( function(data) { + Browser.getRegion($routeParams.dataset, $routeParams.version, $routeParams.region).then( function(data) { localThis.region = data.region; }); } if ($routeParams.gene) { localThis.itemType = "gene"; localThis.item = $routeParams.gene; - Browser.getGene($routeParams.dataset, $routeParams.gene).then( function(data) { + Browser.getGene($routeParams.dataset, $routeParams.version, $routeParams.gene).then( function(data) { localThis.gene = data.gene; localThis.transcripts = data.transcripts; localThis.coverage.region.exons = data.exons; }); } if (localThis.itemType) { - Browser.getVariants($routeParams.dataset, localThis.itemType, localThis.item).then( function(data) { + Browser.getVariants($routeParams.dataset, $routeParams.version, localThis.itemType, localThis.item).then( function(data) { localThis.variants = data.variants; localThis.headers = data.headers; @@ -92,12 +92,12 @@ localThis.filterVariants(); }); - Browser.getCoveragePos($routeParams.dataset, localThis.itemType, localThis.item).then( function(data) { + Browser.getCoveragePos($routeParams.dataset, $routeParams.version, localThis.itemType, localThis.item).then( function(data) { localThis.coverage.region.start = data.start; localThis.coverage.region.stop = data.stop; localThis.coverage.region.chrom = data.chrom; }); - Browser.getCoverage($routeParams.dataset, localThis.itemType, localThis.item).then(function(data) { + Browser.getCoverage($routeParams.dataset, $routeParams.version, localThis.itemType, localThis.item).then(function(data) { localThis.coverage.data = data.coverage; localThis.coverage.loaded = true; }, function() { @@ -105,11 +105,11 @@ }); } if ($routeParams.variant) { - Browser.getVariant($routeParams.dataset, $routeParams.variant).then( function(data) { + Browser.getVariant($routeParams.dataset, $routeParams.version, $routeParams.variant).then( function(data) { localThis.variant = data.variant; }); } - Dataset.getDataset($routeParams.dataset, $routeParams.version) + Dataset.getDataset($routeParams.dataset, $routeParams.version, $routeParams.version) .then(function(data) { localThis.dataset = data.dataset; }, @@ -131,7 +131,7 @@ localThis.query = query; } if (localThis.query) { - Browser.search($routeParams.dataset, localThis.query).then( function(data) { + Browser.search($routeParams.dataset, $routeParams.version, localThis.query).then( function(data) { var url = browserLink(`${data.type}/${data.value}`); if ( data.type == "error" || data.type == "not_found" ) { @@ -145,7 +145,7 @@ function autocomplete() { localThis.activeSuggestion = -1; if (localThis.query) { - Browser.autocomplete($routeParams.dataset, localThis.query) + Browser.autocomplete($routeParams.dataset, $routeParams.version, localThis.query) .then( function(data) { localThis.suggestions = data.values; }); diff --git a/frontend/src/js/factory.browser.js b/frontend/src/js/factory.browser.js index 66959bcd0..7c125bee2 100644 --- a/frontend/src/js/factory.browser.js +++ b/frontend/src/js/factory.browser.js @@ -13,56 +13,65 @@ getCoveragePos:getCoveragePos, }; - function getGene(dataset, gene) { - return $http.get("/api/datasets/" + dataset + "/browser/gene/" + gene).then(function(data) { + function baseUrl(dataset, version) { + var url = "/api/datasets/" + dataset + "/"; + if ( version ) { + url += "version/" + version + "/" + } + url += 'browser/'; + return url; + } + + function getGene(dataset, version, gene) { + return $http.get(baseUrl(dataset, version) + "/gene/" + gene).then(function(data) { return data.data; }); } - function getRegion(dataset, region) { - return $http.get("/api/datasets/" + dataset + "/browser/region/" + region).then(function(data) { + function getRegion(dataset, version, region) { + return $http.get(baseUrl(dataset, version) + "/region/" + region).then(function(data) { return data.data; }); } - function getTranscript(dataset, transcript) { - return $http.get("/api/datasets/" + dataset + "/browser/transcript/" + transcript).then(function(data) { + function getTranscript(dataset, version, transcript) { + return $http.get(baseUrl(dataset, version) + "/transcript/" + transcript).then(function(data) { return data.data; }); } - function getVariant(dataset, variant) { - return $http.get("/api/datasets/" + dataset + "/browser/variant/" + variant).then(function(data) { + function getVariant(dataset, version, variant) { + return $http.get(baseUrl(dataset, version) + "variant/" + variant).then(function(data) { return data.data; - }); + }); } - function search(dataset, query) { - return $http.get("/api/datasets/" + dataset + "/browser/search/" + query).then(function(data) { + function search(dataset, version, query) { + return $http.get(baseUrl(dataset, version) + "/search/" + query).then(function(data) { return data.data; }); } - function autocomplete(dataset, query) { - return $http.get("/api/datasets/" + dataset + "/browser/autocomplete/" + query).then(function(data) { + function autocomplete(dataset, version, query) { + return $http.get(baseUrl(dataset, version) + "/autocomplete/" + query).then(function(data) { return data.data; }); } - function getVariants(dataset, datatype, item) { - return $http.get("api/datasets/" + dataset + "/browser/variants/" + datatype + "/" + item).then(function(data) { + function getVariants(dataset, version, datatype, item) { + return $http.get(baseUrl(dataset, version) + "/variants/" + datatype + "/" + item).then(function(data) { return data.data; }); } - function getCoverage(dataset, datatype, item) { - return $http.get("api/datasets/" + dataset + "/browser/coverage/" + datatype + "/" + item).then(function(data) { + function getCoverage(dataset, version, datatype, item) { + return $http.get(baseUrl(dataset, version) + "/coverage/" + datatype + "/" + item).then(function(data) { return data.data; }); } - function getCoveragePos(dataset, datatype, item) { - return $http.get("api/datasets/" + dataset + "/browser/coverage_pos/" + datatype + "/" + item).then(function(data) { + function getCoveragePos(dataset, version, datatype, item) { + return $http.get(baseUrl(dataset, version) + "/coverage_pos/" + datatype + "/" + item).then(function(data) { return data.data; }); } From 2fefda85f09c9f857417ea78520ccc72a17cb7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 09:53:58 +0100 Subject: [PATCH 070/170] updated tests with version --- backend/modules/browser/tests/.coveragerc | 12 ++++++++++- backend/modules/browser/tests/test_lookups.py | 20 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/backend/modules/browser/tests/.coveragerc b/backend/modules/browser/tests/.coveragerc index fc2753f56..cea6a995c 100644 --- a/backend/modules/browser/tests/.coveragerc +++ b/backend/modules/browser/tests/.coveragerc @@ -2,4 +2,14 @@ omit = # omit anything in a .local directory anywhere */tests/* - */__init__.py \ No newline at end of file + */__init__.py + */venv/* + */virtualenv/* + +[report] +omit = + # omit anything in a .local directory anywhere + */tests/* + */__init__.py + */venv/* + */virtualenv/* diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index 4b1d1ef9f..cb8cefb5e 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -282,12 +282,21 @@ def test_get_variant(): Test get_variant() """ result = lookups.get_variant('SweGen', 16057464, '22', 'G', 'A') + assert result['variant_id'] == '22-16057464-G-A' assert result['genes'] == ['ENSG00000233866'] assert result['transcripts'] == ['ENST00000424770'] + result = lookups.get_variant('SweGen', 9435852, '21', 'T', 'C') + assert not result # incorrect position assert not lookups.get_variant('SweGen', -1, '1', 'A', 'T') + # with version + result = lookups.get_variant('SweGen', 16057464, '22', 'G', 'A', "20161223") + assert not result + result = lookups.get_variant('SweGen', 9435852, '21', 'T', 'C', "20161223") + assert result['variant_id'] == '21-9435852-T-C' + def test_get_variants_by_rsid(caplog): ''' @@ -299,6 +308,11 @@ def test_get_variants_by_rsid(caplog): assert set(result[0]['genes']) == set(['ENSG00000100156', 'ENSG00000128298', 'ENSG00000272720']) assert len(result[0]['genes']) == 3 assert len(result[0]['transcripts']) == 6 + assert not lookups.get_variants_by_rsid('SweGen', 'rs76676778') + # with version + assert not lookups.get_variants_by_rsid('SweGen', 'rs185758992', '20161223') + result = lookups.get_variants_by_rsid('SweGen', 'rs76676778', '20161223') + assert result[0]['variant_id'] == '21-9411609-G-T' # by position result = lookups.get_variants_by_rsid('SweGen', 'rs185758992', check_position=True) @@ -306,9 +320,15 @@ def test_get_variants_by_rsid(caplog): assert set(result[0]['genes']) == set(['ENSG00000100156', 'ENSG00000128298', 'ENSG00000272720']) assert len(result[0]['genes']) == 3 assert len(result[0]['transcripts']) == 6 + assert not lookups.get_variants_by_rsid('SweGen', 'rs76676778', check_position=True) + # with version + assert not lookups.get_variants_by_rsid('SweGen', 'rs185758992', '20161223', check_position=True) + result = lookups.get_variants_by_rsid('SweGen', 'rs76676778', '20161223', check_position=True) + assert result[0]['variant_id'] == '21-9411609-G-T' # errors assert lookups.get_variants_by_rsid('incorrect_name', 'rs373706802') is None + assert lookups.get_variants_by_rsid('SweGen', 'rs37356766700', check_position=True) is None assert lookups.get_variants_by_rsid('SweGen', '373706802') is None assert lookups.get_variants_by_rsid('SweGen', 'rs3737o68o2') is None From b19a54ccde4ecc9472dca36987253c7536a5ac17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 09:54:25 +0100 Subject: [PATCH 071/170] fix for exception due to rsid not in db --- backend/modules/browser/lookups.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 9555f77fa..adc4f2a52 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -465,7 +465,7 @@ def get_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_version:st return variant -def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_version:str=None): +def get_variants_by_rsid(dataset:str, rsid:str, ds_version:str=None, check_position:str=False): """ Retrieve variants by their associated rsid May also look up rsid and search for variants at the position @@ -500,13 +500,16 @@ def get_variants_by_rsid(dataset:str, rsid:str, check_position:str=False, ds_ver .dicts() .get()) dbsnp_version = refset['dbsnp_version'] - - rsid_dbsnp = (db.DbSNP - .select() - .where((db.DbSNP.rsid == rsid) & - (db.DbSNP.version_id == dbsnp_version)) - .dicts() - .get()) + try: + rsid_dbsnp = (db.DbSNP + .select() + .where((db.DbSNP.rsid == rsid) & + (db.DbSNP.version_id == dbsnp_version)) + .dicts() + .get()) + except db.DbSNP.DoesNotExist: + logging.error('get_variants_by_rsid({}, {}): rsid not in dbsnp'.format(dataset, rsid)) + return None query = (db.Variant .select() .where((db.Variant.pos == rsid_dbsnp['pos']) & From 19102cd58b10b29238fee78147db1fd5e1993a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 09:56:19 +0100 Subject: [PATCH 072/170] functions in alphabetical order; variant call using version as well --- backend/modules/browser/browser_handlers.py | 42 +++++++++++---------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 7b5b2b17f..4e32ca83c 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -1,3 +1,7 @@ +""" +Request handlers for the browser +""" + import logging import handlers @@ -18,24 +22,6 @@ def get(self, dataset, query, ds_version=None): results = pgsql.get_autocomplete(query) ret = {'values': sorted(list(set(results)))[:20]} - self.finish( ret ) - - -class GetCoverage(handlers.UnsafeHandler): - """ - Retrieve coverage - """ - def get(self, dataset, datatype, item, ds_version=None): - ret = pgsql.get_coverage(dataset, datatype, item, ds_version) - self.finish(ret) - - -class GetCoveragePos(handlers.UnsafeHandler): - """ - Retrieve coverage range - """ - def get(self, dataset, datatype, item, ds_version=None): - ret = pgsql.get_coverage_pos(dataset, datatype, item) self.finish(ret) @@ -63,6 +49,24 @@ def get(self, dataset: str, datatype, item, ds_version=None): self.write(','.join(map(str, [variant[h] for h in headers])) + '\n') +class GetCoverage(handlers.UnsafeHandler): + """ + Retrieve coverage + """ + def get(self, dataset, datatype, item, ds_version=None): + ret = pgsql.get_coverage(dataset, datatype, item, ds_version) + self.finish(ret) + + +class GetCoveragePos(handlers.UnsafeHandler): + """ + Retrieve coverage range + """ + def get(self, dataset, datatype, item, ds_version=None): + ret = pgsql.get_coverage_pos(dataset, datatype, item) + self.finish(ret) + + class GetGene(handlers.UnsafeHandler): """ Request information about a gene @@ -232,7 +236,7 @@ def get(self, dataset, variant, ds_version=None): self.set_user_msg('Unable to parse variant', 'error') return orig_variant = variant - variant = lookups.get_variant(dataset, v[1], v[0], v[2], v[3]) + variant = lookups.get_variant(dataset, v[1], v[0], v[2], v[3], ds_version) if not variant: logging.error('Variant not found ({})'.format(orig_variant)) From efa91e5e87b18409a8e5a3dc15d5e89499bb3ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 12:13:19 +0100 Subject: [PATCH 073/170] updated coveragerc --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index 8db5982de..fd89de1f0 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,4 +2,4 @@ omit = /usr/local/*,/home/travis/virtualenv/* [report] -omit = /usr/local/*,/home/travis/virtualenv/* +omit = /usr/local/*,/home/travis/virtualenv/*, */__init.py__, */test*.py From 185b1274f26d487ab7e6ce334c0e13607ee4ff1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 12:48:34 +0100 Subject: [PATCH 074/170] function parameter type hints added --- backend/modules/browser/browser_handlers.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index 4e32ca83c..bb9dc7b62 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -16,7 +16,7 @@ REGION_LIMIT = 100000 class Autocomplete(handlers.UnsafeHandler): - def get(self, dataset, query, ds_version=None): + def get(self, dataset:str, query:str, ds_version:str=None): ret = {} results = pgsql.get_autocomplete(query) @@ -26,7 +26,7 @@ def get(self, dataset, query, ds_version=None): class Download(handlers.UnsafeHandler): - def get(self, dataset: str, datatype, item, ds_version=None): + def get(self, dataset:str, datatype:str, item:str, ds_version:str=None): """ Download variants as csv @@ -53,7 +53,7 @@ class GetCoverage(handlers.UnsafeHandler): """ Retrieve coverage """ - def get(self, dataset, datatype, item, ds_version=None): + def get(self, dataset:str, datatype:str, item:str, ds_version:str=None): ret = pgsql.get_coverage(dataset, datatype, item, ds_version) self.finish(ret) @@ -62,7 +62,7 @@ class GetCoveragePos(handlers.UnsafeHandler): """ Retrieve coverage range """ - def get(self, dataset, datatype, item, ds_version=None): + def get(self, dataset:str, datatype:str, item:str, ds_version:str=None): ret = pgsql.get_coverage_pos(dataset, datatype, item) self.finish(ret) @@ -71,7 +71,7 @@ class GetGene(handlers.UnsafeHandler): """ Request information about a gene """ - def get(self, dataset, gene, ds_version=None): + def get(self, dataset:str, gene:str, ds_version:str=None): """ Request information about a gene @@ -116,7 +116,7 @@ class GetRegion(handlers.UnsafeHandler): """ Request information about genes in a region """ - def get(self, dataset, region, ds_version=None): + def get(self, dataset:str, region:str, ds_version:str=None): """ Request information about genes in a region @@ -173,7 +173,7 @@ class GetTranscript(handlers.UnsafeHandler): """ Request information about a transcript """ - def get(self, dataset, transcript, ds_version=None): + def get(self, dataset:str, transcript:str, ds_version:str=None): """ Request information about a transcript @@ -216,7 +216,7 @@ class GetVariant(handlers.UnsafeHandler): """ Request information about a gene """ - def get(self, dataset, variant, ds_version=None): + def get(self, dataset:str, variant:str, ds_version:str=None): """ Request information about a gene @@ -316,7 +316,7 @@ class GetVariants(handlers.UnsafeHandler): """ Retrieve variants """ - def get(self, dataset, datatype, item, ds_version=None): + def get(self, dataset:str, datatype:str, item:str, ds_version:str=None): """ Retrieve variants @@ -339,7 +339,7 @@ class Search(handlers.UnsafeHandler): """ Perform a search for the wanted object """ - def get(self, dataset, query, ds_version=None): + def get(self, dataset:str, query:str, ds_version:str=None): """ Perform a search for the wanted object From 36f16dc2bdbf5af1605002211c8f292b4b8d1575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 14:19:00 +0100 Subject: [PATCH 075/170] updates to test to limit everything to chromosome 21 and 22 --- .coveragerc | 4 +- .../browser/tests/test_browser_handlers.py | 89 ++++++++++++++++--- backend/modules/browser/tests/test_lookups.py | 57 ++++++------ backend/modules/browser/tests/test_pgsql.py | 7 +- 4 files changed, 115 insertions(+), 42 deletions(-) diff --git a/.coveragerc b/.coveragerc index fd89de1f0..c18574f4f 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,5 @@ [run] -omit = /usr/local/*,/home/travis/virtualenv/* +omit = /usr/local/*,/home/travis/virtualenv/*,*venv* [report] -omit = /usr/local/*,/home/travis/virtualenv/*, */__init.py__, */test*.py +omit = /usr/local/*,/home/travis/virtualenv/*, */__init.py__, */test*.py, *venv* diff --git a/backend/modules/browser/tests/test_browser_handlers.py b/backend/modules/browser/tests/test_browser_handlers.py index f25a81f62..302914f4f 100644 --- a/backend/modules/browser/tests/test_browser_handlers.py +++ b/backend/modules/browser/tests/test_browser_handlers.py @@ -5,34 +5,58 @@ import requests import json -BASE_URL="http://localhost:4000" +BASE_URL="http://localhost:4001" def test_get_autocomplete(): """ Test GetAutocomplete.get() """ - assert False + dataset = 'SweGen' + + query = 'PA' + response = requests.get('{}/api/datasets/{}/browser/autocomplete/{}'.format(BASE_URL, dataset, query)) + data = json.loads(response.text) + assert set(data["values"]) == set(["PABPC1P9", "PACSIN2", "PANX2", "PARP4P3", + "PARVB", "PARVG", "PATZ1", "PAXBP1", "PAXBP1-AS1"]) def test_download(): """ - Test GetCoveragePos.get() + Test Download.get() """ - assert False + dataset = 'SweGen' + + data_type = 'transcript' + data_item = 'ENST00000438441' + response = requests.get('{}/api/datasets/{}/browser/download/{}/{}'.format(BASE_URL, dataset, data_type, data_item)) + assert len(response.text.split('\n')) == 407 def test_get_coverage(): """ Test GetCoverage.get() """ - assert False + dataset = 'SweGen' + + data_type = 'transcript' + data_item = 'ENST00000438441' + response = requests.get('{}/api/datasets/{}/browser/coverage/{}/{}'.format(BASE_URL, dataset, data_type, data_item)) + data = json.loads(response.text) + assert len(data['coverage']) == 144 def test_get_coverage_pos(): """ Test GetCoveragePos.get() """ - assert False + dataset = 'SweGen' + data_type = 'region' + data_item = '22-100001-100101' + response = requests.get('{}/api/datasets/{}/browser/coverage_pos/{}/{}'.format(BASE_URL, dataset, data_type, data_item)) + cov_pos = json.loads(response.text) + assert cov_pos['start'] == 100001 + assert cov_pos['stop'] == 100101 + assert cov_pos['chrom'] == '22' def test_get_gene(): @@ -50,6 +74,12 @@ def test_get_gene(): assert len(gene['exons']) == 14 assert len(gene['transcripts']) == 10 + dataset = 'SweGen' + gene_id = 'ENSG00000015475' + response = requests.get('{}/api/datasets/{}/browser/gene/{}'.format(BASE_URL, dataset, gene_id)) + expected = {"name": "BID", "canonicalTranscript": "ENST00000317361", "chrom": "22", "strand": "-", "geneName": "BID"} + gene = json.loads(response.text) + def test_get_region(): """ @@ -79,18 +109,57 @@ def test_get_variant(): """ Test GetVariant.get() """ - assert False + dataset = 'SweGen' + variant_id = '22-16057464-G-A' + response = requests.get('{}/api/datasets/{}/browser/variant/{}'.format(BASE_URL, dataset, variant_id)) + variant = json.loads(response.text) + + assert variant['variant']['variantId'] == '22-16057464-G-A' + assert variant['variant']['genes'] == ['ENSG00000233866'] + assert variant['variant']['transcripts'] == ['ENST00000424770'] + + variant_id = '21-9435852-T-C' + version = '20161223' + response = requests.get('{}/api/datasets/{}/browser/variant/{}'.format(BASE_URL, dataset, variant_id)) + assert response.status_code == 404 + response = requests.get('{}/api/datasets/{}/version/{}/browser/variant/{}'.format(BASE_URL, dataset, version, variant_id)) + variant = json.loads(response.text) + assert variant['variant']['variantId'] == '21-9435852-T-C' def test_get_variants(): """ Test GetVariants.get() """ - assert False + dataset = 'SweGen' + data_type = 'gene' + data_item = 'ENSG00000231565' + response = requests.get('{}/api/datasets/{}/browser/variants/{}/{}'.format(BASE_URL, dataset, data_type, data_item)) + data = json.loads(response.text) + assert len(data['variants']) == 405 -def test_searhc(): + data_type = 'region' + data_item = '22-46615715-46615880' + response = requests.get('{}/api/datasets/{}/browser/variants/{}/{}'.format(BASE_URL, dataset, data_type, data_item)) + data = json.loads(response.text) + assert len(data['variants']) == 3 + + data_type = 'transcript' + data_item = 'ENST00000438441' + response = requests.get('{}/api/datasets/{}/browser/variants/{}/{}'.format(BASE_URL, dataset, data_type, data_item)) + data = json.loads(response.text) + assert len(data['variants']) == 405 + + +def test_search(): """ Test Search.get() """ - assert False + dataset = 'SweGen' + + query = 'NF1P3' + response = requests.get('{}/api/datasets/{}/browser/search/{}'.format(BASE_URL, dataset, query)) + data = json.loads(response.text) + assert data['type'] == 'gene' + assert data['value'] == 'ENSG00000183249' diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index cb8cefb5e..f584ea87e 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -26,12 +26,12 @@ def test_get_awesomebar_result(): assert result == ('dbsnp_variant_set', 373706802) result = lookups.get_awesomebar_result('SweGen', 'rs783') assert result == ('variant', '22-29461622-G-A') - result = lookups.get_awesomebar_result('SweGen', 'ADH6') - assert result == ('gene', 'ENSG00000172955') - result = lookups.get_awesomebar_result('SweGen', 'ENSG00000172955') - assert result == ('gene', 'ENSG00000172955') - result = lookups.get_awesomebar_result('SweGen', 'ENST00000237653') - assert result == ('transcript', 'ENST00000237653') + result = lookups.get_awesomebar_result('SweGen', 'NF1P3') + assert result == ('gene', 'ENSG00000183249') + result = lookups.get_awesomebar_result('SweGen', 'ENSG00000183249') + assert result == ('gene', 'ENSG00000183249') + result = lookups.get_awesomebar_result('SweGen', 'ENST00000457709') + assert result == ('transcript', 'ENST00000457709') result = lookups.get_awesomebar_result('SweGen', '22-46615715-46615880') assert result == ('region', '22-46615715-46615880') result = lookups.get_awesomebar_result('SweGen', 'CHR22:46615715-46615880') @@ -49,14 +49,19 @@ def test_get_coverage_for_bases(): # normal coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546423, 46549652) assert len(coverage) == 323 - assert coverage[0] == {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], - 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} + expected = {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} + for val in expected: + assert coverage[0][val] == expected[val] + assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 # no end_pos coverage = lookups.get_coverage_for_bases('SweGen', '22', 46546430) - assert coverage == [{'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], - 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430}] + expected = {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} + for val in expected: + assert coverage[0][val] == expected[val] assert len(lookups.get_coverage_for_bases('SweGen', '22', 46615715, 46615880)) == 17 # no hits @@ -74,14 +79,18 @@ def test_get_coverage_for_transcript(): # normal coverage = lookups.get_coverage_for_transcript('SweGen', '22', 46546423, 46549652) assert len(coverage) == 323 - assert coverage[0] == {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], - 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} + expected = {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} + for val in expected: + assert coverage[0][val] == expected[val] assert len(lookups.get_coverage_for_transcript('SweGen', '22', 46615715, 46615880)) == 17 # no end_pos coverage = lookups.get_coverage_for_transcript('SweGen', '22', 46546430) - assert coverage == [{'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], - 'dataset_version': 4, 'id': 2954967, 'mean': 24.94, 'median': 24.0, 'pos': 46546430}] + expected = {'chrom': '22', 'coverage': [1.0, 1.0, 0.993, 0.91, 0.697, 0.426, 0.2, 0.009, 0.0], + 'dataset_version': 4, 'mean': 24.94, 'median': 24.0, 'pos': 46546430} + for val in expected: + assert coverage[0][val] == expected[val] assert len(lookups.get_coverage_for_transcript('SweGen', '22', 46615715, 46615880)) == 17 # no hits @@ -148,7 +157,8 @@ def test_get_gene_by_dbid(): 'start': 16967410, 'stop': 16969212, 'strand': '+'} - result = lookups.get_gene_by_dbid(53626) + gene = lookups.get_gene('SweGen', 'ENSG00000226444') + result = lookups.get_gene_by_dbid(gene['id']) for val in expected: assert result[val] == expected[val] @@ -249,8 +259,8 @@ def test_get_transcripts_in_gene(): """ Test get_transcripts_in_gene() """ - res = lookups.get_transcripts_in_gene('SweGen', 'ENSG00000103197') - assert len(res) == 27 + res = lookups.get_transcripts_in_gene('SweGen', 'ENSG00000228314') + assert len(res) == 3 assert not lookups.get_transcripts_in_gene('bad_dataset', 'ENSG00000241670') assert not lookups.get_transcripts_in_gene('SweGen', 'ENSGASDFG') @@ -271,8 +281,9 @@ def test_get_transcripts_in_gene_by_dbid(): """ Test get_transcripts_in_gene_by_dbid() """ - res = lookups.get_transcripts_in_gene_by_dbid(53626) - assert len(res) == 2 + gene = lookups.get_gene('SweGen', 'ENSG00000228314') + res = lookups.get_transcripts_in_gene_by_dbid(gene['id']) + assert len(res) == 3 res = lookups.get_transcripts_in_gene_by_dbid(-1) assert not res @@ -298,7 +309,7 @@ def test_get_variant(): assert result['variant_id'] == '21-9435852-T-C' -def test_get_variants_by_rsid(caplog): +def test_get_variants_by_rsid(): ''' Test get_variants_by_rsid() ''' @@ -332,12 +343,6 @@ def test_get_variants_by_rsid(caplog): assert lookups.get_variants_by_rsid('SweGen', '373706802') is None assert lookups.get_variants_by_rsid('SweGen', 'rs3737o68o2') is None - expected = ('get_dataset_version(incorrect_name, version=None): cannot retrieve dataset version', - 'get_variants_by_rsid(SweGen, 373706802): rsid not starting with rs', - 'get_variants_by_rsid(SweGen, rs3737o68o2): not an integer after rs') - for comparison in zip(caplog.messages, expected): - assert comparison[0] == comparison[1] - # no variants with rsid available assert not lookups.get_variants_by_rsid('SweGen', 'rs1') diff --git a/backend/modules/browser/tests/test_pgsql.py b/backend/modules/browser/tests/test_pgsql.py index 34e41baf4..b5d677127 100644 --- a/backend/modules/browser/tests/test_pgsql.py +++ b/backend/modules/browser/tests/test_pgsql.py @@ -9,10 +9,9 @@ def test_get_autocomplete(): """ Test get_autocomplete() """ - res = pgsql.get_autocomplete('ADH') - expected = set(['ADH1A', 'ADH1B', 'ADH1C', 'ADH4', - 'ADH5', 'ADH6', 'ADH7', 'ADH5P2', - 'ADH5P3', 'ADH5P4', 'ADHFE1']) + res = pgsql.get_autocomplete('PA') + expected = set(["PABPC1P9", "PACSIN2", "PANX2", "PARP4P3", + "PARVB", "PARVG", "PATZ1", "PAXBP1", "PAXBP1-AS1"]) assert set(res) == expected From 2c66d717ce965402255202d83f5ba113ef35e85b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 14:55:23 +0100 Subject: [PATCH 076/170] removed double / --- frontend/src/js/factory.browser.js | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/frontend/src/js/factory.browser.js b/frontend/src/js/factory.browser.js index 7c125bee2..4cb4ae1cc 100644 --- a/frontend/src/js/factory.browser.js +++ b/frontend/src/js/factory.browser.js @@ -23,19 +23,19 @@ } function getGene(dataset, version, gene) { - return $http.get(baseUrl(dataset, version) + "/gene/" + gene).then(function(data) { + return $http.get(baseUrl(dataset, version) + "gene/" + gene).then(function(data) { return data.data; }); } function getRegion(dataset, version, region) { - return $http.get(baseUrl(dataset, version) + "/region/" + region).then(function(data) { + return $http.get(baseUrl(dataset, version) + "region/" + region).then(function(data) { return data.data; }); } function getTranscript(dataset, version, transcript) { - return $http.get(baseUrl(dataset, version) + "/transcript/" + transcript).then(function(data) { + return $http.get(baseUrl(dataset, version) + "transcript/" + transcript).then(function(data) { return data.data; }); } @@ -47,31 +47,31 @@ } function search(dataset, version, query) { - return $http.get(baseUrl(dataset, version) + "/search/" + query).then(function(data) { + return $http.get(baseUrl(dataset, version) + "search/" + query).then(function(data) { return data.data; }); } function autocomplete(dataset, version, query) { - return $http.get(baseUrl(dataset, version) + "/autocomplete/" + query).then(function(data) { + return $http.get(baseUrl(dataset, version) + "autocomplete/" + query).then(function(data) { return data.data; }); } function getVariants(dataset, version, datatype, item) { - return $http.get(baseUrl(dataset, version) + "/variants/" + datatype + "/" + item).then(function(data) { + return $http.get(baseUrl(dataset, version) + "variants/" + datatype + "/" + item).then(function(data) { return data.data; }); } function getCoverage(dataset, version, datatype, item) { - return $http.get(baseUrl(dataset, version) + "/coverage/" + datatype + "/" + item).then(function(data) { + return $http.get(baseUrl(dataset, version) + "coverage/" + datatype + "/" + item).then(function(data) { return data.data; }); } function getCoveragePos(dataset, version, datatype, item) { - return $http.get(baseUrl(dataset, version) + "/coverage_pos/" + datatype + "/" + item).then(function(data) { + return $http.get(baseUrl(dataset, version) + "coverage_pos/" + datatype + "/" + item).then(function(data) { return data.data; }); } From 883406f458f4feb5fcb70fa4ce96515274cfa391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 19:48:06 +0100 Subject: [PATCH 077/170] give search support for versions --- backend/modules/browser/browser_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index bb9dc7b62..ed1de23be 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -349,7 +349,7 @@ def get(self, dataset:str, query:str, ds_version:str=None): """ ret = {"dataset": dataset, "value": None, "type": None} - datatype, identifier = lookups.get_awesomebar_result(dataset, query) + datatype, identifier = lookups.get_awesomebar_result(dataset, query, ds_version) if datatype == "dbsnp_variant_set": datatype = "dbsnp" From 1d532e515133540ef09305f8d35307d14f048b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 19:54:55 +0100 Subject: [PATCH 078/170] id is implicit --- backend/modules/browser/lookups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index adc4f2a52..83b00b72b 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -355,7 +355,7 @@ def get_raw_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_versio (db.Variant.ref == ref) & (db.Variant.alt == alt) & (db.Variant.chrom == chrom) & - (db.Variant.dataset_version == dataset_version.id)) + (db.Variant.dataset_version == dataset_version)) .dicts() .get()) variant['genes'] = [gene['gene_id'] for gene in From d659d9a3adc633a63b514af9e44db623d6563940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Fri, 22 Feb 2019 19:57:11 +0100 Subject: [PATCH 079/170] increase coverage of tests --- .../browser/tests/test_browser_handlers.py | 15 +++++++++++++++ backend/modules/browser/tests/test_utils.py | 1 + 2 files changed, 16 insertions(+) diff --git a/backend/modules/browser/tests/test_browser_handlers.py b/backend/modules/browser/tests/test_browser_handlers.py index 302914f4f..73a6f5ede 100644 --- a/backend/modules/browser/tests/test_browser_handlers.py +++ b/backend/modules/browser/tests/test_browser_handlers.py @@ -91,6 +91,14 @@ def test_get_region(): region = json.loads(response.text) assert region == {'region': {'chrom': '22', 'start': 46615715, 'stop': 46615880, 'limit': 100000}} + region_def = '22-46A1615715-46615880' + response = requests.get('{}/api/datasets/{}/browser/region/{}'.format(BASE_URL, dataset, region_def)) + assert response.status_code == 400 + + region_def = '22-46A1615715-46615880' + response = requests.get('{}/api/datasets/{}/browser/region/{}'.format(BASE_URL, dataset, region_def)) + assert response.status_code == 400 + def test_get_transcript(): """ @@ -163,3 +171,10 @@ def test_search(): data = json.loads(response.text) assert data['type'] == 'gene' assert data['value'] == 'ENSG00000183249' + + query = '21-9411281-T-C' + version = '20161223' + response = requests.get('{}/api/datasets/{}/version/{}/browser/search/{}'.format(BASE_URL, dataset, version, query)) + data = json.loads(response.text) + assert data['type'] == 'variant' + assert data['value'] == '21-9411281-T-C' diff --git a/backend/modules/browser/tests/test_utils.py b/backend/modules/browser/tests/test_utils.py index 5e704505a..d987e8c93 100644 --- a/backend/modules/browser/tests/test_utils.py +++ b/backend/modules/browser/tests/test_utils.py @@ -190,3 +190,4 @@ def test_worst_csq_with_vep(): res = utils.worst_csq_with_vep(veps) assert res == {'SYMBOL': '1', 'Consequence': 'frameshift_variant', 'CANONICAL': 'YES', 'major_consequence': 'frameshift_variant'} + assert not utils.worst_csq_with_vep([]) From 89ca2762964d2c75025e621794512a5b370bea5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Mon, 25 Feb 2019 15:43:43 +0100 Subject: [PATCH 080/170] remove add_rsid_to_variant() for now to avoid giving incorrect rsids --- backend/modules/browser/lookups.py | 42 +----------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index 83b00b72b..fa412dd4c 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -7,38 +7,6 @@ SEARCH_LIMIT = 10000 - -def add_rsid_to_variant(dataset:str, variant:str): - """ - Add rsid to a variant in the database based on position - - Args: - dataset (str): short name of the dataset - variant (dict): values for a variant - """ - refset = (db.Dataset - .select(db.ReferenceSet) - .join(db.ReferenceSet) - .where(db.Dataset.short_name == dataset) - .dicts() - .get()) - dbsnp_version = refset['dbsnp_version'] - - if not variant['rsid']: - try: - rsid = (db.DbSNP - .select() - .where((db.DbSNP.pos == variant['pos']) & - (db.DbSNP.chrom == variant['chrom']) & - (db.DbSNP.version == dbsnp_version)) - .dicts() - .get()) - variant['rsid'] = 'rs{}'.format(rsid['rsid']) - except db.DbSNP.DoesNotExist: - pass - # logging.error('add_rsid_to_variant({}, variant[dbid: {}]): unable to retrieve rsid'.format(dataset, variant['id'])) - - REGION_REGEX = re.compile(r'^\s*(\d+|X|Y|M|MT)\s*([-:]?)\s*(\d*)-?([\dACTG]*)-?([ACTG]*)') def get_awesomebar_result(dataset:str, query:str, ds_version:str=None): @@ -457,9 +425,7 @@ def get_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_version:st variant = get_raw_variant(dataset, pos, chrom, ref, alt, ds_version) if not variant or 'rsid' not in variant: return variant - if variant['rsid'] == '.' or variant['rsid'] is None: - add_rsid_to_variant(dataset, variant) - else: + if variant['rsid']: if not str(variant['rsid']).startswith('rs'): variant['rsid'] = 'rs{}'.format(variant['rsid']) return variant @@ -573,8 +539,6 @@ def get_variants_in_gene(dataset:str, gene_id:str, ds_version:str=None): for variant in variants: if variant['rsid']: variant['rsid'] = 'rs{}'.format(variant['rsid']) - else: - add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants @@ -616,8 +580,6 @@ def get_variants_in_region(dataset:str, chrom:str, start_pos:int, end_pos:int, d for variant in variants: if variant['rsid']: variant['rsid'] = 'rs{}'.format(variant['rsid']) - else: - add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants @@ -660,8 +622,6 @@ def get_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=No variant['vep_annotations'] = [anno for anno in variant['vep_annotations'] if anno['Feature'] == transcript_id] if variant['rsid']: variant['rsid'] = 'rs{}'.format(variant['rsid']) - else: - add_rsid_to_variant(dataset, variant) remove_extraneous_information(variant) return variants From 3a740ffbbf124ceff9b2f63e94cd3c87c11dcdd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Tue, 26 Feb 2019 10:01:47 +0100 Subject: [PATCH 081/170] remove add_rsid_to_variant tests --- backend/modules/browser/tests/test_lookups.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index f584ea87e..a22496fbe 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -5,19 +5,6 @@ from .. import lookups -def test_add_rsid_to_variant(): - """ - Test add_rsid_to_variant() - """ - variant = lookups.get_variant('SweGen', 34730985, '22', 'G', 'A') - lookups.add_rsid_to_variant('SweGen', variant) - assert variant['rsid'] == 'rs924645261' - variant = lookups.get_variant('SweGen', 16113980, '22', 'C', 'T') - variant['rsid'] = '' - lookups.add_rsid_to_variant('SweGen', variant) - assert variant['rsid'] == 'rs9680543' - - def test_get_awesomebar_result(): """ Test get_awesomebar_result() @@ -354,6 +341,8 @@ def test_get_variants_in_gene(): res = lookups.get_variants_in_gene('SweGen', 'ENSG00000198062') assert len(res) == 1185 assert not lookups.get_variants_in_gene('bad_dataset', 'ENSG00000198062') + res = lookups.get_variants_in_gene('ACpop', 'ENSG00000040608') + assert len(res) == 260 assert not lookups.get_variants_in_gene('bad_dataset', 'ENSGASDFG') From 5c4e4fee23bba00ee1de7429ed2eb41f9ba79f26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 27 Feb 2019 07:04:41 +0100 Subject: [PATCH 082/170] correct port --- backend/modules/browser/tests/test_browser_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/modules/browser/tests/test_browser_handlers.py b/backend/modules/browser/tests/test_browser_handlers.py index 73a6f5ede..c4f09d59a 100644 --- a/backend/modules/browser/tests/test_browser_handlers.py +++ b/backend/modules/browser/tests/test_browser_handlers.py @@ -5,7 +5,7 @@ import requests import json -BASE_URL="http://localhost:4001" +BASE_URL="http://localhost:4000" def test_get_autocomplete(): """ From 8c9ef5fa6f463a4ae55e4176de8ad9b99bd8f89b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 27 Feb 2019 07:58:50 +0100 Subject: [PATCH 083/170] no reason to keep the mongo settings --- backend/settings.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/backend/settings.py b/backend/settings.py index 249cd91a9..ff925bb3a 100644 --- a/backend/settings.py +++ b/backend/settings.py @@ -32,13 +32,6 @@ ## Generated with base64.b64encode(uuid.uuid4().bytes + uuid.uuid4().bytes) cookie_secret = json_settings["cookieSecret"] -# Mongodb settings -mongo_host = json_settings["mongoHost"] -mongo_port = json_settings["mongoPort"] -mongo_user = json_settings["mongoUser"] -mongo_password = json_settings["mongoPassword"] -mongo_databases = json_settings["mongoDatabases"] - # PostgreSQL settings psql_host = json_settings["postgresHost"] psql_port = json_settings["postgresPort"] From 09dce469c7fcfd73c18f0dd22fbc84d66d3a8e79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 27 Feb 2019 13:34:42 +0100 Subject: [PATCH 084/170] remove number_of_variants_in_transcript --- backend/modules/browser/browser_handlers.py | 3 -- backend/modules/browser/lookups.py | 34 +++---------------- backend/modules/browser/tests/test_lookups.py | 19 ----------- 3 files changed, 5 insertions(+), 51 deletions(-) diff --git a/backend/modules/browser/browser_handlers.py b/backend/modules/browser/browser_handlers.py index ed1de23be..911d72cef 100644 --- a/backend/modules/browser/browser_handlers.py +++ b/backend/modules/browser/browser_handlers.py @@ -94,9 +94,6 @@ def get(self, dataset:str, gene:str, ds_version:str=None): for exon in sorted(transcript['exons'], key=lambda k: k['start']): ret['exons'] += [{'start':exon['start'], 'stop':exon['stop'], 'type':exon['feature_type']}] - # Variants - ret['gene']['variants'] = lookups.get_number_of_variants_in_transcript(dataset, gene['canonical_transcript'], ds_version) - # Transcripts transcripts_in_gene = lookups.get_transcripts_in_gene(dataset, gene_id) if transcripts_in_gene: diff --git a/backend/modules/browser/lookups.py b/backend/modules/browser/lookups.py index fa412dd4c..07e684ed6 100644 --- a/backend/modules/browser/lookups.py +++ b/backend/modules/browser/lookups.py @@ -183,16 +183,16 @@ def get_gene(dataset:str, gene_id:str): gene_id (str): the id of the gene Returns: - dict: values for the gene; empty if not found + dict: values for the gene; None if not found """ ref_dbid = db.get_reference_dbid_dataset(dataset) if not ref_dbid: - return {} + return None try: return db.Gene.select().where((db.Gene.gene_id == gene_id) & (db.Gene.reference_set == ref_dbid)).dicts().get() except db.Gene.DoesNotExist: - return {} + return None def get_gene_by_dbid(gene_dbid:str): @@ -272,31 +272,6 @@ def get_genes_in_region(dataset:str, chrom:str, start_pos:int, stop_pos:int): return [gene for gene in gene_query] -def get_number_of_variants_in_transcript(dataset:str, transcript_id:str, ds_version:str=None): - """ - Get the total and filtered amount of variants in a transcript - - Args: - dataset (str): short name of the dataset - transcript_id (str): id of the transcript - ds_version (str): version of the dataset - - Returns: - dict: {filtered: nr_filtered, total: nr_total}, None if error - """ - dataset_version = db.get_dataset_version(dataset, ds_version) - if not dataset_version: - return None - - variants = get_variants_in_transcript(dataset, transcript_id) - if not variants: - return None - total = len(variants) - - filtered = len(tuple(variant for variant in variants if variant['filter_string'] == 'PASS')) - return {'filtered': filtered, 'total': total} - - def get_raw_variant(dataset:str, pos:int, chrom:str, ref:str, alt:str, ds_version:str=None): """ Retrieve variant by position and change @@ -522,7 +497,8 @@ def get_variants_in_gene(dataset:str, gene_id:str, ds_version:str=None): if not ref_dbid: return None dataset_version = db.get_dataset_version(dataset, ds_version) - + if not dataset_version: + return None gene = get_gene(dataset, gene_id) variants = [variant for variant in db.Variant.select() diff --git a/backend/modules/browser/tests/test_lookups.py b/backend/modules/browser/tests/test_lookups.py index a22496fbe..6f9863d3b 100644 --- a/backend/modules/browser/tests/test_lookups.py +++ b/backend/modules/browser/tests/test_lookups.py @@ -205,23 +205,6 @@ def test_get_genes_in_region(): assert not res -def test_get_number_of_variants_in_transcript(): - """ - Test get_number_of_variants_in_transcripts() - """ - # normal - res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENST00000424770') - assert res == {'filtered': 66, 'total': 309} - - # bad transcript - res = lookups.get_number_of_variants_in_transcript('SweGen', 'ENSTASDSADA') - assert res is None - - # bad dataset - res = lookups.get_number_of_variants_in_transcript('bad_dataset', 'ENST00000424770') - assert res is None - - def test_get_transcript(): """ Test get_transcript() @@ -341,8 +324,6 @@ def test_get_variants_in_gene(): res = lookups.get_variants_in_gene('SweGen', 'ENSG00000198062') assert len(res) == 1185 assert not lookups.get_variants_in_gene('bad_dataset', 'ENSG00000198062') - res = lookups.get_variants_in_gene('ACpop', 'ENSG00000040608') - assert len(res) == 260 assert not lookups.get_variants_in_gene('bad_dataset', 'ENSGASDFG') From c3fac5721ee0df9cd6d6b2f7ed12322b2002e723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Wed, 27 Feb 2019 21:39:38 +0100 Subject: [PATCH 085/170] variant counting added to variant list --- frontend/src/js/controller.browserController.js | 2 +- frontend/templates/ng-templates/browser-variant-list.jj2 | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/frontend/src/js/controller.browserController.js b/frontend/src/js/controller.browserController.js index 95986c814..9f4cb4059 100644 --- a/frontend/src/js/controller.browserController.js +++ b/frontend/src/js/controller.browserController.js @@ -88,9 +88,9 @@ variant.isMissense = variant.majorConsequence == "missense"; }; localThis.variants.map(mapFunction); + localThis.passed = localThis.variants.filter(v => v.isPass).length; localThis.filterVariants(); - }); Browser.getCoveragePos($routeParams.dataset, $routeParams.version, localThis.itemType, localThis.item).then( function(data) { localThis.coverage.region.start = data.start; diff --git a/frontend/templates/ng-templates/browser-variant-list.jj2 b/frontend/templates/ng-templates/browser-variant-list.jj2 index 1ee8a3d9a..815ff2a19 100644 --- a/frontend/templates/ng-templates/browser-variant-list.jj2 +++ b/frontend/templates/ng-templates/browser-variant-list.jj2 @@ -1,4 +1,7 @@
+
+ Variants: {{ctrl.passed}} (including filtered: {{ctrl.variants.length}}) +
From 4ab495be509a1fb4252004b8b583bad9d02c8b4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20=C3=96stberg?= Date: Thu, 28 Feb 2019 07:22:43 +0100 Subject: [PATCH 086/170] remove number of variants from gene part --- frontend/templates/ng-templates/browser-gene.html | 3 --- 1 file changed, 3 deletions(-) diff --git a/frontend/templates/ng-templates/browser-gene.html b/frontend/templates/ng-templates/browser-gene.html index 15439acc4..50e128a5b 100644 --- a/frontend/templates/ng-templates/browser-gene.html +++ b/frontend/templates/ng-templates/browser-gene.html @@ -15,9 +15,6 @@

Gene: {{ ctrl.gene.geneName }}

{{ ctrl.gene.geneName }}
{{ ctrl.gene.fullGeneName }}
-
Number of variants
-
{{ ctrl.gene.variants.filtered }} (Including filtered: {{ ctrl.gene.variants.total }})
-
UCSC Browser