From 7ad04399ebf4efb941004fded1bddda05d3a3c53 Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Tue, 16 Apr 2019 11:06:21 +0200 Subject: [PATCH 01/10] Update beacon schema: make counts_table a table --- backend/db.py | 12 ++++++++++++ sql/beacon_schema.sql | 31 ++++++++----------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/backend/db.py b/backend/db.py index 1ea48aa4c..9defcb0f3 100644 --- a/backend/db.py +++ b/backend/db.py @@ -68,6 +68,7 @@ class Meta: ensembl_version = CharField() gencode_version = CharField() dbnsfp_version = CharField() + reference_build = CharField(unique=True) class Gene(BaseModel): @@ -411,6 +412,17 @@ class Meta: hash = CharField() expires_on = DateTimeField() + +class BeaconCounts(BaseModel): + class Meta: + db_table = "beacon_dataset_counts_table" + schema = 'beacon' + + datasetid = CharField(primary_key=True) + callcount = IntegerField() + variantcount = IntegerField() + + ##### # Views ## diff --git a/sql/beacon_schema.sql b/sql/beacon_schema.sql index e938f3541..a25186d17 100644 --- a/sql/beacon_schema.sql +++ b/sql/beacon_schema.sql @@ -43,26 +43,11 @@ CREATE OR REPLACE VIEW beacon.beacon_dataset_table AS -- original type ; --- This seems to return correct values except that it seems to --- _always_ return 1 for callCount, even when there's no data. --- TODO: make sure that callCount can handle zero values. -CREATE OR REPLACE VIEW beacon.beacon_dataset_counts_table AS - SELECT concat_ws(':', r.reference_build, - d.short_name, - v.dataset_version) AS datasetId, -- varchar(128) - COUNT(DISTINCT(dv.chrom, - dv.ref, - dv.pos)) AS callCount, -- integer - COUNT(dv) AS variantCount -- integer - FROM data.datasets as d - JOIN data.dataset_version_current AS v - ON v.dataset = d.id - JOIN data.reference_sets AS r - ON v.reference_set = r.id - LEFT JOIN data.variants AS dv - ON dv.dataset_version = v.id - GROUP BY r.reference_build, d.short_name, v.dataset_version -; +CREATE TABLE IF NOT EXISTS beacon.beacon_dataset_counts_table ( + datasetid varchar(128) PRIMARY KEY, + callCount integer DEFAULT NULL, + variantCount integer DEFAULT NULL +); CREATE MATERIALIZED VIEW beacon.beacon_data_table AS @@ -102,9 +87,9 @@ CREATE MATERIALIZED VIEW beacon.beacon_data_table AS -- CREATE UNIQUE INDEX metadata_conflict ON beacon_dataset_table (name, datasetId); -- This gets really, really slow if not materialized. (TODO why?) -CREATE MATERIALIZED VIEW beacon.dataset_metadata(name, datasetId, description, assemblyId, - createDateTime, updateDateTime, version, - callCount, variantCount, sampleCount, externalUrl, accessType) +CREATE VIEW beacon.dataset_metadata(name, datasetId, description, assemblyId, + createDateTime, updateDateTime, version, + callCount, variantCount, sampleCount, externalUrl, accessType) AS SELECT a.name, a.datasetId, a.description, a.assemblyId, a.createDateTime, a.updateDateTime, a.version, b.callCount, b.variantCount, From b892e17abe056e0970046153cb84dac25c2a126c Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Tue, 16 Apr 2019 11:06:51 +0200 Subject: [PATCH 02/10] Add counting of calls to import script --- .../data_importer/raw_data_importer.py | 38 +++++++++++++++++-- scripts/importer/importer.py | 3 ++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/scripts/importer/data_importer/raw_data_importer.py b/scripts/importer/data_importer/raw_data_importer.py index fcbdc0a8a..4b24a551f 100644 --- a/scripts/importer/data_importer/raw_data_importer.py +++ b/scripts/importer/data_importer/raw_data_importer.py @@ -27,8 +27,10 @@ def __init__(self, settings): self.dataset_version = None self.dataset = None self.sampleset = None - self.counter = {'coverage':None, - 'variants':None} + self.counter = {'coverage': None, + 'variants': None, + 'beaconvariants': 0, + 'calls': {}} def _set_dataset_info(self): """ Save dataset information given as parameters """ @@ -85,6 +87,23 @@ def _select_dataset_version(self): self.settings.set_vcf_sampleset_size = False self.settings.sampleset_size = 0 + def _create_beacon_counts(self): + """ + Add the number of unique references at each position (callcount), + the number of unique ref-alt pairs at each position (variantount) + and the datasetid (eg GRCh37:swegen:2019-01-01) + to the beacon_dataset_counts_table. + """ + callcount = sum(len(refs) for refs in self.counter['calls'].values()) + logging.info('Number of calls: %s', callcount) + + ref_build = self.dataset_version.reference_set.reference_build + datasetid = ':'.join([ref_build, self.dataset.short_name, self.dataset_version.version]) + datarow = {'datasetid': datasetid, 'callcount': callcount, 'variantcount': self.counter['beaconvariants']} + logging.info('Dataset counts: %s', datarow) + if not self.settings.dry_run: + db.BeaconCounts.insert(datarow).execute() + def _insert_coverage(self): """ Header columns are chromosome, position, mean coverage, median coverage, @@ -267,7 +286,10 @@ def _insert_variants(self): data['variant_id'] = '{}-{}-{}-{}'.format(data['chrom'], data['pos'], data['ref'], data['alt']) data['quality_metrics'] = dict([(x, info[x]) for x in METRICS if x in info]) batch += [data] - counter += 1 + if self.settings.count_calls: + self.get_callcount(data) + self.counter['beaconvariants'] += 1 # count variants (one per alternate) + counter += 1 # count variants (one per vcf row) if len(batch) >= self.settings.batch_size: if not self.settings.dry_run: @@ -331,6 +353,14 @@ def _insert_variants(self): if not self.settings.dry_run: logging.info("Inserted {} variant records in {}".format(counter, self._time_since(start))) + def get_callcount(self, data): + """Save all references at this position, in order to compute the number of calls later""" + chrompos = '%s-%s' % (data['chrom'], data['pos']) + if chrompos not in self.counter['calls']: + self.counter['calls'][chrompos] = set() + + self.counter['calls'][chrompos].add(data['ref']) + def count_entries(self): start = time.time() if self.settings.coverage_file: @@ -364,6 +394,8 @@ def start_import(self): self._set_dataset_info() if self.settings.variant_file: self._insert_variants() + if self.settings.count_calls: + self._create_beacon_counts() if not self.settings.beacon_only and self.settings.coverage_file: self._insert_coverage() diff --git a/scripts/importer/importer.py b/scripts/importer/importer.py index 95f691a97..1bf2e4303 100755 --- a/scripts/importer/importer.py +++ b/scripts/importer/importer.py @@ -91,6 +91,9 @@ PARSER.add_argument("-q", "--quiet", action="count", default=0, help="Decrease output Verbosity.") + PARSER.add_argument("--count_calls", action="store_true", + help=("Count calls for the dataset (used by Beacon)")) + # Beacon-only variants PARSER.add_argument("--beacon-only", action="store_true", help=("Variants are intended only for Beacon, loosening" From 1c87d3431acbb9b3b202a97a597296e0b1f469c2 Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 17 Apr 2019 13:00:20 +0200 Subject: [PATCH 03/10] Remove non-usefull groupby from beacon schemas --- sql/beacon_schema.sql | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/sql/beacon_schema.sql b/sql/beacon_schema.sql index a25186d17..a87a97fde 100644 --- a/sql/beacon_schema.sql +++ b/sql/beacon_schema.sql @@ -17,6 +17,15 @@ CREATE SCHEMA IF NOT EXISTS beacon; -- These tables need to be represented as semi-complex views, as to avoid -- storing redundant data. + +CREATE TABLE IF NOT EXISTS beacon.beacon_dataset_counts_table ( + datasetid varchar(128) PRIMARY KEY, + dataset integer REFERENCES data.dataset_versions, + callCount integer DEFAULT NULL, + variantCount integer DEFAULT NULL +); + + CREATE OR REPLACE VIEW beacon.beacon_dataset_table AS -- original type SELECT v.id AS index, -- serial d.short_name AS name, -- varchar(128) @@ -43,13 +52,6 @@ CREATE OR REPLACE VIEW beacon.beacon_dataset_table AS -- original type ; -CREATE TABLE IF NOT EXISTS beacon.beacon_dataset_counts_table ( - datasetid varchar(128) PRIMARY KEY, - callCount integer DEFAULT NULL, - variantCount integer DEFAULT NULL -); - - CREATE MATERIALIZED VIEW beacon.beacon_data_table AS SELECT dv.id AS index, -- serial concat_ws(':', r.reference_build, @@ -80,13 +82,6 @@ CREATE MATERIALIZED VIEW beacon.beacon_data_table AS -------------------------------------------------------------------------------- -- Beacon views. -- --- These are kept as-is from the reference. - --- This index is part of the finnish schema, but I deactivated it so that I don't have to materialize the views --- CREATE UNIQUE INDEX data_conflict ON beacon_data_table (datasetId, chromosome, start, reference, alternate); --- CREATE UNIQUE INDEX metadata_conflict ON beacon_dataset_table (name, datasetId); --- This gets really, really slow if not materialized. (TODO why?) - CREATE VIEW beacon.dataset_metadata(name, datasetId, description, assemblyId, createDateTime, updateDateTime, version, callCount, variantCount, sampleCount, externalUrl, accessType) @@ -95,9 +90,7 @@ AS SELECT a.name, a.datasetId, a.description, a.assemblyId, a.createDateTime, b.variantCount, a.sampleCount, a.externalUrl, a.accessType FROM beacon.beacon_dataset_table a, beacon.beacon_dataset_counts_table b -WHERE a.datasetId=b.datasetId -GROUP BY a.name, a.datasetId, a.description, a.assemblyId, a.createDateTime, -a.updateDateTime, a.version, a.sampleCount, a.externalUrl, a.accessType, b.callCount, b.variantCount; +WHERE a.datasetId=b.datasetId; -------------------------------------------------------------------------------- -- Indexes From 5e4c8d8114b17be8d213a0a97284c442e14048d1 Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 17 Apr 2019 13:41:40 +0200 Subject: [PATCH 04/10] Beacon indices --- sql/beacon_schema.sql | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/sql/beacon_schema.sql b/sql/beacon_schema.sql index a87a97fde..a96e46b16 100644 --- a/sql/beacon_schema.sql +++ b/sql/beacon_schema.sql @@ -79,6 +79,32 @@ CREATE MATERIALIZED VIEW beacon.beacon_data_table AS ; +CREATE VIEW beacon.beacon_data_table AS + SELECT dv.id AS index, -- serial + concat_ws(':', r.reference_build, + d.short_name, + v.dataset_version) AS datasetId, -- varchar(128) + dv.pos AS "start", -- integer + substr(dv.chrom, 1, 2) AS chromosome, -- varchar(2) + dv.ref AS reference, -- varchar(8192) + dv.alt AS alternate, -- varchar(8192) + dv.pos - 1 + char_length(dv.ref) AS "end", -- integer + dv.allele_num AS callCount, -- integer + dv.allele_freq AS frequency, -- integer + dv.allele_count AS alleleCount, -- integer + CASE WHEN length(dv.ref) = length(dv.alt) THEN 'SNP' + WHEN length(dv.ref) > length(dv.alt) THEN 'DEL' + WHEN length(dv.ref) < length(dv.alt) THEN 'INS' + END AS variantType -- varchar(16) + FROM data.variants AS dv + JOIN data.dataset_version_current as v + ON dv.dataset_version = v.id + JOIN data.datasets as d + ON v.dataset = d.id + JOIN data.reference_sets AS r + ON v.reference_set = r.id +; + -------------------------------------------------------------------------------- -- Beacon views. -- @@ -95,6 +121,5 @@ WHERE a.datasetId=b.datasetId; -------------------------------------------------------------------------------- -- Indexes -- - -CREATE INDEX beacon_data_chrpos ON beacon.beacon_data_table (chromosome,start); -CREATE INDEX beacon_data_chrref ON beacon.beacon_data_table (chromosome,reference); +CREATE INDEX beacon_data_chrpos ON data.variants ((substr(chrom, 1, 2)),(pos-1)); +CREATE INDEX beacon_data_chrref ON data.variants ((substr(chrom, 1, 2)),ref); From 4f31d33a1969da34c8daa78d9cd2aed63b8b41c1 Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 17 Apr 2019 13:45:32 +0200 Subject: [PATCH 05/10] Stop using materialized view for beacon data table --- sql/beacon_schema.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/beacon_schema.sql b/sql/beacon_schema.sql index a96e46b16..4c34ad57a 100644 --- a/sql/beacon_schema.sql +++ b/sql/beacon_schema.sql @@ -52,7 +52,7 @@ CREATE OR REPLACE VIEW beacon.beacon_dataset_table AS -- original type ; -CREATE MATERIALIZED VIEW beacon.beacon_data_table AS +CREATE VIEW beacon.beacon_data_table AS SELECT dv.id AS index, -- serial concat_ws(':', r.reference_build, d.short_name, From ac4a6482357d02b528e8e49b28facd1c799392b4 Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 17 Apr 2019 16:01:37 +0200 Subject: [PATCH 06/10] Count calls using less memory Assuming the vcf files are properly ordered (by increasing position). TODO: better error handling. --- .../data_importer/raw_data_importer.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/scripts/importer/data_importer/raw_data_importer.py b/scripts/importer/data_importer/raw_data_importer.py index 4b24a551f..f59b214ee 100644 --- a/scripts/importer/data_importer/raw_data_importer.py +++ b/scripts/importer/data_importer/raw_data_importer.py @@ -30,7 +30,10 @@ def __init__(self, settings): self.counter = {'coverage': None, 'variants': None, 'beaconvariants': 0, - 'calls': {}} + 'calls': 0, + 'tmp_calls': {}} + self.lastpos = 0 + self.chrom = None def _set_dataset_info(self): """ Save dataset information given as parameters """ @@ -94,12 +97,14 @@ def _create_beacon_counts(self): and the datasetid (eg GRCh37:swegen:2019-01-01) to the beacon_dataset_counts_table. """ - callcount = sum(len(refs) for refs in self.counter['calls'].values()) - logging.info('Number of calls: %s', callcount) + # count the last calls at the last position + self.counter['calls'] += sum(len(refs) for refs in self.counter['tmp_calls'].values()) ref_build = self.dataset_version.reference_set.reference_build datasetid = ':'.join([ref_build, self.dataset.short_name, self.dataset_version.version]) - datarow = {'datasetid': datasetid, 'callcount': callcount, 'variantcount': self.counter['beaconvariants']} + datarow = {'datasetid': datasetid, + 'callcount': self.counter['calls'], + 'variantcount': self.counter['beaconvariants']} logging.info('Dataset counts: %s', datarow) if not self.settings.dry_run: db.BeaconCounts.insert(datarow).execute() @@ -287,7 +292,7 @@ def _insert_variants(self): data['quality_metrics'] = dict([(x, info[x]) for x in METRICS if x in info]) batch += [data] if self.settings.count_calls: - self.get_callcount(data) + self.get_callcount(data) # count calls (one per reference) self.counter['beaconvariants'] += 1 # count variants (one per alternate) counter += 1 # count variants (one per vcf row) @@ -354,12 +359,25 @@ def _insert_variants(self): logging.info("Inserted {} variant records in {}".format(counter, self._time_since(start))) def get_callcount(self, data): - """Save all references at this position, in order to compute the number of calls later""" - chrompos = '%s-%s' % (data['chrom'], data['pos']) - if chrompos not in self.counter['calls']: - self.counter['calls'][chrompos] = set() + """Increament the call count by the calls found at this position.""" + chrompos = f'{data["chrom"]}-{data["pos"]}' - self.counter['calls'][chrompos].add(data['ref']) + if data['chrom'] == self.chrom and data['pos'] < self.lastpos: + # TODO check this earlier, to avoid partial data to be inserted in the DB + raise Exception(f"Variant file corrupt, variants not given in incremental order.") + + if data['chrom'] != self.chrom or data['pos'] > self.lastpos: + # count the calls for the last position + callcount = sum(len(refs) for refs in self.counter['tmp_calls'].values()) + self.counter['calls'] += callcount + + # reset the counters + self.counter['tmp_calls'] = {chrompos: set()} + self.lastpos = data['pos'] + self.chrom = data['chrom'] + + # save the references for this position + self.counter['tmp_calls'][chrompos].add(data['ref']) def count_entries(self): start = time.time() From 753ac06bee4a2f117660d95b554830f9f6bcf68f Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 24 Apr 2019 10:48:01 +0200 Subject: [PATCH 07/10] Simplify callcounting code --- .../importer/data_importer/raw_data_importer.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/scripts/importer/data_importer/raw_data_importer.py b/scripts/importer/data_importer/raw_data_importer.py index f59b214ee..f2f6aa1ca 100644 --- a/scripts/importer/data_importer/raw_data_importer.py +++ b/scripts/importer/data_importer/raw_data_importer.py @@ -31,7 +31,7 @@ def __init__(self, settings): 'variants': None, 'beaconvariants': 0, 'calls': 0, - 'tmp_calls': {}} + 'tmp_calls': set()} self.lastpos = 0 self.chrom = None @@ -97,8 +97,8 @@ def _create_beacon_counts(self): and the datasetid (eg GRCh37:swegen:2019-01-01) to the beacon_dataset_counts_table. """ - # count the last calls at the last position - self.counter['calls'] += sum(len(refs) for refs in self.counter['tmp_calls'].values()) + # count the calls at the last position + self.counter['calls'] += len(self.counter['tmp_calls']) ref_build = self.dataset_version.reference_set.reference_build datasetid = ':'.join([ref_build, self.dataset.short_name, self.dataset_version.version]) @@ -360,24 +360,21 @@ def _insert_variants(self): def get_callcount(self, data): """Increament the call count by the calls found at this position.""" - chrompos = f'{data["chrom"]}-{data["pos"]}' - if data['chrom'] == self.chrom and data['pos'] < self.lastpos: # TODO check this earlier, to avoid partial data to be inserted in the DB raise Exception(f"Variant file corrupt, variants not given in incremental order.") if data['chrom'] != self.chrom or data['pos'] > self.lastpos: - # count the calls for the last position - callcount = sum(len(refs) for refs in self.counter['tmp_calls'].values()) - self.counter['calls'] += callcount + # We are at a new position, count and save the calls for the last position + self.counter['calls'] += len(self.counter['tmp_calls']) # reset the counters - self.counter['tmp_calls'] = {chrompos: set()} + self.counter['tmp_calls'] = set() self.lastpos = data['pos'] self.chrom = data['chrom'] # save the references for this position - self.counter['tmp_calls'][chrompos].add(data['ref']) + self.counter['tmp_calls'].add(data['ref']) def count_entries(self): start = time.time() From cd0ff0afde808adafa24c5918a69711096eb7468 Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 24 Apr 2019 13:28:41 +0200 Subject: [PATCH 08/10] Fix typo --- scripts/importer/data_importer/raw_data_importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/importer/data_importer/raw_data_importer.py b/scripts/importer/data_importer/raw_data_importer.py index f2f6aa1ca..35c848e32 100644 --- a/scripts/importer/data_importer/raw_data_importer.py +++ b/scripts/importer/data_importer/raw_data_importer.py @@ -359,7 +359,7 @@ def _insert_variants(self): logging.info("Inserted {} variant records in {}".format(counter, self._time_since(start))) def get_callcount(self, data): - """Increament the call count by the calls found at this position.""" + """Increment the call count by the calls found at this position.""" if data['chrom'] == self.chrom and data['pos'] < self.lastpos: # TODO check this earlier, to avoid partial data to be inserted in the DB raise Exception(f"Variant file corrupt, variants not given in incremental order.") From 2e7abcb3506531c278c2ad7ca949f3eb3c159817 Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 24 Apr 2019 13:32:38 +0200 Subject: [PATCH 09/10] Remove ref index --- sql/beacon_schema.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/beacon_schema.sql b/sql/beacon_schema.sql index 4c34ad57a..2658b1ade 100644 --- a/sql/beacon_schema.sql +++ b/sql/beacon_schema.sql @@ -122,4 +122,5 @@ WHERE a.datasetId=b.datasetId; -- Indexes -- CREATE INDEX beacon_data_chrpos ON data.variants ((substr(chrom, 1, 2)),(pos-1)); -CREATE INDEX beacon_data_chrref ON data.variants ((substr(chrom, 1, 2)),ref); +-- Use if needed: +-- CREATE INDEX beacon_data_chrref ON data.variants ((substr(chrom, 1, 2)),ref); From 38c160400aadf35190cc1529ecc34d04ac64907c Mon Sep 17 00:00:00 2001 From: MalinAhlberg Date: Wed, 24 Apr 2019 14:29:13 +0200 Subject: [PATCH 10/10] Warn for bad variant order --- scripts/importer/data_importer/raw_data_importer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/importer/data_importer/raw_data_importer.py b/scripts/importer/data_importer/raw_data_importer.py index 35c848e32..0da67b78d 100644 --- a/scripts/importer/data_importer/raw_data_importer.py +++ b/scripts/importer/data_importer/raw_data_importer.py @@ -361,10 +361,12 @@ def _insert_variants(self): def get_callcount(self, data): """Increment the call count by the calls found at this position.""" if data['chrom'] == self.chrom and data['pos'] < self.lastpos: - # TODO check this earlier, to avoid partial data to be inserted in the DB - raise Exception(f"Variant file corrupt, variants not given in incremental order.") + # If this position is smaller than the last, the file order might be invalid. + # Give a warning, but keep on counting. + msg = "VCF file not ok, variants not given in incremental order. Callcount may not be valid!!!\n\n" + logging.warning(msg) - if data['chrom'] != self.chrom or data['pos'] > self.lastpos: + if data['chrom'] != self.chrom or data['pos'] != self.lastpos: # We are at a new position, count and save the calls for the last position self.counter['calls'] += len(self.counter['tmp_calls'])