Skip to content

Commit

Permalink
Merge pull request #541 from NBISweden/feature/compute_callcount
Browse files Browse the repository at this point in the history
Feature/compute callcount
  • Loading branch information
talavis committed Apr 25, 2019
2 parents 4bf48e7 + abf7853 commit 3395599
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 42 deletions.
12 changes: 12 additions & 0 deletions backend/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class Meta:
ensembl_version = CharField()
gencode_version = CharField()
dbnsfp_version = CharField()
reference_build = CharField(unique=True)


class Gene(BaseModel):
Expand Down Expand Up @@ -434,6 +435,17 @@ class Meta:
hash = CharField()
expires_on = DateTimeField()


class BeaconCounts(BaseModel):
class Meta:
db_table = "beacon_dataset_counts_table"
schema = 'beacon'

datasetid = CharField(primary_key=True)
callcount = IntegerField()
variantcount = IntegerField()


#####
# Views
##
Expand Down
58 changes: 55 additions & 3 deletions scripts/importer/data_importer/raw_data_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,13 @@ def __init__(self, settings):
self.dataset_version = None
self.dataset = None
self.sampleset = None
self.counter = {'coverage':None,
'variants':None}
self.counter = {'coverage': None,
'variants': None,
'beaconvariants': 0,
'calls': 0,
'tmp_calls': set()}
self.lastpos = 0
self.chrom = None

def _set_dataset_info(self):
""" Save dataset information given as parameters """
Expand Down Expand Up @@ -85,6 +90,28 @@ def _select_dataset_version(self):
self.settings.set_vcf_sampleset_size = False
self.settings.sampleset_size = 0

def _create_beacon_counts(self):
"""
Add the number of unique references at each position (callcount),
the number of unique ref-alt pairs at each position (variantount)
and the datasetid (eg GRCh37:swegen:2019-01-01)
to the beacon_dataset_counts_table.
"""
# count the calls at the last position
self.counter['calls'] += len(self.counter['tmp_calls'])

ref_build = self.dataset_version.reference_set.reference_build
if not ref_build:
logging.warning('Reference build not set for dataset version.')
ref_build = '' # avoid None
datasetid = ':'.join([ref_build, self.dataset.short_name, self.dataset_version.version])
datarow = {'datasetid': datasetid,
'callcount': self.counter['calls'],
'variantcount': self.counter['beaconvariants']}
logging.info('Dataset counts: callcount: %s, variantcount: %s', datarow['callcount'], datarow['variantcount'])
if not self.settings.dry_run:
db.BeaconCounts.insert(datarow).execute()

def _insert_coverage(self):
"""
Header columns are chromosome, position, mean coverage, median coverage,
Expand Down Expand Up @@ -267,7 +294,10 @@ def _insert_variants(self):
data['variant_id'] = '{}-{}-{}-{}'.format(data['chrom'], data['pos'], data['ref'], data['alt'])
data['quality_metrics'] = dict([(x, info[x]) for x in METRICS if x in info])
batch += [data]
counter += 1
if self.settings.count_calls:
self.get_callcount(data) # count calls (one per reference)
self.counter['beaconvariants'] += 1 # count variants (one per alternate)
counter += 1 # count variants (one per vcf row)

if len(batch) >= self.settings.batch_size:
if not self.settings.dry_run:
Expand Down Expand Up @@ -331,6 +361,26 @@ def _insert_variants(self):
if not self.settings.dry_run:
logging.info("Inserted {} variant records in {}".format(counter, self._time_since(start)))

def get_callcount(self, data):
"""Increment the call count by the calls found at this position."""
if data['chrom'] == self.chrom and data['pos'] < self.lastpos:
# If this position is smaller than the last, the file order might be invalid.
# Give a warning, but keep on counting.
msg = "VCF file not ok, variants not given in incremental order. Callcount may not be valid!!!\n\n"
logging.warning(msg)

if data['chrom'] != self.chrom or data['pos'] != self.lastpos:
# We are at a new position, count and save the calls for the last position
self.counter['calls'] += len(self.counter['tmp_calls'])

# reset the counters
self.counter['tmp_calls'] = set()
self.lastpos = data['pos']
self.chrom = data['chrom']

# save the references for this position
self.counter['tmp_calls'].add(data['ref'])

def count_entries(self):
start = time.time()
if self.settings.coverage_file:
Expand Down Expand Up @@ -364,6 +414,8 @@ def start_import(self):
self._set_dataset_info()
if self.settings.variant_file:
self._insert_variants()
if self.settings.count_calls:
self._create_beacon_counts()
if not self.settings.beacon_only and self.settings.coverage_file:
self._insert_coverage()

Expand Down
3 changes: 3 additions & 0 deletions scripts/importer/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@
PARSER.add_argument("-q", "--quiet", action="count", default=0,
help="Decrease output Verbosity.")

PARSER.add_argument("--count_calls", action="store_true",
help=("Count calls for the dataset (used by Beacon)"))

# Beacon-only variants
PARSER.add_argument("--beacon-only", action="store_true",
help=("Variants are intended only for Beacon, loosening"
Expand Down
56 changes: 17 additions & 39 deletions sql/beacon_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ CREATE SCHEMA IF NOT EXISTS beacon;
-- These tables need to be represented as semi-complex views, as to avoid
-- storing redundant data.


CREATE TABLE IF NOT EXISTS beacon.beacon_dataset_counts_table (
datasetid varchar(128) PRIMARY KEY,
dataset integer REFERENCES data.dataset_versions,
callCount integer DEFAULT NULL,
variantCount integer DEFAULT NULL
);


CREATE OR REPLACE VIEW beacon.beacon_dataset_table AS -- original type
SELECT v.id AS index, -- serial
d.short_name AS name, -- varchar(128)
Expand All @@ -43,29 +52,7 @@ CREATE OR REPLACE VIEW beacon.beacon_dataset_table AS -- original type
;


-- This seems to return correct values except that it seems to
-- _always_ return 1 for callCount, even when there's no data.
-- TODO: make sure that callCount can handle zero values.
CREATE OR REPLACE VIEW beacon.beacon_dataset_counts_table AS
SELECT concat_ws(':', r.reference_build,
d.short_name,
v.dataset_version) AS datasetId, -- varchar(128)
COUNT(DISTINCT(dv.chrom,
dv.ref,
dv.pos)) AS callCount, -- integer
COUNT(dv) AS variantCount -- integer
FROM data.datasets as d
JOIN data.dataset_version_current AS v
ON v.dataset = d.id
JOIN data.reference_sets AS r
ON v.reference_set = r.id
LEFT JOIN data.variants AS dv
ON dv.dataset_version = v.id
GROUP BY r.reference_build, d.short_name, v.dataset_version
;


CREATE MATERIALIZED VIEW beacon.beacon_data_table AS
CREATE OR REPLACE VIEW beacon.beacon_data_table AS
SELECT dv.id AS index, -- serial
concat_ws(':', r.reference_build,
d.short_name,
Expand Down Expand Up @@ -95,28 +82,19 @@ CREATE MATERIALIZED VIEW beacon.beacon_data_table AS
--------------------------------------------------------------------------------
-- Beacon views.
--
-- These are kept as-is from the reference.

-- This index is part of the finnish schema, but I deactivated it so that I don't have to materialize the views
-- CREATE UNIQUE INDEX data_conflict ON beacon_data_table (datasetId, chromosome, start, reference, alternate);
-- CREATE UNIQUE INDEX metadata_conflict ON beacon_dataset_table (name, datasetId);
-- This gets really, really slow if not materialized. (TODO why?)

CREATE MATERIALIZED VIEW beacon.dataset_metadata(name, datasetId, description, assemblyId,
createDateTime, updateDateTime, version,
callCount, variantCount, sampleCount, externalUrl, accessType)
CREATE OR REPLACE VIEW beacon.dataset_metadata(name, datasetId, description, assemblyId,
createDateTime, updateDateTime, version,
callCount, variantCount, sampleCount, externalUrl, accessType)
AS SELECT a.name, a.datasetId, a.description, a.assemblyId, a.createDateTime,
a.updateDateTime, a.version, b.callCount,
b.variantCount,
a.sampleCount, a.externalUrl, a.accessType
FROM beacon.beacon_dataset_table a, beacon.beacon_dataset_counts_table b
WHERE a.datasetId=b.datasetId
GROUP BY a.name, a.datasetId, a.description, a.assemblyId, a.createDateTime,
a.updateDateTime, a.version, a.sampleCount, a.externalUrl, a.accessType, b.callCount, b.variantCount;
WHERE a.datasetId=b.datasetId;

--------------------------------------------------------------------------------
-- Indexes
--

CREATE INDEX beacon_data_chrpos ON beacon.beacon_data_table (chromosome,start);
CREATE INDEX beacon_data_chrref ON beacon.beacon_data_table (chromosome,reference);
CREATE INDEX beacon_data_chrpos ON data.variants ((substr(chrom, 1, 2)),(pos-1));
-- Use if needed:
-- CREATE INDEX beacon_data_chrref ON data.variants ((substr(chrom, 1, 2)),ref);

0 comments on commit 3395599

Please sign in to comment.