diff --git a/.gitignore b/.gitignore
index e07e09d6e..1e3537bf3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,13 @@ tornado/static/js/app.min.js
 backend/static
 backend/templates
 static
+# importer and config stuff
+mysql-data*
+scripts/importer/downloaded_files
+# docker stuff
+postgres-data
+# local personal things
+personal
+# travis test remnants
+master-schema.sql
+settings.json.tmp
diff --git a/.travis.yml b/.travis.yml
index e9d8926e4..9323e2edd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,3 +10,8 @@ install:
 - pip install coverage coveralls
 script:
 - test/travis_script.sh
+addons:
+  postgresql: "10"
+  apt:
+    packages:
+      - postgresql-client-10
diff --git a/Dockerfile-backend b/Dockerfile-backend
index 2ccc0b728..3aeb82466 100644
--- a/Dockerfile-backend
+++ b/Dockerfile-backend
@@ -1,13 +1,14 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 RUN apt-get update && apt-get install -y \
     python3 \
-    python3-pip \
-    libmysqlclient-dev
+    python3-pip
 
 ADD . /code
+COPY settings_sample.json /settings.json
+RUN sed -i 's/"postgresHost"\s*:.*,/"postgresHost" : "db",/' /settings.json
 
 WORKDIR /code
 
 RUN pip3 install -r backend/requirements.txt
-CMD ["python3", "backend/route.py", "--develop"]
+CMD ["python3", "backend/route.py", "--develop", "--settings_file", "/settings.json"]
diff --git a/Dockerfile-database b/Dockerfile-database
new file mode 100644
index 000000000..16464f7ab
--- /dev/null
+++ b/Dockerfile-database
@@ -0,0 +1,6 @@
+FROM postgres:10
+
+ENV POSTGRES_DB swefreq
+COPY sql/data_schema.sql /docker-entrypoint-initdb.d/01_data_schema.sql
+COPY sql/user_schema.sql /docker-entrypoint-initdb.d/02_user_schema.sql
+COPY sql/beacon_schema.sql /docker-entrypoint-initdb.d/03_beacon_schema.sql
diff --git a/Dockerfile-frontend-rebuilder b/Dockerfile-frontend-rebuilder
index 444619a1c..32cb88a52 100644
--- a/Dockerfile-frontend-rebuilder
+++ b/Dockerfile-frontend-rebuilder
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 RUN apt-get update && \
     apt-get install -y \
@@ -7,12 +7,12 @@ RUN apt-get update && \
     python3 \
     python3-pip \
     python3-pyinotify \
-    inotify-tools \
-    libmysqlclient-dev && \
+    inotify-tools && \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 5
 
-RUN curl -sL https://deb.nodesource.com/setup_6.x | bash - && \
-    apt-get install -y nodejs
+RUN apt-get install -y \
+    nodejs \
+    npm
 
 ADD . /code
 
 WORKDIR /code
diff --git a/backend/application.py b/backend/application.py
index 74bc47a3a..f67e57c61 100644
--- a/backend/application.py
+++ b/backend/application.py
@@ -259,7 +259,7 @@ def get(self, dataset, version=None):
         for f in dataset_version.files:
             d = db.build_dict_from_row(f)
             d['dirname'] = path.dirname(d['uri'])
-            d['human_size'] = format_bytes(d['bytes'])
+            d['human_size'] = format_bytes(d['file_size'])
             ret.append(d)
         self.finish({'files': ret})
 
@@ -576,7 +576,7 @@ def get(self, dataset):
             return
 
         self.set_header("Content-Type", logo_entry.mimetype)
-        self.write(logo_entry.data)
+        self.write(logo_entry.data.tobytes())
         self.finish()
diff --git a/backend/db.py b/backend/db.py
index ed6d600eb..d3e329c3b 100644
--- a/backend/db.py
+++ b/backend/db.py
@@ -1,28 +1,30 @@
-from peewee import (
-    BlobField,
-    CharField,
-    DateTimeField,
-    Field,
-    FloatField,
-    ForeignKeyField,
-    IntegerField,
-    Model,
-    MySQLDatabase,
-    PrimaryKeyField,
-    TextField,
-    fn,
-    )
-import logging
-import settings
-
-database = MySQLDatabase(
-    settings.mysql_schema,
-    host=settings.mysql_host,
-    user=settings.mysql_user,
-    password=settings.mysql_passwd,
-    port=settings.mysql_port
-    )
+#!/usr/bin/env python3
+import settings
+from peewee import (BigIntegerField,
+                    BlobField,
+                    BooleanField,
+                    CharField,
+                    DateTimeField,
+                    IntegerField,
+                    Field,
+                    FloatField,
+                    ForeignKeyField,
+                    Model,
+                    PostgresqlDatabase,
+                    PrimaryKeyField,
+                    SQL,
+                    TextField,
+                    fn,
+                    )
+from playhouse.postgres_ext import ArrayField, BinaryJSONField, PostgresqlExtDatabase
+
+database = PostgresqlExtDatabase(settings.psql_name,
+                                 user = settings.psql_user,
+                                 password = settings.psql_pass,
+                                 host = settings.psql_host,
+                                 port = settings.psql_port,
+                                 register_hstore = False)
 
 class BaseModel(Model):
     class Meta:
@@ -32,8 +34,8 @@ class Meta:
 class EnumField(Field):
     db_field = 'string' # The same as for CharField
 
-    def __init__(self, values=None, *args, **kwargs):
-        self.values = values or []
+    def __init__(self, choices=None, *args, **kwargs):
+        self.values = choices or []
         super().__init__(*args, **kwargs)
 
     def db_value(self, value):
@@ -46,67 +48,133 @@ def python_value(self, value):
         raise ValueError("Illegal value for '{}'".format(self.db_column))
         return value
 
+###
+# Reference Tables
+##
 
-class User(BaseModel):
-    user = PrimaryKeyField(db_column='user_pk')
-    name = CharField(null=True)
-    email = CharField(unique=True)
-    identity = CharField(unique=True)
-    identity_type = EnumField(null=False, values=['google', 'elixir'])
-    affiliation = CharField(null=True)
-    country = CharField(null=True)
 
-    def is_admin(self, dataset):
-        return DatasetAccess.select().where(
-            DatasetAccess.dataset == dataset,
-            DatasetAccess.user == self,
-            DatasetAccess.is_admin
-        ).count()
+class ReferenceSet(BaseModel):
+    """
+    The gencode, ensembl, dbNSFP and omim data are combined to fill out the
+    Gene, Transcript and Feature tables. DbSNP data is separate, and may be
+    shared between reference sets, so it uses a foreign key instead.
+ """ + class Meta: + db_table = 'reference_sets' + schema = 'data' - def has_access(self, dataset): - return DatasetAccessCurrent.select().where( - DatasetAccessCurrent.dataset == dataset, - DatasetAccessCurrent.user == self, - ).count() + name = CharField(db_column="reference_name", null=True) + ensembl_version = CharField() + gencode_version = CharField() + dbnsfp_version = CharField() - def has_requested_access(self, dataset): - return DatasetAccessPending.select().where( - DatasetAccessPending.dataset == dataset, - DatasetAccessPending.user == self - ).count() +class Gene(BaseModel): class Meta: - db_table = 'user' + db_table = 'genes' + schema = 'data' + + reference_set = ForeignKeyField(ReferenceSet, db_column="reference_set", related_name="genes") + gene_id = CharField(unique=True, max_length=15) + name = CharField(db_column="gene_name", null=True) + full_name = CharField(null=True) + canonical_transcript = CharField(null=True, max_length=15) + chrom = CharField(max_length=10) + start = IntegerField(db_column="start_pos") + stop = IntegerField(db_column="end_pos") + strand = EnumField(choices=['+','-']) + +class GeneOtherNames(BaseModel): + class Meta: + db_table = 'gene_other_names' + schema = 'data' + + gene = ForeignKeyField(Gene, db_column="gene", related_name="other_names") + name = CharField(null=True) + +class Transcript(BaseModel): + class Meta: + db_table = 'transcripts' + schema = 'data' + + transcript_id = CharField(max_length=15) + gene = ForeignKeyField(Gene, db_column="gene", related_name="transcripts") + mim_gene_accession = IntegerField() + mim_annotation = CharField() + chrom = CharField(max_length=10) + start = IntegerField(db_column="start_pos") + stop = IntegerField(db_column="stop_pos") + strand = EnumField(choices = ['+', '-']) + + +class Feature(BaseModel): + class Meta: + db_table = 'features' + schema = 'data' + + gene = ForeignKeyField(Gene, db_column="gene", related_name='exons') + transcript = ForeignKeyField(Transcript, db_column="transcript", related_name='transcripts') + chrom = CharField(max_length=10) + start = IntegerField(db_column="start_pos") + stop = IntegerField(db_column="stop_pos") + strand = EnumField(choices = ['+', '-']) + feature_type = CharField() + +### +# Study and Dataset fields +## + +class Collection(BaseModel): + """ + A collection is a source of data which can be sampled into a SampleSet. + """ + class Meta: + db_table = 'collections' + schema = 'data' + + name = CharField(db_column="study_name", null = True) + ethnicity = CharField(null = True) class Study(BaseModel): - study = PrimaryKeyField(db_column='study_pk') + """ + A study is a scientific study with a PI and a description, and may include + one or more datasets. + """ + class Meta: + db_table = 'studies' + schema = 'data' + pi_name = CharField() pi_email = CharField() contact_name = CharField() contact_email = CharField() title = CharField() - description = TextField(null=True) + description = TextField(db_column="study_description", null=True) publication_date = DateTimeField() ref_doi = CharField(null=True) - class Meta: - db_table = 'study' - class Dataset(BaseModel): - dataset = PrimaryKeyField(db_column='dataset_pk') - study = ForeignKeyField(db_column='study_pk', rel_model=Study, to_field='study', related_name='datasets') + """ + A dataset is part of a study, and usually include a certain population. + Most studies only have a single dataset, but multiple are allowed. 
+ """ + class Meta: + db_table = 'datasets' + schema = 'data' + + study = ForeignKeyField(Study, db_column="study", related_name='datasets') short_name = CharField() full_name = CharField() browser_uri = CharField(null=True) beacon_uri = CharField(null=True) + description = TextField(db_column="beacon_description", null=True) avg_seq_depth = FloatField(null=True) seq_type = CharField(null=True) seq_tech = CharField(null=True) seq_center = CharField(null=True) dataset_size = IntegerField() - mongodb_collection = CharField(null=False) def has_image(self): try: @@ -115,158 +183,270 @@ def has_image(self): except DatasetLogo.DoesNotExist: return False + +class SampleSet(BaseModel): class Meta: - db_table = 'dataset' + db_table = 'sample_sets' + schema = 'data' + + dataset = ForeignKeyField(Dataset, db_column="dataset", related_name='sample_sets') + collection = ForeignKeyField(Collection, db_column="collection", related_name='sample_sets') + sample_size = IntegerField() + phenotype = CharField(null=True) class DatasetVersion(BaseModel): - dataset_version = PrimaryKeyField(db_column='dataset_version_pk') - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='versions') - version = CharField() - description = TextField() + class Meta: + db_table = 'dataset_versions' + schema = 'data' + + dataset = ForeignKeyField(Dataset, db_column="dataset", related_name='versions') + reference_set = ForeignKeyField(ReferenceSet, db_column="reference_set", related_name='dataset_versions') + version = CharField(db_column="dataset_version") + description = TextField(db_column="dataset_description") terms = TextField() var_call_ref = CharField(null=True) available_from = DateTimeField() ref_doi = CharField(null=True) data_contact_name = CharField(null=True) data_contact_link = CharField(null=True) + num_variants = IntegerField(null=True) + coverage_levels = ArrayField(IntegerField, null=True) + +class DatasetFile(BaseModel): class Meta: - db_table = 'dataset_version' + db_table = 'dataset_files' + schema = 'data' + dataset_version = ForeignKeyField(DatasetVersion, db_column="dataset_version", related_name='files') + name = CharField(db_column="basename") + uri = CharField() + file_size = IntegerField() -class Collection(BaseModel): - collection = PrimaryKeyField(db_column = 'collection_pk') - name = CharField(null = True) - ethnicity = CharField(null = True) +class DatasetLogo(BaseModel): class Meta: - db_table = 'collection' + db_table = 'dataset_logos' + schema = 'data' + + dataset = ForeignKeyField(Dataset, db_column="dataset", related_name='logo') + mimetype = CharField() + data = BlobField(db_column="bytes") -class SampleSet(BaseModel): - sample_set = PrimaryKeyField(db_column='sample_set_pk') - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='sample_sets') - collection = ForeignKeyField(db_column='collection_pk', rel_model=Collection, to_field='collection', related_name='sample_sets') - sample_size = IntegerField() - phenotype = CharField(null=True) +### +# Variant and coverage data fields +## +class Variant(BaseModel): + class Meta: + db_table = "variants" + schema = 'data' + + dataset_version = ForeignKeyField(DatasetVersion, db_column="dataset_version", related_name="variants") + rsid = IntegerField() + chrom = CharField(max_length=10) + pos = IntegerField() + ref = CharField() + alt = CharField() + site_quality = FloatField() + orig_alt_alleles = ArrayField(CharField) + hom_count = IntegerField() + 
allele_freq = FloatField() + filter_string = CharField() + variant_id = CharField() + allele_count = IntegerField() + allele_num = IntegerField() + quality_metrics = BinaryJSONField() + vep_annotations = BinaryJSONField() + + +class VariantGenes(BaseModel): class Meta: - db_table = 'sample_set' + db_table = 'variant_genes' + schema = 'data' + variant = ForeignKeyField(Variant, db_column="variant", related_name="genes") + gene = ForeignKeyField(Gene, db_column="gene", related_name="variants") -class DatasetFile(BaseModel): - dataset_file = PrimaryKeyField(db_column='dataset_file_pk') - dataset_version = ForeignKeyField(db_column='dataset_version_pk', rel_model=DatasetVersion, to_field='dataset_version', related_name='files') - name = CharField() - uri = CharField() - bytes = IntegerField() +class VariantTranscripts(BaseModel): class Meta: - db_table = 'dataset_file' + db_table = 'variant_transcripts' + schema = 'data' + variant = ForeignKeyField(Variant, db_column="variant", related_name="transcripts") + transcript = ForeignKeyField(Transcript, db_column="transcript", related_name="variants") -class UserAccessLog(BaseModel): - user_access_log = PrimaryKeyField(db_column='user_access_log_pk') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='access_logs') - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='access_logs') - action = EnumField(null=True, values=['access_requested','access_granted','access_revoked','private_link']) - ts = DateTimeField() +class Coverage(BaseModel): + """ + Coverage statistics are pre-calculated for each variant for a given + dataset. + + The fields show the fraction of a population that reaches the + mapping coverages given by the variable names. + + ex. cov20 = 0.994 means that 99.4% of the population had at a mapping + coverage of at least 20 in this position. 
+ """ class Meta: - db_table = 'user_access_log' + db_table = "coverage" + schema = 'data' + dataset_version = ForeignKeyField(DatasetVersion, db_column="dataset_version") + chrom = CharField(max_length=10) + pos = IntegerField() + mean = FloatField() + median = FloatField() + coverage = ArrayField(FloatField, null=True) -class UserConsentLog(BaseModel): - user_consent_log = PrimaryKeyField(db_column='user_access_log_pk') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='consent_logs') - dataset_version = ForeignKeyField(db_column='dataset_version_pk', rel_model=DatasetVersion, to_field='dataset_version', related_name='consent_logs') - ts = DateTimeField() +class Metrics(BaseModel): class Meta: - db_table = 'user_consent_log' + db_table = "metrics" + schema = 'data' + dataset_version = ForeignKeyField(DatasetVersion, db_column="dataset_version") + metric = CharField() + mids = ArrayField(IntegerField) + hist = ArrayField(IntegerField) -class UserDownloadLog(BaseModel): - user_download_log = PrimaryKeyField(db_column='user_download_log_pk') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='download_logs') - dataset_file = ForeignKeyField(db_column='dataset_file_pk', rel_model=DatasetFile, to_field='dataset_file', related_name='download_logs') - ts = DateTimeField() +class User(BaseModel): class Meta: - db_table = 'user_download_log' + db_table = "users" + schema = 'users' + name = CharField(db_column="username", null=True) + email = CharField(unique=True) + identity = CharField(unique=True) + identity_type = EnumField(null=False, choices=['google', 'elixir']) + affiliation = CharField(null=True) + country = CharField(null=True) -class DatasetAccess(BaseModel): - dataset_access = PrimaryKeyField(db_column='dataset_access_pk') - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='access') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='access') - wants_newsletter = IntegerField(null=True) - is_admin = IntegerField(null=True) + def is_admin(self, dataset): + return DatasetAccess.select().where( + DatasetAccess.dataset == dataset, + DatasetAccess.user == self, + DatasetAccess.is_admin + ).count() + + def has_access(self, dataset): + return DatasetAccessCurrent.select().where( + DatasetAccessCurrent.dataset == dataset, + DatasetAccessCurrent.user == self, + ).count() + def has_requested_access(self, dataset): + return DatasetAccessPending.select().where( + DatasetAccessPending.dataset == dataset, + DatasetAccessPending.user == self + ).count() + + +class SFTPUser(BaseModel): class Meta: - db_table = 'dataset_access' + db_table = "sftp_users" + schema = 'users' + user = ForeignKeyField(User, related_name='sftp_user') + user_uid = IntegerField(unique=True) + user_name = CharField(null=False) + password_hash = CharField(null=False) + account_expires = DateTimeField(null=False) -class DatasetAccessCurrent(DatasetAccess): - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='access_current') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='access_current') - has_access = IntegerField() - access_requested = DateTimeField() +class UserAccessLog(BaseModel): class Meta: - db_table = 'dataset_access_current' + db_table = "user_access_log" + schema = 'users' + user = ForeignKeyField(User, related_name='access_logs') + dataset = 
ForeignKeyField(Dataset, db_column='dataset', related_name='access_logs') + action = EnumField(null=True, choices=['access_granted','access_revoked','access_requested','private_link']) + ts = DateTimeField() -class DatasetAccessPending(DatasetAccess): - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='access_pending') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='access_pending') - has_access = IntegerField() - access_requested = DateTimeField() +class UserConsentLog(BaseModel): class Meta: - db_table = 'dataset_access_pending' + db_table = "user_consent_log" + schema = 'users' + user = ForeignKeyField(User, related_name='consent_logs') + dataset_version = ForeignKeyField(DatasetVersion, db_column='dataset_version', related_name='consent_logs') + ts = DateTimeField() -class DatasetLogo(BaseModel): - dataset_logo = PrimaryKeyField(db_column='dataset_logo_pk') - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='logo') - mimetype = CharField() - data = BlobField() +class UserDownloadLog(BaseModel): class Meta: - db_table = 'dataset_logo' + db_table = "user_download_log" + schema = 'users' + + user = ForeignKeyField(User, related_name='download_logs') + dataset_file = ForeignKeyField(DatasetFile, db_column='dataset_file', related_name='download_logs') + ts = DateTimeField() + + +class DatasetAccess(BaseModel): + class Meta: + db_table = "dataset_access" + schema = 'users' + + dataset = ForeignKeyField(Dataset, db_column='dataset', related_name='access') + user = ForeignKeyField(User, related_name='dataset_access') + wants_newsletter = BooleanField(null=True) + is_admin = BooleanField(null=True) class Linkhash(BaseModel): - linkhash = PrimaryKeyField(db_column='linkhash_pk') - dataset_version = ForeignKeyField(db_column='dataset_version_pk', rel_model=DatasetVersion, to_field='dataset_version', related_name='link_hashes') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='link_hashes') + class Meta: + db_table = "linkhash" + schema = 'users' + + dataset_version = ForeignKeyField(DatasetVersion, db_column='dataset_version', related_name='link_hashes') + user = ForeignKeyField(User, related_name='link_hashes') hash = CharField() expires_on = DateTimeField() +##### +# Views +## + +class DatasetVersionCurrent(DatasetVersion): class Meta: - db_table = 'linkhash' + db_table = 'dataset_version_current' + schema = 'data' + dataset = ForeignKeyField(Dataset, db_column="dataset", related_name='current_version') + reference_set = ForeignKeyField(ReferenceSet, db_column="reference_set", related_name='current_version') -class DatasetVersionCurrent(DatasetVersion): - dataset = ForeignKeyField(db_column='dataset_pk', rel_model=Dataset, to_field='dataset', related_name='current_version') +class DatasetAccessCurrent(DatasetAccess): class Meta: - db_table = 'dataset_version_current' + db_table = 'dataset_access_current' + schema = 'users' + dataset = ForeignKeyField(Dataset, db_column='dataset', related_name='access_current') + user = ForeignKeyField(User, related_name='access_current') + has_access = IntegerField() + access_requested = DateTimeField() -class SFTPUser(BaseModel): - sftp_user = PrimaryKeyField(db_column='sftp_user_pk') - user = ForeignKeyField(db_column='user_pk', rel_model=User, to_field='user', related_name='sftp_user') - user_uid = IntegerField(unique=True) - user_name = CharField(null=False) - 
password_hash = CharField(null=False) - account_expires = DateTimeField(null=False) +class DatasetAccessPending(DatasetAccess): class Meta: - db_table = 'sftp_user' + db_table = 'dataset_access_pending' + schema = 'users' + dataset = ForeignKeyField(Dataset, db_column='dataset', related_name='access_pending') + user = ForeignKeyField(User, related_name='access_pending') + has_access = IntegerField() + access_requested = DateTimeField() + +##### +# Help functions +## def get_next_free_uid(): """ @@ -284,46 +464,32 @@ def get_next_free_uid(): return next_uid - def get_admin_datasets(user): return DatasetAccess.select().where( DatasetAccess.user == user, DatasetAccess.is_admin) - def get_dataset(dataset): dataset = Dataset.select().where( Dataset.short_name == dataset).get() return dataset - def get_dataset_version(dataset, version=None): if version: - try: - dataset_version = (DatasetVersion - .select(DatasetVersion, Dataset) - .join(Dataset) - .where(DatasetVersion.version == version, - Dataset.short_name == dataset)).get() - except DatasetVersion.DoesNotExist: - logging.error("get_dataset_version({}, {}): ".format(dataset, version) + - "cannot retrieve dataset version") - return + dataset_version = (DatasetVersion + .select(DatasetVersion, Dataset) + .join(Dataset) + .where(DatasetVersion.version == version, + Dataset.short_name == dataset)).get() else: - try: - dataset_version = (DatasetVersionCurrent - .select(DatasetVersionCurrent, Dataset) - .join(Dataset) - .where(Dataset.short_name == dataset)).get() - except DatasetVersionCurrent.DoesNotExist: - logging.error("get_dataset_version({}, version=None): ".format(dataset) + - "cannot retrieve dataset version") - return + dataset_version = (DatasetVersionCurrent + .select(DatasetVersionCurrent, Dataset) + .join(Dataset) + .where(Dataset.short_name == dataset)).get() return dataset_version - def build_dict_from_row(row): d = {} - for field in row._meta.sorted_fields: #pylint: disable=protected-access - column = field.db_column - if column.endswith("_pk"): + + for field, value in row.__dict__['_data'].items(): + if field == "id": continue - d[column] = getattr(row, column) + d[field] = value return d diff --git a/backend/requirements.txt b/backend/requirements.txt index 18ccc26e5..2b1534135 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -4,7 +4,6 @@ appdirs==1.4.3 certifi==2017.11.5 chardet==3.0.4 idna==2.6 -mysqlclient==1.3.10 packaging==16.8 peewee==2.9.2 pymongo==3.4.0 @@ -13,3 +12,4 @@ requests==2.18.4 six==1.10.0 tornado==4.5.1 urllib3==1.22 +psycopg2-binary==2.7.5 diff --git a/backend/route.py b/backend/route.py index 1786ae722..ce6e2cf31 100644 --- a/backend/route.py +++ b/backend/route.py @@ -4,6 +4,7 @@ import tornado.web from tornado.options import define, options +import sys import application import handlers import auth @@ -109,6 +110,13 @@ def __init__(self, settings): tornado.web.Application.__init__(self, self.declared_handlers, **settings) if __name__ == '__main__': + # Make sure that the extra option to `settings` isn't upsetting tornado + if '--settings_file' in sys.argv: + flag_index = sys.argv.index('--settings_file') + # first remove flag, then argument + del sys.argv[flag_index] + del sys.argv[flag_index] + tornado.log.enable_pretty_logging() tornado.options.parse_command_line() diff --git a/backend/settings.py b/backend/settings.py index 577b93ef5..249cd91a9 100644 --- a/backend/settings.py +++ b/backend/settings.py @@ -1,9 +1,24 @@ + +import os +import sys import json +import logging + +ARG = 
"--settings_file" +SETTINGS_FILE = "settings.json" +if ARG in sys.argv: + try: + SETTINGS_FILE = sys.argv[sys.argv.index(ARG)+1] + except IndexError: + logging.error("No argument for --settings_file") + sys.exit(1) try: - json_settings_fh = open("settings.json") + current_dir = os.path.dirname(os.path.realpath(__file__)) + json_settings_fh = open(os.path.join(current_dir, SETTINGS_FILE)) except FileNotFoundError: - json_settings_fh = open("../settings.json") + parent_dir = os.path.join(current_dir, os.pardir) + json_settings_fh = open(os.path.join(parent_dir, SETTINGS_FILE)) json_settings = json.load(json_settings_fh) json_settings_fh.close() @@ -17,13 +32,6 @@ ## Generated with base64.b64encode(uuid.uuid4().bytes + uuid.uuid4().bytes) cookie_secret = json_settings["cookieSecret"] -# MySql settings -mysql_host = json_settings["mysqlHost"] -mysql_schema = json_settings["mysqlSchema"] -mysql_user = json_settings["mysqlUser"] -mysql_passwd = json_settings["mysqlPasswd"] -mysql_port = json_settings["mysqlPort"] - # Mongodb settings mongo_host = json_settings["mongoHost"] mongo_port = json_settings["mongoPort"] @@ -31,6 +39,20 @@ mongo_password = json_settings["mongoPassword"] mongo_databases = json_settings["mongoDatabases"] +# PostgreSQL settings +psql_host = json_settings["postgresHost"] +psql_port = json_settings["postgresPort"] +psql_name = json_settings["postgresName"] +psql_user = json_settings["postgresUser"] +psql_pass = json_settings["postgresPass"] + +# MySql settings +mysql_host = json_settings["mysqlHost"] +mysql_schema = json_settings["mysqlSchema"] +mysql_user = json_settings["mysqlUser"] +mysql_passwd = json_settings["mysqlPasswd"] +mysql_port = json_settings["mysqlPort"] + # e-mail config mail_server = json_settings["mailServer"] from_address = json_settings["fromAddress"] diff --git a/backend/test.py b/backend/test.py index 4834e81e8..6eb43006a 100644 --- a/backend/test.py +++ b/backend/test.py @@ -101,7 +101,7 @@ def tearDown(self): try: u = db.User.select(db.User).where(db.User.email==self.USER).get() try: - u.access.get().delete_instance() + u.dataset_access.get().delete_instance() except peewee.PeeweeException: pass try: diff --git a/docker-compose.yml b/docker-compose.yml index 1a861d0f3..498b3c2b1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,14 @@ version: "3.2" services: + db: + build: + context: ./ + dockerfile: Dockerfile-database + restart: on-failure + volumes: + - ./postgres-data:/var/lib/postgresql/data + ports: + - 5432:5432 web: build: context: ./ @@ -13,12 +22,6 @@ services: - type: bind source: . target: /code - db: - image: "mysql:5.7" - volumes: - - type: volume - source: mysql-data-volume - target: /var/lib/mysql/ rebuilder: build: context: ./ @@ -27,8 +30,3 @@ services: - type: bind source: . 
target: /code - - -volumes: - mysql-data-volume: - external: true diff --git a/scripts/add_picture_to_db.py b/scripts/add_picture_to_db.py old mode 100644 new mode 100755 index cf768f738..819c11732 --- a/scripts/add_picture_to_db.py +++ b/scripts/add_picture_to_db.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import re import sys diff --git a/scripts/importer/data_importer/__init__.py b/scripts/importer/data_importer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/importer/data_importer/data_importer.py b/scripts/importer/data_importer/data_importer.py new file mode 100644 index 000000000..2b4f044ab --- /dev/null +++ b/scripts/importer/data_importer/data_importer.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +''' +Baseclass for the data importers +''' + +import os +import sys +import gzip +import time +import logging +import urllib.request + +import MySQLdb + +class DataImporter(): + ''' + Baseclass for the data importers + ''' + BLOCKSIZE = 1024 + + def __init__(self, settings): + self.settings = settings + self.download_dir = settings.data_dir + self.chrom = settings.limit_chrom + self.batch_size = settings.batch_size + self.progress_bar = not settings.disable_progress + self.in_file = None + + def _connect(self, host, user, passwd, database): + try: + logging.info("Connecting to database {}".format(database)) + database = MySQLdb.connect(host=host, + user=user, + passwd=passwd, + db=database) + return database.cursor() + except MySQLdb.Error as error: + logging.error("Error connecting: {}".format(error)) + + def _download(self, base_url, version=None): + """ + Download a file into the download_dir. + """ + url = base_url.format(version=version) + filename = os.path.join(self.download_dir, url.split("/")[-1]) + if not os.path.exists(self.download_dir): + os.makedirs(self.download_dir) + try: + os.stat(filename) + logging.info("Found file: {}, not downloading".format(filename)) + return filename + except FileNotFoundError: + pass + + request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + response = urllib.request.urlopen(request) + filesize = None + if 'Content-length' in response.headers.keys(): + filesize = int(response.headers['Content-length']) + else: + logging.info("response lacks content-length header, but will still download.") + downloaded = 0 + logging.info("Downloading file {}".format(url)) + with open(filename, 'wb') as out: + last_progress = -1.0 + if filesize and self.progress_bar: + last_progress = self._update_progress_bar(downloaded, filesize, last_progress) + + block = response.read(DataImporter.BLOCKSIZE) + + while block: + downloaded += len(block) + if self.progress_bar and logging.getLogger().getEffectiveLevel() < 30 and filesize: + last_progress = self._update_progress_bar(downloaded, filesize, last_progress) + out.write(block) + block = response.read(DataImporter.BLOCKSIZE) + response.close() + if self.progress_bar and logging.getLogger().getEffectiveLevel() < 30 and filesize: + last_progress = self._update_progress_bar(downloaded, filesize, + last_progress, finished=True) + return filename + + def _download_and_open(self, base_url, version=None): + """ + Downloads a file and returns an open file handle + """ + filename = self._download(base_url, version) + return self._open(filename) + + def _open(self, filename): + try: + logging.debug("Opening file {}".format(filename)) + return gzip.open(filename, 'rb') if filename.endswith(".gz") else open(filename) + except IOError as 
error: + logging.error("IOERROR: {}".format(error)) + + def _time_format(self, seconds): + hour, rem = divmod(seconds, 3600) + mins, secs = divmod(rem, 60) + retval = "" + if hour: + retval += "{:d} hours, ".format(int(hour)) + if mins: + retval += "{:d} mins, ".format(int(mins)) + retval += "{:3.1f} secs".format(secs) + return retval + + def _time_since(self, start): + return self._time_format(time.time() - start) + + def _time_to(self, start, progress=0.01): + return self._time_format((time.time() - start)/progress) + + def _update_progress_bar(self, current_count, total, last_progress, finished=False): + if not finished: + progress = current_count/total + else: + progress = 1.001 + if last_progress < 0: + if logging.getLogger().getEffectiveLevel() < 30: + sys.stderr.write("".join(["{:<10}".format(i) for i in range(0, 101, 10)]) + "\n") + sys.stderr.write("| ------- "*10 + "|\n") + last_progress = 0 + while progress > last_progress + 0.01: + last_progress += 0.01 + sys.stderr.write("=") + sys.stderr.flush() + if finished: + sys.stderr.write("\n") + return last_progress diff --git a/scripts/importer/data_importer/old_db.py b/scripts/importer/data_importer/old_db.py new file mode 100644 index 000000000..7decaf19d --- /dev/null +++ b/scripts/importer/data_importer/old_db.py @@ -0,0 +1,303 @@ +from peewee import ( + BlobField, + CharField, + DateTimeField, + Field, + FloatField, + ForeignKeyField, + IntegerField, + Model, + MySQLDatabase, + PrimaryKeyField, + TextField, + fn, + ) +import settings + +mysql_database = MySQLDatabase( + settings.mysql_schema, + host=settings.mysql_host, + user=settings.mysql_user, + password=settings.mysql_passwd, + port=settings.mysql_port + ) + + +class MySQLModel(Model): + class Meta: + database = mysql_database + + +class EnumField(Field): + db_field = 'string' # The same as for CharField + + def __init__(self, values=None, *args, **kwargs): + self.values = values or [] + super().__init__(*args, **kwargs) + + def db_value(self, value): + if value not in self.values: + raise ValueError("Illegal value for '{}'".format(self.db_column)) + return value + + def python_value(self, value): + if value not in self.values: + raise ValueError("Illegal value for '{}'".format(self.db_column)) + return value + + +class User(MySQLModel): + user = PrimaryKeyField(db_column='user_pk') + name = CharField(null=True) + email = CharField(unique=True) + identity = CharField(unique=True) + identity_type = EnumField(null=False, values=['google', 'elixir']) + affiliation = CharField(null=True) + country = CharField(null=True) + + def is_admin(self, dataset): + return (DatasetAccess.select() + .where(DatasetAccess.dataset == dataset, + DatasetAccess.user == self, + DatasetAccess.is_admin) + .count()) + + def has_access(self, dataset): + return DatasetAccessCurrent.select().where( + DatasetAccessCurrent.dataset == dataset, + DatasetAccessCurrent.user == self, + ).count() + + def has_requested_access(self, dataset): + return DatasetAccessPending.select().where( + DatasetAccessPending.dataset == dataset, + DatasetAccessPending.user == self + ).count() + + class Meta: + db_table = 'user' + + +class Study(MySQLModel): + study = PrimaryKeyField(db_column='study_pk') + pi_name = CharField() + pi_email = CharField() + contact_name = CharField() + contact_email = CharField() + title = CharField() + description = TextField(null=True) + publication_date = DateTimeField() + ref_doi = CharField(null=True) + + class Meta: + db_table = 'study' + + +class Dataset(MySQLModel): + dataset = 
PrimaryKeyField(db_column='dataset_pk') + study = ForeignKeyField(Study, db_column='study_pk', to_field='study', related_name='datasets') + short_name = CharField() + full_name = CharField() + browser_uri = CharField(null=True) + beacon_uri = CharField(null=True) + avg_seq_depth = FloatField(null=True) + seq_type = CharField(null=True) + seq_tech = CharField(null=True) + seq_center = CharField(null=True) + dataset_size = IntegerField() + mongodb_collection = CharField(null=False) + + def has_image(self): + try: + DatasetLogo.get(DatasetLogo.dataset == self) + return True + except DatasetLogo.DoesNotExist: + return False + + class Meta: + db_table = 'dataset' + + +class DatasetVersion(MySQLModel): + dataset_version = PrimaryKeyField(db_column='dataset_version_pk') + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='versions') + version = CharField() + description = TextField() + terms = TextField() + var_call_ref = CharField(null=True) + available_from = DateTimeField() + ref_doi = CharField(null=True) + data_contact_name = CharField(null=True) + data_contact_link = CharField(null=True) + + class Meta: + db_table = 'dataset_version' + + +class Collection(MySQLModel): + collection = PrimaryKeyField(db_column = 'collection_pk') + name = CharField(null = True) + ethnicity = CharField(null = True) + + class Meta: + db_table = 'collection' + + +class SampleSet(MySQLModel): + sample_set = PrimaryKeyField(db_column='sample_set_pk') + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='sample_sets') + collection = ForeignKeyField(Collection, db_column='collection_pk', to_field='collection', related_name='sample_sets') + sample_size = IntegerField() + phenotype = CharField(null=True) + + class Meta: + db_table = 'sample_set' + + +class DatasetFile(MySQLModel): + dataset_file = PrimaryKeyField(db_column='dataset_file_pk') + dataset_version = ForeignKeyField(DatasetVersion, db_column='dataset_version_pk', to_field='dataset_version', related_name='files') + name = CharField() + uri = CharField() + bytes = IntegerField() + + class Meta: + db_table = 'dataset_file' + + +class UserAccessLog(MySQLModel): + user_access_log = PrimaryKeyField(db_column='user_access_log_pk') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='access_logs') + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='access_logs') + action = EnumField(null=True, values=['access_requested','access_granted','access_revoked','private_link']) + ts = DateTimeField() + + class Meta: + db_table = 'user_access_log' + + +class UserConsentLog(MySQLModel): + user_consent_log = PrimaryKeyField(db_column='user_consent_log_pk') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='consent_logs') + dataset_version = ForeignKeyField(DatasetVersion, db_column='dataset_version_pk', to_field='dataset_version', related_name='consent_logs') + ts = DateTimeField() + + class Meta: + db_table = 'user_consent_log' + + +class UserDownloadLog(MySQLModel): + user_download_log = PrimaryKeyField(db_column='user_download_log_pk') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='download_logs') + dataset_file = ForeignKeyField(DatasetFile, db_column='dataset_file_pk', to_field='dataset_file', related_name='download_logs') + ts = DateTimeField() + + class Meta: + db_table = 'user_download_log' + + +class DatasetAccess(MySQLModel): + dataset_access = 
PrimaryKeyField(db_column='dataset_access_pk') + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='access') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='access') + wants_newsletter = IntegerField(null=True) + is_admin = IntegerField(null=True) + + class Meta: + db_table = 'dataset_access' + + +class DatasetAccessCurrent(DatasetAccess): + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='access_current') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='access_current') + has_access = IntegerField() + access_requested = DateTimeField() + + class Meta: + db_table = 'dataset_access_current' + + +class DatasetAccessPending(DatasetAccess): + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='access_pending') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='access_pending') + has_access = IntegerField() + access_requested = DateTimeField() + + class Meta: + db_table = 'dataset_access_pending' + + +class DatasetLogo(MySQLModel): + dataset_logo = PrimaryKeyField(db_column='dataset_logo_pk') + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='logo') + mimetype = CharField() + data = BlobField() + + class Meta: + db_table = 'dataset_logo' + + +class Linkhash(MySQLModel): + linkhash = PrimaryKeyField(db_column='linkhash_pk') + dataset_version = ForeignKeyField(DatasetVersion, db_column='dataset_version_pk', to_field='dataset_version', related_name='link_hashes') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='link_hashes') + hash = CharField() + expires_on = DateTimeField() + + class Meta: + db_table = 'linkhash' + + +class DatasetVersionCurrent(DatasetVersion): + dataset = ForeignKeyField(Dataset, db_column='dataset_pk', to_field='dataset', related_name='current_version') + + class Meta: + db_table = 'dataset_version_current' + + +class SFTPUser(MySQLModel): + sftp_user = PrimaryKeyField(db_column='sftp_user_pk') + user = ForeignKeyField(User, db_column='user_pk', to_field='user', related_name='sftp_user') + user_uid = IntegerField(unique=True) + user_name = CharField(null=False) + password_hash = CharField(null=False) + account_expires = DateTimeField(null=False) + + class Meta: + db_table = 'sftp_user' + + +def get_next_free_uid(): + """ + Returns the next free uid >= 10000, and higher than the current uid's + from the sftp_user table in the database. 
+ """ + default = 10000 + next_uid = default + try: + current_max_uid = SFTPUser.select(fn.MAX(SFTPUser.user_uid)).get().user_uid + if current_max_uid: + next_uid = current_max_uid+1 + except SFTPUser.DoesNotExist: + pass + + return next_uid + + +def get_admin_datasets(user): + return DatasetAccess.select().where( DatasetAccess.user == user, DatasetAccess.is_admin) + + +def get_dataset(dataset): + dataset = Dataset.select().where( Dataset.short_name == dataset).get() + return dataset + + +def build_dict_from_row(row): + d = {} + for field in row._meta.sorted_fields: #pylint: disable=protected-access + column = field.db_column + if column.endswith("_pk"): + continue + d[column] = getattr(row, column) + return d diff --git a/scripts/importer/data_importer/old_db_importer.py b/scripts/importer/data_importer/old_db_importer.py new file mode 100644 index 000000000..a1beb077c --- /dev/null +++ b/scripts/importer/data_importer/old_db_importer.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python3 + +import sys +import time +import logging +import db +from peewee import OperationalError, InterfaceError +from . import old_db + +from .data_importer import DataImporter + +class OldDbImporter( DataImporter ): + + def __init__(self, settings): + super().__init__(settings) + self.reference_sets = [] + self.id_map = {'collection':{}, + 'study':{}, + 'dataset':{}, + 'dataset_version':{}, + 'dataset_file':{}, + 'user':{} + } + + def _select_reference_set(self, short_name): + if len(self.reference_sets) == 1: + logging.info(("Only one reference set is available, %s, " + "will default to this set for all datasets"), + self.reference_sets[0].name) + return self.reference_sets[0].id + elif short_name.lower() in [r.name.lower() for r in self.reference_sets]: + refset = [r for r in self.reference_sets if r.name.lower() == short_name.lower()][0] + logging.info("Auto-selecting reference set '%s' based on name.", refset.name) + return refset + else: + print("Select a reference set to use with this dataset") + retval = -1 + while retval not in [r.id for r in self.reference_sets]: + for reference_set in self.reference_sets: + print(" {} : {}".format(reference_set.id, reference_set.name)) + try: + retval = int(input("Please select a reference: ")) + except ValueError: + print("Please select a number in {}".format([r.id for r in self.reference_sets])) + return retval + + def _move_collections(self): + logging.info("Moving Collections") + for collection in old_db.Collection.select(): + logging.info(" - Moving '{}'".format(collection.name)) + + try: + new_id = db.Collection.get(name = collection.name, + ethnicity = collection.ethnicity).id + except db.Collection.DoesNotExist: + if self.settings.dry_run: + continue + new_id = (db.Collection + .insert(name = collection.name, + ethnicity = collection.ethnicity) + .execute()) + + self.id_map['collection'][collection.collection] = new_id + + def _move_studies(self): + logging.info("Moving Studies") + for study in old_db.Study.select(): + logging.info(" - Moving '{}'".format(study.title)) + + try: + new_id = db.Study.get(pi_name = study.pi_name, + pi_email = study.pi_email, + contact_name = study.contact_name, + contact_email = study.contact_email, + title = study.title, + description = study.description, + publication_date = study.publication_date, + ref_doi = study.ref_doi).id + except db.Study.DoesNotExist: + if self.settings.dry_run: + continue + new_id = (db.Study + .insert(pi_name = study.pi_name, + pi_email = study.pi_email, + contact_name = study.contact_name, + contact_email = 
study.contact_email, + title = study.title, + description = study.description, + publication_date = study.publication_date, + ref_doi = study.ref_doi) + .execute()) + + self.id_map['study'][study.study] = new_id + + def _move_datasets(self): + logging.info("Moving Datasets") + for dataset in old_db.Dataset.select(): + logging.info(" - Moving '{}'".format(dataset.short_name)) + try: + study_ref_id = self.id_map['study'][dataset.study.study] + except KeyError: + if not self.settings.dry_run: + raise + study_ref_id = -1 + try: + # short_name is unique, so we only really need to check that. + new_id = db.Dataset.get(study = study_ref_id, + short_name = dataset.short_name).id + except db.Dataset.DoesNotExist: + if self.settings.dry_run: + continue + new_id = (db.Dataset + .insert(study = study_ref_id, + short_name = dataset.short_name, + full_name = dataset.full_name, + browser_uri = dataset.browser_uri, + beacon_uri = dataset.beacon_uri, + avg_seq_depth = dataset.avg_seq_depth, + seq_type = dataset.seq_type, + seq_tech = dataset.seq_tech, + seq_center = dataset.seq_center, + dataset_size = dataset.dataset_size) + .execute()) + + self.id_map['dataset'][dataset.dataset] = new_id + + def _move_dataset_logos(self): + logging.info("Moving Dataset Logos") + for dataset_file in old_db.DatasetLogo.select(): + try: + dataset_ref_id = self.id_map['dataset'][dataset_file.dataset.dataset] + except KeyError: + if not self.settings.dry_run: + raise + dataset_ref_id = -1 + try: + db.DatasetLogo.get(dataset = dataset_ref_id, + mimetype = dataset_file.mimetype, + data = dataset_file.data) + except db.DatasetLogo.DoesNotExist: + if self.settings.dry_run: + continue + db.DatasetLogo.insert(dataset = dataset_ref_id, + mimetype = dataset_file.mimetype, + data = dataset_file.data).execute() + + def _move_dataset_versions(self): + logging.info("Moving Dataset Versions") + for dataset_version in old_db.DatasetVersion.select(): + try: + dataset_ref_id = self.id_map['dataset'][dataset_version.dataset.dataset] + dataset = db.Dataset.get(id = dataset_ref_id) + except KeyError: + if not self.settings.dry_run: + raise + dataset_ref_id = -1 + try: + new_id = db.DatasetVersion.get(dataset = dataset_ref_id, + version = dataset_version.version, + description = dataset_version.description, + terms = dataset_version.terms, + var_call_ref = dataset_version.var_call_ref, + available_from = dataset_version.available_from, + ref_doi = dataset_version.ref_doi, + data_contact_name = dataset_version.data_contact_name, + data_contact_link = dataset_version.data_contact_link).id + except db.DatasetVersion.DoesNotExist: + target_reference_id = self._select_reference_set(dataset.short_name) + if self.settings.dry_run: + continue + new_id = (db.DatasetVersion + .insert(dataset = dataset_ref_id, + reference_set = target_reference_id, + version = dataset_version.version, + description = dataset_version.description, + terms = dataset_version.terms, + var_call_ref = dataset_version.var_call_ref, + available_from = dataset_version.available_from, + ref_doi = dataset_version.ref_doi, + data_contact_name = dataset_version.data_contact_name, + data_contact_link = dataset_version.data_contact_link, + coverage_levels = [1,5,10,15,20,25,30,50,100] + ) + .execute()) + + self.id_map['dataset_version'][dataset_version.dataset_version] = new_id + + def _move_dataset_files(self): + logging.info("Moving Dataset Files") + for dataset_file in old_db.DatasetFile.select(): + logging.info(" - Moving '{}'".format(dataset_file.name)) + try: + 
dataset_version_ref_id = self.id_map['dataset_version'][dataset_file.dataset_version.dataset_version] + except KeyError: + if not self.settings.dry_run: + raise + dataset_version_ref_id = -1 + try: + new_id = db.DatasetFile.get(dataset_version = dataset_version_ref_id, + name = dataset_file.name, + uri = dataset_file.uri, + file_size = dataset_file.bytes).id + except db.DatasetFile.DoesNotExist: + if self.settings.dry_run: + continue + new_id = (db.DatasetFile + .insert(dataset_version = dataset_version_ref_id, + name = dataset_file.name, + uri = dataset_file.uri, + file_size = dataset_file.bytes).execute()) + + self.id_map['dataset_file'][dataset_file.dataset_file] = new_id + + def _move_sample_sets(self): + logging.info("Moving Sample Sets") + for sample_set in old_db.SampleSet.select(): + try: + dataset_ref_id = self.id_map['dataset'][sample_set.dataset.dataset] + collection_ref_id = self.id_map['collection'][sample_set.collection.collection] + except KeyError: + if not self.settings.dry_run: + raise + dataset_ref_id = -1 + collection_ref_id = -1 + try: + db.SampleSet.get(dataset = dataset_ref_id, + collection = collection_ref_id, + sample_size = sample_set.sample_size, + phenotype = sample_set.phenotype) + except db.SampleSet.DoesNotExist: + if self.settings.dry_run: + continue + db.SampleSet.insert(dataset = dataset_ref_id, + collection = collection_ref_id, + sample_size = sample_set.sample_size, + phenotype = sample_set.phenotype).execute() + + def _move_database(self): + self._move_collections() + self._move_studies() + self._move_datasets() + self._move_dataset_logos() + self._move_dataset_versions() + self._move_sample_sets() + self._move_dataset_files() + + def _move_users(self): + logging.info("Moving Users") + for user in old_db.User.select(): + try: + new_id = (db.User + .get(name = user.name, + email = user.email, + identity = user.identity, + identity_type = user.identity_type, + affiliation = user.affiliation, + country = user.country).id) + except db.User.DoesNotExist: + if self.settings.dry_run: + continue + new_id = (db.User + .insert(name = user.name, + email = user.email, + identity = user.identity, + identity_type = user.identity_type, + affiliation = user.affiliation, + country = user.country).execute()) + + self.id_map['user'][user.user] = new_id + + def _move_sftp_users(self): + logging.info("Moving SFTP Users") + for user in old_db.SFTPUser.select(): + try: + user_ref_id = self.id_map['user'][user.user.user] + except KeyError: + if not self.settings.dry_run: + raise + user_ref_id = -1 + try: + # user_uid is unique, so we rely on that + db.SFTPUser.get(user = user_ref_id, + user_uid = user.user_uid) + except db.SFTPUser.DoesNotExist: + if self.settings.dry_run: + continue + db.SFTPUser.insert(user = user_ref_id, + user_uid = user.user_uid, + user_name = user.user_name, + password_hash = user.password_hash, + account_expires = user.account_expires).execute() + + def _move_user_access_logs(self): + logging.info("Moving User Access Logs") + for log in old_db.UserAccessLog.select(): + try: + user_ref_id = self.id_map['user'][log.user.user] + dataset_ref_id = self.id_map['dataset'][log.dataset.dataset] + except KeyError: + if not self.settings.dry_run: + raise + user_ref_id = -1 + dataset_ref_id = -1 + try: + db.UserAccessLog.get(user = user_ref_id, + dataset = dataset_ref_id, + action = log.action, + ts = log.ts) + except db.UserAccessLog.DoesNotExist: + if self.settings.dry_run: + continue + db.UserAccessLog.insert(user = user_ref_id, + dataset = dataset_ref_id, + 
action = log.action, + ts = log.ts).execute() + + def _move_user_consent_logs(self): + logging.info("Moving User Consent Logs") + for log in old_db.UserConsentLog.select(): + try: + user_ref_id = self.id_map['user'][log.user.user] + version_ref_id = self.id_map['dataset_version'][log.dataset_version.dataset_version] + except KeyError: + if not self.settings.dry_run: + raise + user_ref_id = -1 + version_ref_id = -1 + try: + db.UserConsentLog.get(user = user_ref_id, + dataset_version = version_ref_id, + ts = log.ts) + except db.UserConsentLog.DoesNotExist: + if self.settings.dry_run: + continue + db.UserConsentLog.insert(user = user_ref_id, + dataset_version = version_ref_id, + ts = log.ts).execute() + + def _move_user_download_logs(self): + logging.info("Moving User Download Logs") + for log in old_db.UserDownloadLog.select(): + try: + user_ref_id = self.id_map['user'][log.user.user] + file_ref_id = self.id_map['dataset_file'][log.dataset_file.dataset_file] + except KeyError: + if not self.settings.dry_run: + raise + user_ref_id = -1 + file_ref_id = -1 + try: + db.UserDownloadLog.get(user = user_ref_id, + dataset_file = file_ref_id, + ts = log.ts) + except db.UserDownloadLog.DoesNotExist: + if self.settings.dry_run: + continue + db.UserDownloadLog.insert(user = user_ref_id, + dataset_file = file_ref_id, + ts = log.ts).execute() + + def _move_dataset_access(self): + logging.info("Moving Dataset Access Records") + for access in old_db.DatasetAccess.select(): + try: + user_ref_id = self.id_map['user'][access.user.user] + dataset_ref_id = self.id_map['dataset'][access.dataset.dataset] + except KeyError: + if not self.settings.dry_run: + raise + user_ref_id = -1 + dataset_ref_id = -1 + try: + db.DatasetAccess.get(dataset = dataset_ref_id, + user = user_ref_id, + wants_newsletter = access.wants_newsletter, + is_admin = access.is_admin) + except db.DatasetAccess.DoesNotExist: + if self.settings.dry_run: + continue + db.DatasetAccess.insert(dataset = dataset_ref_id, + user = user_ref_id, + wants_newsletter = access.wants_newsletter, + is_admin = access.is_admin).execute() + + def _move_linkhashes(self): + logging.info("Moving Linkhashes") + for linkhash in old_db.Linkhash.select(): + try: + user_ref_id = self.id_map['user'][linkhash.user.user] + version_ref_id = self.id_map['dataset_version'][linkhash.dataset_version.dataset_version] + except KeyError: + if not self.settings.dry_run: + raise + user_ref_id = -1 + version_ref_id = -1 + try: + db.Linkhash.get(dataset_version = version_ref_id, + user = user_ref_id, + hash = linkhash.hash, + expires_on = linkhash.expires_on) + except db.Linkhash.DoesNotExist: + if self.settings.dry_run: + continue + db.Linkhash.insert(dataset_version = version_ref_id, + user = user_ref_id, + hash = linkhash.hash, + expires_on = linkhash.expires_on).execute() + + def _move_userbase(self): + self._move_users() + self._move_sftp_users() + self._move_user_access_logs() + self._move_user_consent_logs() + self._move_user_download_logs() + self._move_dataset_access() + self._move_linkhashes() + + def prepare_data(self): + """ + Connects to the old and new databases. 
+ """ + logging.info("Checking connection to old database") + try: + old_db.Collection.get() + except OperationalError: + logging.error("Could not connect to old database") + sys.exit(1) + logging.info("Checking connection to new database") + try: + db.ReferenceSet.get() + for reference_set in db.ReferenceSet.select(): + self.reference_sets += [reference_set] + except db.ReferenceSet.DoesNotExist: + logging.error(("Connection works, but no reference sets are available." + "use '--add_reference' to add a new reference set and" + "Then use this tool again.")) + sys.exit(1) + except (OperationalError, InterfaceError): + logging.error("Could not connect to new database") + sys.exit(1) + + def start_import(self): + start = time.time() + self._move_database() + self._move_userbase() + + logging.info("Moved data in {}".format(self._time_since(start))) diff --git a/scripts/importer/data_importer/raw_data_importer.py b/scripts/importer/data_importer/raw_data_importer.py new file mode 100644 index 000000000..e7d04e175 --- /dev/null +++ b/scripts/importer/data_importer/raw_data_importer.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +import re +import sys +import time +import logging + +import db +from .data_importer import DataImporter + +METRICS = [ + 'BaseQRankSum', + 'ClippingRankSum', + 'DP', + 'FS', + 'InbreedingCoeff', + 'MQ', + 'MQRankSum', + 'QD', + 'ReadPosRankSum', + 'VQSLOD' +] + +def get_minimal_representation(pos, ref, alt): + """ + Get the minimal representation of a variant, based on the ref + alt alleles in a VCF + This is used to make sure that multiallelic variants in different datasets, + with different combinations of alternate alleles, can always be matched directly. + + Note that chromosome is ignored here - in xbrowse, we'll probably be dealing with 1D coordinates + Args: + pos (int): genomic position in a chromosome (1-based) + ref (str): ref allele string + alt (str): alt allele string + Returns: + tuple: (pos, ref, alt) of remapped coordinate + """ + pos = int(pos) + # If it's a simple SNV, don't remap anything + if len(ref) == 1 and len(alt) == 1: + return pos, ref, alt + # strip off identical suffixes + while(alt[-1] == ref[-1] and min(len(alt), len(ref)) > 1): + alt = alt[:-1] + ref = ref[:-1] + # strip off identical prefixes and increment position + while(alt[0] == ref[0] and min(len(alt), len(ref)) > 1): + alt = alt[1:] + ref = ref[1:] + pos += 1 + return pos, ref, alt + + +class RawDataImporter(DataImporter): + def __init__(self, settings): + super().__init__(settings) + self.dataset_version = None + self.dataset = None + self.sampleset = None + self.counter = {'coverage':None, + 'variants':None} + + def _set_dataset_info(self): + """ Save dataset information given as parameters """ + if self.settings.beacon_description: + self.dataset.description = self.settings.beacon_description + self.dataset.save() + if self.settings.assembly_id: + self.dataset_version.var_call_ref = self.settings.assembly_id + self.dataset_version.save() + if self.settings.sampleset_size: + self.sampleset.sample_size = self.settings.sampleset_size + self.sampleset.save() + if self.settings.dataset_size: + self.dataset.dataset_size = self.settings.dataset_size + self.dataset.save() + + def _select_dataset_version(self): + datasets = [] + + # Make sure that the dataset exists + try: + ds = db.Dataset.get(short_name=self.settings.dataset) + except db.Dataset.DoesNotExist: + logging.error("Unknown dataset '%s'", self.settings.dataset) + logging.info("Available datasets are:") + for dataset in 
db.Dataset.select(): + logging.info(" * %s", dataset.short_name) + sys.exit(1) + logging.info("Using dataset {}".format(ds.short_name)) + self.dataset = ds + + versions = [v for v in db.DatasetVersion.select().where(db.DatasetVersion.dataset == ds)] + + # Make sure that the dataset version exists + if not versions: + raise db.DatasetVersion.DoesNotExist("No versions exist for this dataset") + + if self.settings.version not in [v.version for v in versions]: + logging.error("Unknown version '%s' for dataset '%s'.", self.settings.version, self.dataset.short_name) + logging.info("Available versions are:") + for version in versions: + logging.info(" * %s", version.version) + sys.exit(1) + self.dataset_version = [v for v in versions if v.version == self.settings.version][0] + + # Set the sample set's sample size + if self.settings.set_vcf_sampleset_size or self.settings.sampleset_size: + try: + samplesets = db.SampleSet.select() + self.sampleset = [s for s in samplesets if s.dataset.id == self.dataset.id][0] + except IndexError: + logging.warning("No sample set found for data set {}".format(self.dataset.id)) + logging.warning("Sample size will not be set") + self.settings.set_vcf_sampleset_size = False + self.settings.sampleset_size = 0 + + def _insert_coverage(self): + """ + Header columns are chromosome, position, mean coverage, median coverage, + and then coverage under 1, 5 10, 15, 20, 25, 30, 50, 100. + """ + start = time.time() + header = [('chrom', str), ('pos', int), ('mean', float), + ('median', float), ('cov1', float), ('cov5', float), + ('cov10', float), ('cov15', float), ('cov20', float), ('cov25', float), + ('cov30', float), ('cov50', float), ('cov100', float)] + logging.info("Inserting Coverage") + batch = [] + last_progress = -1.0 + counter = 0 + with db.database.atomic(): + for filename in self.settings.coverage_file: + for line in self._open(filename): + line = bytes(line).decode('utf8').strip() + if line.startswith("#"): + continue + + data = {} + for i, item in enumerate(line.strip().split("\t")): + if i == 0: + data['dataset_version'] = self.dataset_version + data[header[i][0]] = header[i][1](item) + + # re-format coverage for batch + data['coverage'] = [data['cov1'], data['cov5'], data['cov10'], + data['cov15'], data['cov20'], data['cov25'], + data['cov30'], data['cov50'], data['cov100']] + del data['cov1'] + del data['cov5'] + del data['cov10'] + del data['cov15'] + del data['cov20'] + del data['cov25'] + del data['cov30'] + del data['cov50'] + del data['cov100'] + + if self.counter['coverage'] is not None: + counter += 1 + + batch += [data] + if len(batch) >= self.settings.batch_size: + if not self.settings.dry_run: + db.Coverage.insert_many(batch).execute() + batch = [] + # Update progress + if self.counter['coverage'] is not None: + last_progress = self._update_progress_bar(counter, self.counter['coverage'], last_progress) + if batch and not self.settings.dry_run: + db.Coverage.insert_many(batch).execute() + if self.counter['coverage'] is not None: + last_progress = self._update_progress_bar(counter, self.counter['coverage'], last_progress, finished=True) + if not self.settings.dry_run: + logging.info("Inserted {} coverage records in {}".format(counter, self._time_since(start))) + + def _insert_variants(self): + """ + Insert variants from a VCF file + """ + logging.info("Inserting variants%s", " (dry run)" if self.settings.dry_run else "") + header = [("chrom", str), ("pos", int), ("rsid", str), ("ref", str), + ("alt", str), ("site_quality", float), ("filter_string", 
str)] + start = time.time() + batch = [] + genes = [] + transcripts = [] + + last_progress = -1.0 + counter = 0 + samples = 0 + vep_field_names = None + dp_mids = None + gq_mids = None + with db.database.atomic(): + for filename in self.settings.variant_file: + # Get reference set for the variant + ref_set = self.dataset_version.reference_set + + # Get all genes and transcripts for foreign keys + ref_genes = {gene.gene_id: gene.id for gene in (db.Gene.select(db.Gene.id, db.Gene.gene_id) + .where(db.Gene.reference_set == ref_set))} + ref_transcripts = {tran.transcript_id: tran.id for tran in (db.Transcript + .select(db.Transcript.id, + db.Transcript.transcript_id) + .join(db.Gene) + .where(db.Gene.reference_set == ref_set))} + for line in self._open(filename): + line = bytes(line).decode('utf8').strip() + + if line.startswith("#"): + # Check for some information that we need + if line.startswith('##INFO=').split('|') + if line.startswith('##INFO=').split('|')) + if line.startswith('##INFO=').split('|')) + if line.startswith('#CHROM'): + samples = len(line.split('\t')[9:]) + continue + + if not self.settings.beacon_only: + if vep_field_names is None: + logging.error("VEP_field_names is empty. Make sure VCF header is present.") + sys.exit(1) + + base = {} + for i, item in enumerate(line.strip().split("\t")): + if i == 0: + base['dataset_version'] = self.dataset_version + if i < 7: + base[header[i][0]] = header[i][1](item) + elif i == 7 or not self.settings.beacon_only: + # only parse column 7 (maybe also for non-beacon-import?) + info = dict([(x.split('=', 1)) if '=' in x else (x, x) for x in re.split(';(?=\w)', item)]) + + if base["chrom"].startswith('GL') or base["chrom"].startswith('MT'): + continue + + consequence_array = info['CSQ'].split(',') if 'CSQ' in info else [] + if not self.settings.beacon_only: + annotations = [dict(zip(vep_field_names, x.split('|'))) for x in consequence_array if len(vep_field_names) == len(x.split('|'))] + + alt_alleles = base['alt'].split(",") + if base['rsid'].startswith('rs'): + rsids = [int(rsid.strip('rs')) for rsid in base['rsid'].split(';')] + else: + rsids = [None] + + try: + hom_counts = [int(info['AC_Hom'])] + except KeyError: + hom_counts = None # null is better than 0, as 0 has a meaning + except ValueError: + hom_counts = [int(count) for count in info['AC_Hom'].split(',')] + + fmt_alleles = ['{}-{}-{}-{}' + .format(base['chrom'], + *get_minimal_representation(base['pos'], + base['ref'], + x)) + for x in alt_alleles] + + for i, alt in enumerate(alt_alleles): + if not self.settings.beacon_only: + vep_annotations = [ann for ann in annotations if int(ann['ALLELE_NUM']) == i + 1] + + data = dict(base) + data['pos'], data['ref'], data['alt'] = get_minimal_representation(base['pos'], base['ref'], alt) + data['orig_alt_alleles'] = fmt_alleles + + if len(rsids) <= i: + data['rsid'] = rsids[-1] # same id as the last alternate + else: + data['rsid'] = rsids[i] + + an, ac = 'AN_Adj', 'AC_Adj' + if 'AN_Adj' not in info: + an = 'AN' + if 'AC_Adj' not in info: + ac = 'AC' + + data['allele_num'] = int(info[an]) + data['allele_freq'] = None + if 'NS' in info and not samples: + # save this unless we already know the sample size + samples = int(info['NS']) + + data['allele_count'] = int(info[ac].split(',')[i]) + if 'AF' in info and data['allele_num'] > 0: + data['allele_freq'] = data['allele_count']/float(info[an]) + + if not self.settings.beacon_only: + data['vep_annotations'] = vep_annotations + + genes.append(list(set({annotation['Gene'] for annotation in 
vep_annotations if annotation['Gene'][:4] == 'ENSG'}))) + transcripts.append(list(set({annotation['Feature'] for annotation in vep_annotations if annotation['Feature'][:4] == 'ENST'}))) + + data['hom_count'] = hom_counts[i] if hom_counts else None + + data['variant_id'] = '{}-{}-{}-{}'.format(data['chrom'], data['pos'], data['ref'], data['alt']) + data['quality_metrics'] = dict([(x, info[x]) for x in METRICS if x in info]) + batch += [data] + counter += 1 + + if len(batch) >= self.settings.batch_size: + if not self.settings.dry_run: + if not self.settings.beacon_only: + try: + curr_id = db.Variant.select(db.Variant.id).order_by(db.Variant.id.desc()).limit(1).get().id + except db.Variant.DoesNotExist: + # assumes next id will be 1 if table is empty + curr_id = 0 + + db.Variant.insert_many(batch).execute() + + if not self.settings.beacon_only: + last_id = db.Variant.select(db.Variant.id).order_by(db.Variant.id.desc()).limit(1).get().id + if last_id-curr_id == len(batch): + indexes = list(range(curr_id+1, last_id+1)) + else: + indexes = [] + for entry in batch: + indexes.append(db.Variant.select(db.Variant.id).where(db.Variant.variant_id == entry['variant_id']).get().id) + self.add_variant_genes(indexes, genes, ref_genes) + self.add_variant_transcripts(indexes, transcripts, ref_transcripts) + + genes = [] + transcripts = [] + batch = [] + # Update progress + if self.counter['variants'] != None: + last_progress = self._update_progress_bar(counter, self.counter['variants'], last_progress) + + if batch and not self.settings.dry_run: + if not self.settings.dry_run: + if not self.settings.beacon_only: + try: + curr_id = db.Variant.select(db.Variant.id).order_by(db.Variant.id.desc()).limit(1).get().id + except db.Variant.DoesNotExist: + # assumes next id will be 1 if table is empty + curr_id = 0 + + db.Variant.insert_many(batch).execute() + + if not self.settings.beacon_only: + last_id = db.Variant.select(db.Variant.id).order_by(db.Variant.id.desc()).limit(1).get().id + if last_id-curr_id == len(batch): + indexes = list(range(curr_id+1, last_id+1)) + else: + indexes = [] + for entry in batch: + indexes.append(db.Variant.select(db.Variant.id).where(db.Variant.variant_id == entry['variant_id']).get().id) + self.add_variant_genes(indexes, genes, ref_genes) + self.add_variant_transcripts(indexes, transcripts, ref_transcripts) + + if self.settings.set_vcf_sampleset_size and samples: + self.sampleset.sample_size = samples + self.sampleset.save() + + self.dataset_version.num_variants = counter + self.dataset_version.save() + if self.counter['variants'] != None: + last_progress = self._update_progress_bar(counter, self.counter['variants'], last_progress, finished=True) + if not self.settings.dry_run: + logging.info("Inserted {} variant records in {}".format(counter, self._time_since(start))) + + def count_entries(self): + start = time.time() + if self.settings.coverage_file: + self.counter['coverage'] = 0 + logging.info("Counting coverage lines") + for filename in self.settings.coverage_file: + for line in self._open(filename): + line = bytes(line).decode('utf8').strip() + if line.startswith("#"): + continue + self.counter['coverage'] += 1 + logging.info("Found {:,} coverage lines".format(self.counter['coverage'])) + + if self.settings.variant_file: + self.counter['variants'] = 0 + logging.info("Counting variant lines") + for filename in self.settings.variant_file: + for line in self._open(filename): + line = bytes(line).decode('utf8').strip() + if line.startswith("#"): + continue + 
self.counter['variants'] += 1 + logging.info("Found {:,} variants".format(self.counter['variants'])) + + logging.info("Counted input data lines in {} ".format(self._time_since(start))) + + def prepare_data(self): + self._select_dataset_version() + + def start_import(self): + self._set_dataset_info() + if self.settings.variant_file: + self._insert_variants() + if not self.settings.beacon_only and self.settings.coverage_file: + self._insert_coverage() + + def add_variant_genes(self, variant_indexes:list, genes_to_add:list, ref_genes:dict): + batch = [] + for i in range(len(variant_indexes)): + connected_genes = [{'variant':variant_indexes[i], 'gene':ref_genes[gene]} for gene in genes_to_add[i] if gene] + batch += connected_genes + if not self.settings.dry_run: + db.VariantGenes.insert_many(batch).execute() + + def add_variant_transcripts(self, variant_indexes:list, transcripts_to_add:list, ref_transcripts:dict): + batch = [] + for i in range(len(variant_indexes)): + connected_transcripts = [{'variant':variant_indexes[i], 'transcript':ref_transcripts[transcript]} + for transcript in transcripts_to_add[i]] + batch += connected_transcripts + if not self.settings.dry_run: + db.VariantTranscripts.insert_many(batch).execute() diff --git a/scripts/importer/data_importer/reference_set_importer.py b/scripts/importer/data_importer/reference_set_importer.py new file mode 100644 index 000000000..66a9bed1f --- /dev/null +++ b/scripts/importer/data_importer/reference_set_importer.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 + +import os +import re +import gzip +import time +import shutil +import logging +import zipfile +from peewee import IntegrityError, fn +import db +from .data_importer import DataImporter + +class ReferenceSetImporter(DataImporter): + GENCODE = "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_{a.gencode_version}/gencode.v{a.gencode_version}.annotation.gtf.gz" + DBNSFP = "ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv{a.dbnsfp_version}.zip" + ENSEMBL = ("ensembldb.ensembl.org", "anonymous", "") + + def __init__(self, settings): + super().__init__(settings) + + # counters for statistics and progress + self.numbers = {'genes': None, + 'transcripts': None, + 'features': None} + self.counters = {'genes': 0, + 'transcripts': 0, + 'features': 0} + + # dictionaries to hold data while processing + self.genes = [] + self.transcripts = [] + self.features = [] + + # file handlers for later + self.gencode = None + self.dbnsfp = None + self.ensembl = None + + def _insert_features(self): + logging.info("Inserting features into database") + start = time.time() + last_progress = -1 + batch = [] + with db.database.atomic(): + for i, feature in enumerate(self.features): + batch += [{'gene':self.gene_db_ids[feature['gene_id']], + 'transcript':self.transcript_db_ids[feature['transcript_id']], + 'chrom':feature['chrom'], + 'start':feature['start'], + 'stop':feature['stop'], + 'strand':feature['strand'], + 'feature_type':feature['feature_type']}] + + if len(batch) >= self.batch_size: + if not self.settings.dry_run: + db.Feature.insert_many(batch).execute() + batch = [] + + last_progress = self._update_progress_bar(i, len(self.features), last_progress) + if batch: + if not self.settings.dry_run: + db.Feature.insert_many(batch).execute() + last_progress = self._update_progress_bar(i, len(self.features), last_progress, finished=True) + + logging.info("Features inserted in {}".format(self._time_since(start))) + + def _insert_genes(self): + logging.info("Inserting genes into database") + start = 
time.time() + self.gene_db_ids = {} + last_progress = -1 + for i, gene in enumerate(self.genes): + # As far as I know I can't batch insert these and still get the id's back + db_gene = db.Gene(reference_set=self.db_reference, + gene_id=gene['gene_id'], + name=gene['name'], + full_name=gene.get('full_name', None), + canonical_transcript=gene.get('canonical_transcript', None), + chrom=gene['chrom'], + start=gene['start'], + stop=gene['stop'], + strand=gene['strand']) + + if self.settings.dry_run: + self.gene_db_ids[gene['gene_id']] = 0 + else: + db_gene.save() + self.gene_db_ids[gene['gene_id']] = db_gene.id + + try: + other_names = gene['other_names'] + if other_names: + self.add_other_names(db_gene.id, other_names) + except KeyError: + pass + + last_progress = self._update_progress_bar(i, len(self.genes), last_progress) + last_progress = self._update_progress_bar(i, len(self.genes), last_progress, finished=True) + + logging.info("Genes inserted in {}".format(self._time_since(start))) + + def _insert_reference(self): + logging.info("inserting reference header") + self.db_reference = db.ReferenceSet(name=self.settings.ref_name, + reference_build=self.settings.assembly_id, + ensembl_version=self.settings.ensembl_version, + gencode_version=self.settings.gencode_version, + dbnsfp_version=self.settings.dbnsfp_version) + + if self.settings.dry_run: + max_id = db.ReferenceSet.select(fn.Max(db.ReferenceSet.id)).get() + if max_id.id is None: + self.db_reference.id = 0 + else: + self.db_reference.id = max_id.id + 1 + else: + self.db_reference.save() + logging.info("Reference %s created", self.db_reference.id) + + def _insert_transcripts(self): + logging.info("Inserting transcripts into database") + start = time.time() + + self.transcript_db_ids = {} + last_progress = -1 + for i, transcript in enumerate(self.transcripts): + db_transcript = db.Transcript(transcript_id=transcript['transcript_id'], + gene=self.gene_db_ids[transcript['gene_id']], + mim_annotation=transcript.get('mim_annotation', None), + mim_gene_accession=transcript.get('mim_gene_accession', None), + chrom=transcript['chrom'], + start=transcript['start'], + stop=transcript['stop'], + strand=transcript['strand']) + + if self.settings.dry_run: + self.transcript_db_ids[transcript['transcript_id']] = 0 + else: + db_transcript.save() + self.transcript_db_ids[transcript['transcript_id']] = db_transcript.id + + last_progress = self._update_progress_bar(i, len(self.transcripts), last_progress) + last_progress = self._update_progress_bar(i, len(self.transcripts), last_progress, finished=True) + + logging.info("Transcripts inserted in {}".format(self._time_since(start))) + + def _open_dbnsfp(self): + """ + Downloads (if needed) and opens the given dbNSFP file. + + Only a small part, 'dbNSFP2.9_gene' of the ~13Gb file is needed, but in + order to get it we have to download the entire file, extract the part + that we want, and then discard the dbNSFP package. 
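+
+        The extracted gene table is re-compressed into the download directory,
+        so later runs can skip both the download and the extraction.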
+ """ + logging.info("----- Opening dbNSFP file -----") + url = ReferenceSetImporter.DBNSFP.format(a=self.settings) + filename = url.split("/")[-1] + match = re.match("^\d+.\d+", self.settings.dbnsfp_version) + if match: + dbnsfp_gene_version = match.group(0) + else: + raise ValueError("Cannot parse dbNSFP version") + dbnsfp_file = "dbNSFP{}_gene".format(dbnsfp_gene_version) + logging.info("Using dbNSFP gene file: {}".format(dbnsfp_file)) + dbnsfp_path = os.path.join(self.download_dir, dbnsfp_file) + dbnsfp_gzip = "{}.gz".format(dbnsfp_path) + try: + os.stat(dbnsfp_gzip) + except FileNotFoundError: + try: + package_file = os.path.join(self.download_dir, filename) + os.stat(package_file) + except FileNotFoundError: + self._download(url) + logging.info("extracting {} from {}".format(dbnsfp_file, package_file)) + package = zipfile.ZipFile(package_file) + package.extract(dbnsfp_file, self.download_dir) + logging.info("gzipping {}".format(dbnsfp_file)) + with open(dbnsfp_path, 'rb') as f_in: + with gzip.open(dbnsfp_gzip, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + logging.info("removing non-zipped file and package file.") + os.remove(dbnsfp_path) + os.remove(package_file) + self.dbnsfp = self._open(dbnsfp_gzip) + + def _open_ensembl(self): + """ + Connects to the given ensembl database. + """ + logging.info("----- Opening ensembl database connection -----") + self.ensembl = self._connect(*(ReferenceSetImporter.ENSEMBL + (self.settings.ensembl_version,))) + + def _open_gencode(self): + """ + Downloads (if needed) and opens the given gencode file + """ + logging.info("----- Opening gencode file -----") + url = ReferenceSetImporter.GENCODE.format(a=self.settings) + filename = url.split("/")[-1] + try: + os.stat(os.path.join(self.download_dir, filename)) + self.gencode = self._open(os.path.join(self.download_dir, filename)) + except FileNotFoundError: + self.gencode = self._download_and_open(url) + + def _read_dbnsfp(self): + start = time.time() + header = None + logging.info("Adding dbNSFP annotation") + + dbnsfp_cache = {} + for line in self.dbnsfp: + raw = bytes(line).decode('utf8').strip().split("\t") + if not header: + header = raw + if header: + continue + + values = {} + for i, value in enumerate(raw): + values[header[i]] = value + + dbnsfp_cache[values['Ensembl_gene']] = { + 'other_names': values['Gene_other_names'].split(';'), + 'full_name': values['Gene_full_name'] + } + + for i, gene in enumerate(self.genes): + if gene['gene_id'] in dbnsfp_cache: + for key, item in dbnsfp_cache[gene['gene_id']].items(): + if item in ['', '.']: + item = None + self.genes[i][key] = item + + logging.info("dbNSFP information added in {}.".format(self._time_since(start))) + + def _read_ensembl(self): + """ + Reads the ensembl information into the gene dictionary + """ + query = """SELECT g.stable_id, + t.stable_id + FROM gene g + JOIN transcript t + ON (g.canonical_transcript_id = t.transcript_id) + """ + start = time.time() + + canonical_dict = {} + logging.info("Pre-fetching all canonical transcripts") + self.ensembl.execute(query) + for transcript in self.ensembl.fetchall(): + canonical_dict[transcript[0]] = transcript[1] + + last_progress = -1.0 + for i, gene in enumerate(self.genes): + if gene['gene_id'] in canonical_dict: + self.genes[i]['canonical_transcript'] = canonical_dict[gene['gene_id']] + + self.counters['genes'] += 1 + if self.numbers['genes'] != None: + last_progress = self._update_progress_bar(i, self.numbers['genes'], last_progress) + if self.numbers['genes'] != None: + last_progress 
= self._update_progress_bar(i, self.numbers['genes'], last_progress, finished=True) + logging.info("Canonical transcript information from ensembl added in {}.".format(self._time_since(start))) + + def count_entries(self): + logging.info("Counting features in gencode file (for progress bar)") + start = time.time() + self.numbers['genes'] = 0 + self.numbers['transcripts'] = 0 + self.numbers['features'] = 0 + for row in self.gencode: + raw = bytes(row).decode('ascii').strip() + if raw[0] == "#": + continue + values = raw.split("\t") + if len(values) < 2: + continue + + if self.chrom and values[0][3:] not in self.chrom: + continue + + if values[2] == 'gene': + self.numbers['genes'] += 1 + elif values[2] == 'transcript': + self.numbers['transcripts'] += 1 + elif values[2] in ['CDS', 'exon', 'UTR']: + self.numbers['features'] += 1 + + self.gencode.rewind() + + pad = max([len("{:,}".format(self.numbers[t])) for t in ["genes", "transcripts", "features"]]) + logging.info("Parsed file in {:3.1f}s".format(time.time()-start)) + logging.info("Genes : {0:>{pad},}".format(self.numbers['genes'], pad=pad)) + logging.info("Transcripts: {0:>{pad},}".format(self.numbers['transcripts'], pad=pad)) + logging.info("Features : {0:>{pad},}".format(self.numbers['features'], pad=pad)) + + def prepare_data(self): + self._open_gencode() + self._open_dbnsfp() + self._open_ensembl() + + def start_import(self): + start = time.time() + logging.info("Reading gencode data into buffers.") + last_progress = -1.0 + for line in self.gencode: + line = bytes(line).decode('ascii').strip() + if line.startswith("#"): + continue + try: + values = line.split("\t") + + if self.chrom and values[0][3:] not in self.chrom: + continue + + info = dict(x.strip().split() for x in values[8].split(';') if x != '') + info = {k: v.strip('"') for k, v in info.items()} + + data = {'chrom':values[0][3:], + 'start':int(values[3]), + 'stop':int(values[4]), + 'strand':values[6], + 'gene_id':info['gene_id'].split('.')[0]} + + # only progress for genes to keep it simple + if self.numbers['genes'] is not None: + last_progress = self._update_progress_bar(self.counters['genes'], self.numbers['genes'], last_progress) + if values[2] == 'gene': + data['name'] = info['gene_name'] + self.genes += [data] + self.counters['genes'] += 1 + continue + + data['transcript_id'] = info['transcript_id'].split('.')[0] + if values[2] == 'transcript': + self.transcripts += [data] + self.counters['transcripts'] += 1 + continue + + if values[2] in ['exon', 'CDS', 'UTR']: + data['feature_type'] = values[2] + self.features += [data] + self.counters['features'] += 1 + continue + + except Exception as error: + logging.error("{}".format(error)) + break + if self.numbers['genes'] is not None: + last_progress = self._update_progress_bar(self.counters['genes'], self.numbers['genes'], last_progress, finished=True) + logging.info("Gencode data read into buffers in {}.".format(self._time_since(start))) + self._read_ensembl() + self._read_dbnsfp() + self._insert_reference() + self._insert_genes() + self._insert_transcripts() + self._insert_features() + + def add_other_names(self, gene_dbid:int, other_names:list): + if not gene_dbid or not other_names: + return + batch = [{'gene':gene_dbid, 'name':other_name} for other_name in other_names if other_name != '.' 
and other_name] + if not self.settings.dry_run and batch: + db.GeneOtherNames.insert_many(batch).execute() diff --git a/scripts/importer/importer.py b/scripts/importer/importer.py new file mode 100755 index 000000000..95f691a97 --- /dev/null +++ b/scripts/importer/importer.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +""" +Swefreq data importer. + +Requires apt-packages: + - python3-pip + - sudo apt-get install libmysqlclient-dev + +as well as pip3 packages: + - mysqlclient + - peewee-2.10.2 +""" + +from data_importer.reference_set_importer import ReferenceSetImporter +from data_importer.old_db_importer import OldDbImporter +from data_importer.raw_data_importer import RawDataImporter + +if __name__ == '__main__': + + import os + import argparse + import logging + + PARSER = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + PARSER.add_argument("--batch_size", type=int, default=5000, + help=("Where batch insertion is possible, use this " + "number of inserts per batch.")) + PARSER.add_argument("--limit_chrom", default=None, + help="Limit chromosome to insert into the database.") + PARSER.add_argument("--data_dir", + default=os.path.join(os.path.dirname(__file__), + "downloaded_files"), + help="Default directory to download and look for files." + ) + + # Reference versions + PARSER.add_argument("--gencode_version", default=19, type=int, + help="Gencode version to download and use.") + PARSER.add_argument("--ensembl_version", default="homo_sapiens_core_75_37", + help="Ensembl database to connect to.") + PARSER.add_argument("--dbnsfp_version", default="2.9.3", + help="dbNSFP version to download and use.") + + # Dataset connections and naming + PARSER.add_argument("--dataset", default="", + help="Which dataset to connect imported data to.") + PARSER.add_argument("--version", default="", + help="Which dataset version to add imported data to.") + PARSER.add_argument("--ref_name", default="", + help=("Reference name to use when creating a reference " + "set.")) + + PARSER.add_argument("--dataset_size", type=int, default=0, + help="Set dataset size for this dataset") + PARSER.add_argument("--set_vcf_sampleset_size", action="store_true", + help=("Set/update sampleset size to the value given in " + "the VCF. This is either the NS value, or the " + "number of stated samples")) + PARSER.add_argument("--sampleset_size", type=int, default=0, + help="Set sampleset size for this dataset") + PARSER.add_argument("--beacon_description", default="", + help="Set beacon description of the dataset.") + PARSER.add_argument("--assembly_id", default="", + help=("Set reference assembly id (GRC notation, e.g. 
" + "GRCh37)")) + + # Raw data (coverage + variants) files + PARSER.add_argument("--coverage_file", nargs="*", + help="Coverage file(s) to import.") + PARSER.add_argument("--variant_file", nargs="*", + help="Variant file(s) to import.") + + # Actions + PARSER.add_argument("--add_reference", action="store_true", + help="Insert new reference set.") + PARSER.add_argument("--add_raw_data", action="store_true", + help="Adds a Coverage and Variants to the database.") + PARSER.add_argument("--move_studies", action="store_true", + help=("Moves studies and datasets from an old database " + "to a new one.")) + PARSER.add_argument("--dry_run", action="store_true", + help="Do not insert anything into the database") + + # Logging and verbosity + PARSER.add_argument("--disable_progress", action="store_true", + help="Do not show progress bars.") + PARSER.add_argument("-v", "--verbose", action="count", default=3, + help="Increase output Verbosity.") + PARSER.add_argument("-q", "--quiet", action="count", default=0, + help="Decrease output Verbosity.") + + # Beacon-only variants + PARSER.add_argument("--beacon-only", action="store_true", + help=("Variants are intended only for Beacon, loosening" + " the requirements")) + + ARGS = PARSER.parse_args() + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", + level=(5-ARGS.verbose+ARGS.quiet)*10, + datefmt="%H:%M:%S") + + if ARGS.add_reference: + logging.info("Adding a new reference set using these sources:") + logging.info(" - Gencode: %s", ARGS.gencode_version) + logging.info(" - Ensembl: %s", ARGS.ensembl_version) + logging.info(" - dbNSFP: %s", ARGS.dbnsfp_version) + + IMPORTER = ReferenceSetImporter(ARGS) + IMPORTER.prepare_data() + if not ARGS.disable_progress: + IMPORTER.count_entries() + IMPORTER.start_import() + + if ARGS.move_studies: + IMPORTER = OldDbImporter(ARGS) + IMPORTER.prepare_data() + IMPORTER.start_import() + + if ARGS.add_raw_data: + logging.info("Adding raw data %s", "(dry run)" if ARGS.dry_run else '') + IMPORTER = RawDataImporter(ARGS) + IMPORTER.prepare_data() + if not ARGS.disable_progress: + IMPORTER.count_entries() + IMPORTER.start_import() diff --git a/scripts/importer/requirements.txt b/scripts/importer/requirements.txt new file mode 100644 index 000000000..6bf362d48 --- /dev/null +++ b/scripts/importer/requirements.txt @@ -0,0 +1,3 @@ +mysqlclient==1.3.13 +peewee==2.10.2 +psycopg2-binary==2.7.5 diff --git a/scripts/manage.sh b/scripts/manage.sh new file mode 100755 index 000000000..164769436 --- /dev/null +++ b/scripts/manage.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# Some variables +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +export PYTHONPATH=${PYTHONPATH:+"$PYTHONPATH:"}"$SCRIPT_DIR/../backend" + +do_help () { + cat <&2 + do_help >&2 + exit 1 +esac diff --git a/settings_sample.json b/settings_sample.json index 5b7fc79bb..d3310b007 100644 --- a/settings_sample.json +++ b/settings_sample.json @@ -11,6 +11,12 @@ "mysqlUser" : "swefreq", "mysqlPort" : 3306, + "postgresHost" : "postgres host", + "postgresPort" : 5432, + "postgresUser" : "postgres", + "postgresPass" : "", + "postgresName" : "swefreq", + "mongoHost" : "mongodb host", "mongoPassword" : "password", "mongoPort" : 27017, diff --git a/sql/beacon_schema.sql b/sql/beacon_schema.sql new file mode 100644 index 000000000..56ed7cc26 --- /dev/null +++ b/sql/beacon_schema.sql @@ -0,0 +1,120 @@ +------------------------------------------------------------------------------- +-- Modified beacon schema. 
+-- +-- This schema is heavily based on the finnish beacon schema at: +-- https://github.com/CSCfi/beacon-python/blob/master/data/init.sql +-- +-- but has been modified to use views instead of tables for the +-- beacon_dataset_table and beacon_data_table. +-- This was done so that the main swefreq-schema could be kept, without having +-- to store duplicate information. + +CREATE SCHEMA IF NOT EXISTS beacon; + +-------------------------------------------------------------------------------- +-- Beacon dataset and data tables +-- +-- These tables need to be represented as semi-complex views, as to avoid +-- storing redundant data. + +CREATE OR REPLACE VIEW beacon.beacon_dataset_table AS -- original type + SELECT v.id AS index, -- serial + d.short_name AS name, -- varchar(128) + concat_ws(':', r.reference_build, + d.short_name, + v.dataset_version) AS datasetId, -- varchar(128) + d.beacon_description AS "description", -- varchar(512) + substr(r.reference_build, 0, 7) AS assemblyId, -- varchar(16) + v.available_from AS createDateTime, -- timestamp + v.available_from AS updateDateTime, -- timstamp + v.dataset_version AS "version", -- varchar(8) + s.sample_size AS sampleCount, -- integer + d.browser_uri AS externalUrl, -- varchar(256) + CASE WHEN v.available_from < now() THEN 'PUBLIC' + WHEN v.available_from > now() THEN 'CONTROLLED' + END AS accessType -- varchar(10) + FROM data.datasets AS d + JOIN data.dataset_version_current AS v + ON v.dataset = d.id + JOIN data.reference_sets AS r + ON v.reference_set = r.id + JOIN data.sample_sets AS s + ON s.dataset = d.id +; + + +-- This seems to return correct values except that it seems to +-- _always_ return 1 for callCount, even when there's no data. +-- TODO: make sure that callCount can handle zero values. +CREATE OR REPLACE VIEW beacon.beacon_dataset_counts_table AS + SELECT concat_ws(':', r.reference_build, + d.short_name, + v.dataset_version) AS datasetId, -- varchar(128) + COUNT(DISTINCT(dv.ref, dv.pos)) AS callCount, -- integer + COUNT(dv) AS variantCount -- integer + FROM data.datasets as d + JOIN data.dataset_version_current AS v + ON v.dataset = d.id + JOIN data.reference_sets AS r + ON v.reference_set = r.id + LEFT JOIN data.variants AS dv + ON dv.dataset_version = v.id + GROUP BY r.reference_build, d.short_name, v.dataset_version +; + + +CREATE MATERIALIZED VIEW beacon.beacon_data_table AS + SELECT dv.id AS index, -- serial + concat_ws(':', r.reference_build, + d.short_name, + v.dataset_version) AS datasetId, -- varchar(128) + dv.pos - 1 AS "start", -- integer + substr(dv.chrom, 1, 2) AS chromosome, -- varchar(2) + dv.ref AS reference, -- varchar(8192) + dv.alt AS alternate, -- varchar(8192) + dv.pos - 1 + char_length(dv.ref) AS "end", -- integer + dv.allele_num AS callCount, -- integer + dv.allele_freq AS frequency, -- integer + dv.allele_count AS alleleCount, -- integer + CASE WHEN length(dv.ref) = length(dv.alt) THEN 'SNP' + WHEN length(dv.ref) > length(dv.alt) THEN 'DEL' + WHEN length(dv.ref) < length(dv.alt) THEN 'INS' + END AS variantType -- varchar(16) + FROM data.variants AS dv + JOIN data.dataset_version_current as v + ON dv.dataset_version = v.id + JOIN data.datasets as d + ON v.dataset = d.id + JOIN data.reference_sets AS r + ON v.reference_set = r.id +; + + +-------------------------------------------------------------------------------- +-- Beacon views. +-- +-- These are kept as-is from the reference. 
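+-- Note that beacon.beacon_data_table and beacon.dataset_metadata (below) are
+-- materialized views, so they have to be refreshed whenever the underlying
+-- variant data changes, for example:
+--
+--     REFRESH MATERIALIZED VIEW beacon.beacon_data_table;
+--     REFRESH MATERIALIZED VIEW beacon.dataset_metadata;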
+ +-- This index is part of the finnish schema, but I deactivated it so that I don't have to materialize the views +-- CREATE UNIQUE INDEX data_conflict ON beacon_data_table (datasetId, chromosome, start, reference, alternate); +-- CREATE UNIQUE INDEX metadata_conflict ON beacon_dataset_table (name, datasetId); +-- This gets really, really slow if not materialized. (TODO why?) + +CREATE MATERIALIZED VIEW beacon.dataset_metadata(name, datasetId, description, assemblyId, + createDateTime, updateDateTime, version, + callCount, variantCount, sampleCount, externalUrl, accessType) +AS SELECT a.name, a.datasetId, a.description, a.assemblyId, a.createDateTime, + a.updateDateTime, a.version, b.callCount, + b.variantCount, + a.sampleCount, a.externalUrl, a.accessType +FROM beacon.beacon_dataset_table a, beacon.beacon_dataset_counts_table b +WHERE a.datasetId=b.datasetId +GROUP BY a.name, a.datasetId, a.description, a.assemblyId, a.createDateTime, +a.updateDateTime, a.version, a.sampleCount, a.externalUrl, a.accessType, b.callCount, b.variantCount; + +-------------------------------------------------------------------------------- +-- Indexes +-- + +CREATE INDEX beacon_data_chrpos ON beacon.beacon_data_table (chromosome,start); +CREATE INDEX beacon_data_chrref ON beacon.beacon_data_table (chromosome,reference); diff --git a/sql/data_schema.sql b/sql/data_schema.sql new file mode 100644 index 000000000..7c518df12 --- /dev/null +++ b/sql/data_schema.sql @@ -0,0 +1,222 @@ +-------------------------------------------------------------------------------- +-- Swefreq data schema -- +-- -- +-- This schema contains the studies and datasets, as well as the actual data -- +-- (reference-data, variants, and coverage) the goes into the Swefreq system. -- +-- -- +-------------------------------------------------------------------------------- +CREATE SCHEMA IF NOT EXISTS data; + +-------------------------------------------------------------------------------- +-- Reference Set tables +-- + +CREATE TABLE IF NOT EXISTS data.reference_sets ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + reference_build varchar UNIQUE, -- should be ^(GRCh[0-9]+([.]p[0-9]+)?)$ + reference_name varchar, + ensembl_version varchar, + gencode_version varchar, + dbnsfp_version varchar +); + +CREATE TABLE IF NOT EXISTS data.genes ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + reference_set integer REFERENCES data.reference_sets, + gene_id varchar(15), + gene_name varchar, + full_name varchar, + canonical_transcript varchar(15), + chrom varchar(10), + start_pos integer, + end_pos integer, + strand varchar +); + +CREATE TABLE IF NOT EXISTS data.gene_other_names ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + gene integer REFERENCES data.genes, + name varchar +); + +CREATE TABLE IF NOT EXISTS data.transcripts ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + transcript_id varchar(15), + gene integer REFERENCES data.genes, + mim_annotation varchar, + mim_gene_accession integer, + chrom varchar(10), + start_pos integer, + stop_pos integer, + strand varchar +); + +CREATE TABLE IF NOT EXISTS data.features ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + gene integer REFERENCES data.genes, + transcript integer REFERENCES data.transcripts, + chrom varchar(10), + start_pos integer, + stop_pos integer, + strand varchar, + feature_type varchar +); + +-------------------------------------------------------------------------------- +-- Study and Dataset fields +-- + +CREATE TABLE IF 
NOT EXISTS data.collections ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + study_name varchar, + ethnicity varchar +); + +CREATE TABLE IF NOT EXISTS data.studies ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + pi_name varchar(100) NOT NULL, + pi_email varchar(100) NOT NULL, + contact_name varchar(100) NOT NULL, + contact_email varchar(100) NOT NULL, + title varchar(100) NOT NULL, + study_description text DEFAULT NULL, + publication_date timestamp NOT NULL, + ref_doi varchar(100), + UNIQUE (pi_email, title) +); + +CREATE TABLE IF NOT EXISTS data.datasets ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + study integer NOT NULL REFERENCES data.studies, + short_name varchar(50) UNIQUE NOT NULL, + full_name varchar(100) NOT NULL, + browser_uri varchar(200) DEFAULT NULL, + beacon_uri varchar(200) DEFAULT NULL, + beacon_description text DEFAULT NULL, + avg_seq_depth real DEFAULT NULL, + seq_type varchar(50) DEFAULT NULL, + seq_tech varchar(50) DEFAULT NULL, + seq_center varchar(100) DEFAULT NULL, + dataset_size integer NOT NULL CHECK (dataset_size >= 0) +); + +CREATE TABLE IF NOT EXISTS data.dataset_logos ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset integer NOT NULL REFERENCES data.datasets, + mimetype varchar(50) NOT NULL, + bytes bytea NOT NULL +); + +CREATE TABLE IF NOT EXISTS data.sample_sets ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset integer NOT NULL REFERENCES data.datasets, + "collection" integer NOT NULL REFERENCES data.collections, + sample_size integer NOT NULL, + phenotype varchar(50) NOT NULL +); + +CREATE TABLE IF NOT EXISTS data.dataset_versions ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset integer NOT NULL REFERENCES data.datasets, + reference_set integer NOT NULL REFERENCES data.reference_sets, + dataset_version varchar(20) NOT NULL, + dataset_description text NOT NULL, + terms text NOT NULL, + var_call_ref varchar(50) DEFAULT NULL, + available_from timestamp DEFAULT current_timestamp, + ref_doi varchar(100) DEFAULT NULL, + data_contact_name varchar(100) DEFAULT NULL, + data_contact_link varchar(100) DEFAULT NULL, + num_variants integer DEFAULT NULL, + coverage_levels integer[] DEFAULT NULL -- Levels used for data.coverage.coverage +); + +CREATE TABLE IF NOT EXISTS data.dataset_files ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset_version integer NOT NULL REFERENCES data.dataset_versions, + basename varchar(100) NOT NULL, + uri varchar(200) UNIQUE NOT NULL, + file_size bigint NOT NULL +); + +-------------------------------------------------------------------------------- +-- Variant and coverage data fields +-- + +CREATE TABLE IF NOT EXISTS data.variants ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset_version integer REFERENCES data.dataset_versions, + variant_type varchar, -- variants go here `"enum": ["DEL", "INS", "DUP", "INV", "CNV", "SNP", "DUP:TANDEM", "DEL:ME", "INS:ME"]` + rsid integer, + chrom varchar(10), + pos integer, + ref varchar, + alt varchar, + site_quality real, + orig_alt_alleles varchar[], + hom_count integer, + allele_freq real, + filter_string varchar, + variant_id varchar, + allele_count integer, + allele_num integer, + quality_metrics jsonb, + vep_annotations jsonb +); + +CREATE TABLE IF NOT EXISTS data.variant_genes ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + variant integer REFERENCES data.variants, + gene integer REFERENCES data.genes +); + +CREATE TABLE IF NOT EXISTS 
data.variant_transcripts ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + variant integer REFERENCES data.variants, + transcript integer REFERENCES data.transcripts +); + +CREATE TABLE IF NOT EXISTS data.coverage ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset_version integer REFERENCES data.dataset_versions, + chrom varchar(10), + pos integer, + mean real, + median real, + coverage real[] -- These are the coverage values, for the levels defined in data.dataset_versions.coverage_levels +); + +CREATE TABLE IF NOT EXISTS data.metrics ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset_version integer REFERENCES data.dataset_versions, + metric varchar, + mids integer[], + hist integer +); + +-------------------------------------------------------------------------------- +-- Data views +-- + +CREATE OR REPLACE VIEW data.dataset_version_current AS + SELECT * FROM data.dataset_versions + WHERE (dataset, id) + IN (SELECT dataset, MAX(id) FROM data.dataset_versions + WHERE available_from < now() + GROUP BY dataset); + +-------------------------------------------------------------------------------- +-- Indexes +-- + +CREATE INDEX coverage_pos_chrom ON data.coverage (chrom, pos); +CREATE INDEX features_gene ON data.features (gene); +CREATE INDEX features_transcript ON data.features (transcript); +CREATE INDEX genes_gene_id ON data.genes (gene_id); +CREATE INDEX transcripts_transcript_id ON data.transcripts (transcript_id); +CREATE INDEX variants_chrom_pos ON data.variants (chrom, pos); +CREATE INDEX variants_rsid ON data.variants (rsid); +CREATE INDEX variant_genes_gene ON data.variant_genes (gene); +CREATE INDEX variant_genes_variant ON data.variant_genes (variant); +CREATE INDEX variant_transcripts_transcript ON data.variant_transcripts (transcript); +CREATE INDEX variant_transcripts_variant ON data.variant_transcripts (variant); diff --git a/sql/schema.dot b/sql/schema.dot deleted file mode 100644 index c085353e6..000000000 --- a/sql/schema.dot +++ /dev/null @@ -1,108 +0,0 @@ -digraph { - ranksep=1 nodesep=1 rankdir=BT - node [ shape = none ] - - user [ label=< - - - - - - -
-        user
-        user_pk            int
-        name               text|null
-        email              text
-        affiliation        text|null
-        country            text|null
-    > ];
-
-    dataset [ label=<
-        dataset
-        dataset_pk         int
-        sample_set_pk      int
-        name               text
-        browser_uri        text|null
-        beacon_uri         text|null
-        avg_seq_depth      float|null
-        seq_type           text|null
-        seq_tech           text|null
-        seq_center         text|null
-        dataset_size       uint|null
-    > ];
-
-    user_log [ label=<
-        user_log
-        user_log_pk        int
-        user_pk            int
-        dataset_pk         int
-        action             enum|null
-        ts                 timestamp
-    > ];
-
-    dataset_access [ label=<
-        dataset_access
-        dataset_access_pk  int
-        dataset_pk         int
-        user_pk            int
-        wants_newsletter   bool|false
-        is_admin           bool|false
-        has_consented      bool|false
-        has_access         bool|false
-    > ];
-
-    dataset_logo [ label=<
-        dataset_logo
-        dataset_logo_pk    int
-        dataset_pk         int
-        mimetype           text
-        data               blob
-    > ];
-
-    dataset_version [ label=<
-        dataset_version
-        dataset_version_pk int
-        dataset_pk         int
-        version            text
-        ts                 timestamp
-        is_current         bool|true
-        description        text
-        terms              text
-        var_call_ref       text|null
-    > ];
-
-    dataset_file [ label=<
-        dataset_file
-        dataset_file_pk    int
-        dataset_version_pk int
-        name               text
-        uri                text
-    > ];
-
-    study [ label=<
-        study
-        study_pk           int
-        pi_name            text
-        pi_email           text
-        contact_name       text
-        contact_email      text
-        title              text
-        description        text|null
-        ts                 timestamp
-        ref_doi            text|null
-    > ];
-
-    sample_set [ label=<
-        sample_set
-        sample_set_pk      int
-        study_pk           int
-        ethnicity          text|null
-        collection         text|null
-        sample_size        int
> ]; - - sample_set:study_pk -> study:pk; - dataset:sample_set_pk -> sample_set:pk; - user_log:user_pk -> user:pk; - dataset_access:user -> user:pk - dataset_access:dataset -> dataset:pk - dataset_version:dataset -> dataset:pk - dataset_file:dv -> dataset_version:pk - user_log:dataset -> dataset:pk - dataset_logo:dataset -> dataset:pk -} diff --git a/sql/schema.dot.png b/sql/schema.dot.png deleted file mode 100644 index aa918f5ea..000000000 Binary files a/sql/schema.dot.png and /dev/null differ diff --git a/sql/swefreq.sql b/sql/swefreq.sql deleted file mode 100644 index b253e21a0..000000000 --- a/sql/swefreq.sql +++ /dev/null @@ -1,262 +0,0 @@ --- Script for creating the swefreq tables. To run this file use: --- mysql databasename revoked.ts) - GROUP BY granted.user_pk, granted.dataset_pk, granted.action - ); - -CREATE OR REPLACE VIEW dataset_access_pending AS - SELECT DISTINCT - access.*, - FALSE AS has_access, - request.ts AS access_requested - FROM dataset_access AS access - JOIN ( SELECT user_pk, dataset_pk, MAX(ts) AS ts - FROM user_access_log WHERE action = "access_requested" - GROUP BY user_pk, dataset_pk ) AS request - ON access.user_pk = request.user_pk AND - access.dataset_pk = request.dataset_pk - WHERE (access.user_pk, access.dataset_pk) IN ( - -- get user_pk for all users that have pending access requests - SELECT requested.user_pk, requested.dataset_pk - FROM _user_access_log_summary AS requested - LEFT JOIN _user_access_log_summary AS granted - ON requested.user_pk = granted.user_pk AND - requested.dataset_pk = granted.dataset_pk AND - granted.action = 'access_granted' - LEFT JOIN _user_access_log_summary AS revoked - ON requested.user_pk = revoked.user_pk AND - requested.dataset_pk = revoked.dataset_pk AND - revoked.action = 'access_revoked' - WHERE requested.action = 'access_requested' AND - (granted.user_pk IS NULL OR requested.ts > granted.ts) AND - (revoked.user_pk IS NULL OR requested.ts > revoked.ts) - GROUP BY requested.user_pk, requested.dataset_pk, requested.action - ); - -CREATE TABLE IF NOT EXISTS dataset_logo ( - dataset_logo_pk INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT, - dataset_pk INTEGER NOT NULL, - mimetype VARCHAR(50) NOT NULL, - data MEDIUMBLOB NOT NULL, - CONSTRAINT UNIQUE (dataset_pk), - CONSTRAINT FOREIGN KEY (dataset_pk) REFERENCES dataset(dataset_pk) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; - -CREATE TABLE IF NOT EXISTS linkhash ( - linkhash_pk INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT, - dataset_version_pk INTEGER NOT NULL, - user_pk INTEGER NOT NULL, - hash VARCHAR(64) NOT NULL, - expires_on TIMESTAMP NOT NULL, - CONSTRAINT UNIQUE (hash), - CONSTRAINT FOREIGN KEY (dataset_version_pk) - REFERENCES dataset_version(dataset_version_pk), - CONSTRAINT FOREIGN KEY (user_pk) REFERENCES user(user_pk) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; - --- dataset_version_current, a view that only contains the (most) current --- version of each entry dataset_version - -CREATE OR REPLACE VIEW dataset_version_current AS - SELECT * FROM dataset_version - WHERE (dataset_pk, dataset_version_pk) IN ( - SELECT dataset_pk, MAX(dataset_version_pk) FROM dataset_version - WHERE available_from < now() - GROUP BY dataset_pk ); diff --git a/sql/user_schema.sql b/sql/user_schema.sql new file mode 100644 index 000000000..ccefa8197 --- /dev/null +++ b/sql/user_schema.sql @@ -0,0 +1,143 @@ +-------------------------------------------------------------------------------- +-- Swefreq user schema -- +-- -- +-- This schema contains the user tables, including the access rights to the 
-- +-- datasets (in the data schema) in the Swefreq system. -- +-- -- +-------------------------------------------------------------------------------- + +CREATE SCHEMA IF NOT EXISTS users; + +-------------------------------------------------------------------------------- +-- User fields +-- + +CREATE TYPE identity_enum AS ENUM('google', 'elixir'); + +CREATE TABLE IF NOT EXISTS users.users ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + username varchar(100) DEFAULT NULL, + email varchar(100) UNIQUE NOT NULL, + affiliation varchar(100) DEFAULT NULL, + country varchar(100) DEFAULT NULL, + identity varchar(100) NOT NULL, + identity_type identity_enum NOT NULL, + UNIQUE (identity, identity_type) +); + +CREATE TABLE IF NOT EXISTS users.sftp_users ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + user_id integer NOT NULL REFERENCES users.users, + user_uid integer UNIQUE NOT NULL CHECK (user_uid >= 10000), + user_name varchar(50) NOT NULL, + password_hash varchar(100) NOT NULL, + account_expires timestamp NOT NULL +); + +CREATE TABLE IF NOT EXISTS users.linkhash ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset_version integer NOT NULL REFERENCES data.dataset_versions, + user_id integer NOT NULL REFERENCES users.users, + "hash" varchar(64) UNIQUE NOT NULL, + expires_on timestamp NOT NULL +); + +CREATE TABLE IF NOT EXISTS users.dataset_access ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + dataset integer NOT NULL REFERENCES data.datasets, + user_id integer NOT NULL REFERENCES users.users, + wants_newsletter boolean DEFAULT false, + is_admin boolean DEFAULT false, + UNIQUE (dataset, user_id) +); + +CREATE TYPE access_action AS ENUM('access_granted', 'access_revoked', + 'access_requested', 'private_link'); + +CREATE TABLE IF NOT EXISTS users.user_access_log ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + user_id integer NOT NULL REFERENCES users.users, + dataset integer NOT NULL REFERENCES data.datasets, + ts timestamp NOT NULL DEFAULT current_timestamp, + "action" access_action DEFAULT NULL +); + +CREATE TABLE IF NOT EXISTS users.user_consent_log ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + user_id integer NOT NULL REFERENCES users.users, + dataset_version integer NOT NULL REFERENCES data.dataset_versions, + ts timestamp NOT NULL DEFAULT current_timestamp +); + +CREATE TABLE IF NOT EXISTS users.user_download_log ( + id integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + user_id integer NOT NULL REFERENCES users.users, + dataset_file integer NOT NULL REFERENCES data.dataset_files, + ts timestamp NOT NULL DEFAULT current_timestamp +); + +-------------------------------------------------------------------------------- +-- User views +-- + +CREATE OR REPLACE VIEW users.user_access_log_summary AS + SELECT MAX(id) AS id, + user_id, + dataset, + "action", + MAX(ts) AS ts + FROM users.user_access_log + GROUP BY user_id, dataset, "action" +; + +CREATE OR REPLACE VIEW users.dataset_access_current AS + SELECT DISTINCT + access.*, + TRUE AS has_access, + request.ts AS access_requested + FROM users.dataset_access AS access + JOIN ( SELECT user_id, dataset, MAX(ts) AS ts + FROM users.user_access_log WHERE action = 'access_requested' + GROUP BY user_id, dataset ) AS request + ON access.user_id = request.user_id AND + access.dataset = request.dataset + WHERE (access.user_id, access.dataset) IN ( + SELECT granted.user_id, granted.dataset + FROM users.user_access_log_summary AS granted + LEFT JOIN 
users.user_access_log_summary AS revoked + ON granted.user_id = revoked.user_id AND + granted.dataset = revoked.dataset AND + revoked.action = 'access_revoked' + WHERE granted.action = 'access_granted' AND + (revoked.user_id IS NULL OR granted.ts > revoked.ts) + GROUP BY granted.user_id, granted.dataset, granted.action + ); + +CREATE OR REPLACE VIEW users.dataset_access_pending AS + SELECT DISTINCT + access.*, + FALSE AS has_access, + request.ts AS access_requested + FROM users.dataset_access AS access + JOIN ( SELECT user_id, dataset, MAX(ts) AS ts + FROM users.user_access_log WHERE action = 'access_requested' + GROUP BY user_id, dataset ) AS request + ON access.user_id = request.user_id AND + access.dataset = request.dataset + WHERE (access.user_id, access.dataset) IN ( + -- get user_id for all users that have pending access requests + SELECT requested.user_id, requested.dataset + FROM users.user_access_log_summary AS requested + LEFT JOIN users.user_access_log_summary AS granted + ON requested.user_id = granted.user_id AND + requested.dataset = granted.dataset AND + granted.action = 'access_granted' + LEFT JOIN users.user_access_log_summary AS revoked + ON requested.user_id = revoked.user_id AND + requested.dataset = revoked.dataset AND + revoked.action = 'access_revoked' + WHERE requested.action = 'access_requested' AND + (granted.user_id IS NULL OR requested.ts > granted.ts) AND + (revoked.user_id IS NULL OR requested.ts > revoked.ts) + GROUP BY requested.user_id, requested.dataset, requested.action + ); diff --git a/test/data/clean_dummy_data.sql b/test/data/clean_dummy_data.sql index 1e9e3dba5..6355b8491 100644 --- a/test/data/clean_dummy_data.sql +++ b/test/data/clean_dummy_data.sql @@ -1,21 +1,23 @@ -- Delete test data -DELETE FROM user_access_log WHERE user_pk > 1000000 OR dataset_pk > 1000000; -DELETE FROM dataset_access WHERE user_pk > 1000000 OR dataset_pk > 1000000; -DELETE FROM user WHERE user_pk > 1000000; -DELETE FROM dataset_file WHERE dataset_file_pk > 1000000; -DELETE FROM dataset_version WHERE dataset_pk > 1000000; -DELETE FROM sample_set WHERE sample_set_pk > 1000000; -DELETE FROM collection WHERE collection_pk > 1000000; -DELETE FROM dataset WHERE dataset_pk > 1000000; -DELETE FROM study WHERE study_pk > 1000000; +DELETE FROM users.user_access_log WHERE id > 1000000 OR dataset > 1000000; +DELETE FROM users.dataset_access WHERE id > 1000000 OR dataset > 1000000; +DELETE FROM users.users WHERE id > 1000000; +DELETE FROM data.dataset_files WHERE id > 1000000; +DELETE FROM data.dataset_versions WHERE id > 1000000; +DELETE FROM data.sample_sets WHERE id > 1000000; +DELETE FROM data.datasets WHERE id > 1000000; +DELETE FROM data.reference_sets WHERE id > 1000000; +DELETE FROM data.dbsnp_versions WHERE id > 1000000; +DELETE FROM data.collections WHERE id > 1000000; +DELETE FROM data.studies WHERE id > 1000000; -- Reset auto increment counters -ALTER TABLE user_access_log AUTO_INCREMENT = 1; -ALTER TABLE dataset_access AUTO_INCREMENT = 1; -ALTER TABLE user AUTO_INCREMENT = 1; -ALTER TABLE dataset_file AUTO_INCREMENT = 1; -ALTER TABLE dataset_version AUTO_INCREMENT = 1; -ALTER TABLE collection AUTO_INCREMENT = 1; -ALTER TABLE sample_set AUTO_INCREMENT = 1; -ALTER TABLE dataset AUTO_INCREMENT = 1; -ALTER TABLE study AUTO_INCREMENT = 1; +ALTER SEQUENCE data.dataset_files_id_seq RESTART WITH 1; +ALTER SEQUENCE data.dataset_versions_id_seq RESTART WITH 1; +ALTER SEQUENCE data.collections_id_seq RESTART WITH 1; +ALTER SEQUENCE data.sample_sets_id_seq RESTART WITH 1; +ALTER 
SEQUENCE data.datasets_id_seq RESTART WITH 1; +ALTER SEQUENCE data.studies_id_seq RESTART WITH 1; +ALTER SEQUENCE users.users_id_seq RESTART WITH 1; +ALTER SEQUENCE users.user_access_log_id_seq RESTART WITH 1; +ALTER SEQUENCE users.dataset_access_id_seq RESTART WITH 1; diff --git a/test/data/load_dummy_data.sql b/test/data/load_dummy_data.sql index d8238201b..f3342f431 100644 --- a/test/data/load_dummy_data.sql +++ b/test/data/load_dummy_data.sql @@ -1,35 +1,43 @@ -INSERT INTO study (study_pk, pi_name, pi_email, contact_name, contact_email, title, description, publication_date, ref_doi) +-- Reference Set tables + +INSERT INTO data.reference_sets (id, reference_build, reference_name, ensembl_version, gencode_version, dbnsfp_version) + VALUES (1000001, 'GRCh1p2', 'Dummyman', 'homo_sapiens_core_0_3', '11', 'b142'), + (1000002, 'GRCh2p1', 'Mummydam', 'homo_sapiens_core_1_2', '23', 'b131'); + +-- Study and Dataset fields + +INSERT INTO data.studies (id, pi_name, pi_email, contact_name, contact_email, title, study_description, publication_date, ref_doi) VALUES (1000001, 'PI_STUDY1', 'pi1@example.com', 'Contact Study 1', 'contact1@example.com', 'Study 1', 'Study 1 description', '2017-01-01', 'study1DOI'), (1000002, 'PI_STUDY2', 'pi2@example.com', 'Contact Study 2', 'contact2@example.com', 'Study 2', 'Study 2 description', '2017-02-01', 'study2DOI'); -INSERT INTO collection (collection_pk, name, ethnicity) VALUES +INSERT INTO data.collections (id, study_name, ethnicity) VALUES (1000001, 'Collection1', 'CollEth1'), (1000002, 'Collection2', 'CollEth2'), (1000003, 'Collection3', 'CollEth3'); -INSERT INTO dataset (dataset_pk, study_pk, short_name, full_name, browser_uri, beacon_uri, avg_seq_depth, seq_type, seq_tech, seq_center, dataset_size, mongodb_collection) - VALUES (1000001, 1000001, 'Dataset 1', 'Dataset 1 Long name', 'http://example.com/browser1', 'http://example.com/beacon1', 1.0, 'SeqType1', 'SeqTech1', 'SeqCenter1', 1001, 'na'), - (1000002, 1000002, 'Dataset 2', 'Dataset 2 Long name', 'http://example.com/browser2', 'http://example.com/beacon2', 2.0, 'SeqType2', 'SeqTech2', 'SeqCenter2', 1002, 'na'); +INSERT INTO data.datasets (id, study, short_name, full_name, browser_uri, beacon_uri, beacon_description, avg_seq_depth, seq_type, seq_tech, seq_center, dataset_size) + VALUES (1000001, 1000001, 'Dataset 1', 'Dataset 1 Long name', 'http://example.com/browser1', 'http://example.com/beacon1', 'Dummy Dataset 1', 1.0, 'SeqType1', 'SeqTech1', 'SeqCenter1', 1001), + (1000002, 1000002, 'Dataset 2', 'Dataset 2 Long name', 'http://example.com/browser2', 'http://example.com/beacon2', 'Dummy Dataset 2', 2.0, 'SeqType2', 'SeqTech2', 'SeqCenter2', 1002); -INSERT INTO sample_set (sample_set_pk, dataset_pk, collection_pk, sample_size, phenotype) +INSERT INTO data.sample_sets (id, dataset, "collection", sample_size, phenotype) VALUES (1000001, 1000001, 1000001, 10, 'SamplePheno1'), (1000002, 1000001, 1000002, 15, 'SamplePheno2 Coll1'), (1000003, 1000002, 1000003, 20, 'SamplePheno2 Coll2'); -INSERT INTO dataset_version (dataset_version_pk, dataset_pk, version, description, terms, var_call_ref, available_from, ref_doi, data_contact_name, data_contact_link) - VALUES (1000001, 1000001, 'Version 1-1', 'Dataset 1-1, description', 'Dataset 1-1, terms', 'CallRef11', '2017-01-01', 'datset11DOI', "Gunnar Green", "gunnar.green@example.com"), - (1000002, 1000002, 'Version 2-1', 'Dataset 2-1, description', 'Dataset 2-1, terms', 'CallRef21', '2017-02-01', 'datset21DOI', NULL, NULL), - (1000003, 1000002, 'Version 2-2', 
diff --git a/test/data/load_dummy_data.sql b/test/data/load_dummy_data.sql
index d8238201b..f3342f431 100644
--- a/test/data/load_dummy_data.sql
+++ b/test/data/load_dummy_data.sql
@@ -1,35 +1,43 @@
-INSERT INTO study (study_pk, pi_name, pi_email, contact_name, contact_email, title, description, publication_date, ref_doi)
+-- Reference Set tables
+
+INSERT INTO data.reference_sets (id, reference_build, reference_name, ensembl_version, gencode_version, dbnsfp_version)
+    VALUES (1000001, 'GRCh1p2', 'Dummyman', 'homo_sapiens_core_0_3', '11', 'b142'),
+           (1000002, 'GRCh2p1', 'Mummydam', 'homo_sapiens_core_1_2', '23', 'b131');
+
+-- Study and Dataset fields
+
+INSERT INTO data.studies (id, pi_name, pi_email, contact_name, contact_email, title, study_description, publication_date, ref_doi)
     VALUES (1000001, 'PI_STUDY1', 'pi1@example.com', 'Contact Study 1', 'contact1@example.com', 'Study 1', 'Study 1 description', '2017-01-01', 'study1DOI'),
            (1000002, 'PI_STUDY2', 'pi2@example.com', 'Contact Study 2', 'contact2@example.com', 'Study 2', 'Study 2 description', '2017-02-01', 'study2DOI');
 
-INSERT INTO collection (collection_pk, name, ethnicity) VALUES
+INSERT INTO data.collections (id, study_name, ethnicity) VALUES
     (1000001, 'Collection1', 'CollEth1'),
     (1000002, 'Collection2', 'CollEth2'),
     (1000003, 'Collection3', 'CollEth3');
 
-INSERT INTO dataset (dataset_pk, study_pk, short_name, full_name, browser_uri, beacon_uri, avg_seq_depth, seq_type, seq_tech, seq_center, dataset_size, mongodb_collection)
-    VALUES (1000001, 1000001, 'Dataset 1', 'Dataset 1 Long name', 'http://example.com/browser1', 'http://example.com/beacon1', 1.0, 'SeqType1', 'SeqTech1', 'SeqCenter1', 1001, 'na'),
-           (1000002, 1000002, 'Dataset 2', 'Dataset 2 Long name', 'http://example.com/browser2', 'http://example.com/beacon2', 2.0, 'SeqType2', 'SeqTech2', 'SeqCenter2', 1002, 'na');
+INSERT INTO data.datasets (id, study, short_name, full_name, browser_uri, beacon_uri, beacon_description, avg_seq_depth, seq_type, seq_tech, seq_center, dataset_size)
+    VALUES (1000001, 1000001, 'Dataset 1', 'Dataset 1 Long name', 'http://example.com/browser1', 'http://example.com/beacon1', 'Dummy Dataset 1', 1.0, 'SeqType1', 'SeqTech1', 'SeqCenter1', 1001),
+           (1000002, 1000002, 'Dataset 2', 'Dataset 2 Long name', 'http://example.com/browser2', 'http://example.com/beacon2', 'Dummy Dataset 2', 2.0, 'SeqType2', 'SeqTech2', 'SeqCenter2', 1002);
 
-INSERT INTO sample_set (sample_set_pk, dataset_pk, collection_pk, sample_size, phenotype)
+INSERT INTO data.sample_sets (id, dataset, "collection", sample_size, phenotype)
     VALUES (1000001, 1000001, 1000001, 10, 'SamplePheno1'),
            (1000002, 1000001, 1000002, 15, 'SamplePheno2 Coll1'),
            (1000003, 1000002, 1000003, 20, 'SamplePheno2 Coll2');
 
-INSERT INTO dataset_version (dataset_version_pk, dataset_pk, version, description, terms, var_call_ref, available_from, ref_doi, data_contact_name, data_contact_link)
-    VALUES (1000001, 1000001, 'Version 1-1', 'Dataset 1-1, description', 'Dataset 1-1, terms', 'CallRef11', '2017-01-01', 'datset11DOI', "Gunnar Green", "gunnar.green@example.com"),
-           (1000002, 1000002, 'Version 2-1', 'Dataset 2-1, description', 'Dataset 2-1, terms', 'CallRef21', '2017-02-01', 'datset21DOI', NULL, NULL),
-           (1000003, 1000002, 'Version 2-2', 'Dataset 2-2, description', 'Dataset 2-2, terms', 'CallRef22', '2017-02-02', 'datset22DOI', "Strummer project", "https://example.com/strummer"),
-           (1000004, 1000002, 'InvVer 2-3', 'Dataset 2-3, description', 'Dataset 2-3, terms', 'CallRef23', '2030-02-03', 'datset23DOI', "Drummer project", "https://example.com/drummer");
+INSERT INTO data.dataset_versions (id, dataset, reference_set, dataset_version, dataset_description, terms, var_call_ref, available_from, ref_doi, data_contact_name, data_contact_link, num_variants, coverage_levels)
+    VALUES (1000001, 1000001, 1000001, 'Version 1-1', 'Dataset 1-1, description', 'Dataset 1-1, terms', 'CallRef11', '2017-01-01', 'datset11DOI', 'Gunnar Green', 'gunnar.green@example.com', 10, ARRAY[1,5,10]),
+           (1000002, 1000002, 1000001, 'Version 2-1', 'Dataset 2-1, description', 'Dataset 2-1, terms', 'CallRef21', '2017-02-01', 'datset21DOI', NULL, NULL, 100, ARRAY[1,5,10]),
+           (1000003, 1000002, 1000002, 'Version 2-2', 'Dataset 2-2, description', 'Dataset 2-2, terms', 'CallRef22', '2017-02-02', 'datset22DOI', 'Strummer project', 'https://example.com/strummer', 1000, ARRAY[1,5,10]),
+           (1000004, 1000002, 1000002, 'InvVer 2-3', 'Dataset 2-3, description', 'Dataset 2-3, terms', 'CallRef23', '2030-02-03', 'datset23DOI', 'Drummer project', 'https://example.com/drummer', 10000, ARRAY[1,5,10]);
 
-INSERT INTO dataset_file(dataset_file_pk, dataset_version_pk, name, uri, bytes)
+INSERT INTO data.dataset_files(id, dataset_version, basename, uri, file_size)
     VALUES (1000001, 1000001, 'File11-1', '/release/file111.txt', 100),
            (1000002, 1000001, 'File11-2', '/release/file112.txt', 100000),
            (1000003, 1000002, 'File21-1', '/release/file211.txt', 1000000000),
           (1000004, 1000003, 'File22-1', '/release/file221.txt', 973826482736),
           (1000005, 1000004, 'File23-1', '/release/file231.txt', 239847293874293874);
 
-INSERT INTO user(user_pk, name, email, affiliation, country, identity, identity_type) VALUES
+INSERT INTO users.users(id, username, email, affiliation, country, identity, identity_type) VALUES
     (1000100, 'Not req yet', 'email0', 'i', '', 'email0', 'elixir'),
     (1000101, 'Requested access', 'email1', 'w1', '', 'email1', 'google'),
     (1000102, 'Approved access', 'email2', 'c1', '', 'email2', 'elixir'),
@@ -47,15 +55,15 @@ INSERT INTO user(user_pk, name, email, affiliation, country, identity, identity_
     (1000114, 'Admin2', 'admin2', 'Rootspace', '', 'admin2', 'elixir'),
     (1000115, 'Admin12', 'admin12', 'Rootspace', '', 'admin12', 'google');
 
-INSERT INTO dataset_access(user_pk, dataset_pk) VALUES
+INSERT INTO users.dataset_access(user_id, dataset) VALUES
     (1000100, 1000001), (1000101, 1000001), (1000102, 1000001),
     (1000103, 1000001), (1000104, 1000001), (1000105, 1000001),
     (1000106, 1000001), (1000107, 1000001), (1000108, 1000001),
     (1000108, 1000002), (1000109, 1000001), (1000109, 1000002),
     (1000110, 1000001), (1000110, 1000002), (1000111, 1000001),
     (1000111, 1000002), (1000112, 1000001), (1000112, 1000002);
 
-INSERT INTO dataset_access(user_pk, dataset_pk, is_admin) VALUES
-    (1000113, 1000001, 1), (1000114, 1000002, 1), (1000115, 1000001, 1), (1000115, 1000002, 1);
+INSERT INTO users.dataset_access(user_id, dataset, is_admin) VALUES
+    (1000113, 1000001, TRUE), (1000114, 1000002, TRUE), (1000115, 1000001, TRUE), (1000115, 1000002, TRUE);
 
-INSERT INTO user_access_log(user_pk, dataset_pk, action, ts) VALUES
+INSERT INTO users.user_access_log(user_id, dataset, "action", ts) VALUES
     (1000101, 1000001, 'access_requested', '2017-01-01'),
     (1000102, 1000001, 'access_requested', '2017-01-02'),
     (1000103, 1000001, 'access_requested', '2017-01-03'),
@@ -104,13 +112,20 @@ INSERT INTO user_access_log(user_pk, dataset_pk, action, ts) VALUES
     (1000115, 1000002, 'access_requested', '2017-02-15'),
     (1000115, 1000002, 'access_granted', '2017-02-16');
 
-SELECT "Waiting", user.name, user.affiliation as visibility, user.user_pk,
-       dataset_access_pending.dataset_pk,
-       dataset_access_pending.dataset_access_pk
-FROM dataset_access_pending JOIN user ON (dataset_access_pending.user_pk = user.user_pk)
-WHERE dataset_pk > 1000000;
+SELECT 'Waiting', users.users.username, users.users.affiliation AS visibility,
+       users.users.id, users.dataset_access_pending.dataset,
+       users.dataset_access_pending.id
+  FROM users.dataset_access_pending
+  JOIN users.users
+    ON (users.dataset_access_pending.user_id = users.users.id)
+ WHERE dataset > 1000000;
+
+SELECT 'Current', users.users.username, users.users.affiliation AS visibility,
+       users.users.id, users.dataset_access_current.dataset,
+       users.dataset_access_current.id
+  FROM users.dataset_access_current
+  JOIN users.users
+    ON (users.dataset_access_current.user_id = users.users.id)
+ WHERE dataset > 1000000;
 
-SELECT "Current", user.name, user.affiliation as visibility, user.user_pk, dataset_access_current.dataset_pk,
-       dataset_access_current.dataset_access_pk
-FROM dataset_access_current JOIN user ON (dataset_access_current.user_pk = user.user_pk)
-WHERE dataset_pk > 1000000;
+-- Variant and coverage data fields
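
Reviewer note (not part of the patch): the renamed columns (basename, file_size, dataset_version, and the schema-qualified table names) are what the backend now expects, so a join like the one below, run against the dummy data above, is a convenient way to illustrate and spot-check the new naming. Purely illustrative.

    -- Illustrative: list dummy files per dataset version using the new column names.
    SELECT d.short_name,
           dv.dataset_version,
           df.basename,
           df.file_size
      FROM data.dataset_files    AS df
      JOIN data.dataset_versions AS dv ON dv.id = df.dataset_version
      JOIN data.datasets         AS d  ON d.id  = dv.dataset
     WHERE df.id > 1000000
     ORDER BY df.file_size;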
diff --git a/test/travis_before_install.sh b/test/travis_before_install.sh
index 1d3c7cadf..e59af8f6b 100755
--- a/test/travis_before_install.sh
+++ b/test/travis_before_install.sh
@@ -1,12 +1,8 @@
 #!/bin/sh -x
-docker pull mysql:5.7
-docker pull ubuntu:16.04
+PSQL_VERSION="10"
+PSQL_PORT="5433"
 
-VOLUME='mysql-data-volume'
-MYSQL_PORT=3366
+docker pull "postgres:$PSQL_VERSION"
 
-scripts/download_and_create_docker_db_volume.sh
-
-docker run -v "$VOLUME:/var/lib/mysql" \
-    --rm --name mysql -d -p "$MYSQL_PORT:3306" mysql:5.7
+docker run --rm -d -p "$PSQL_PORT:5432" "postgres:$PSQL_VERSION"
diff --git a/test/travis_script.sh b/test/travis_script.sh
index b94dd8cc7..3c8692d73 100755
--- a/test/travis_script.sh
+++ b/test/travis_script.sh
@@ -2,9 +2,9 @@
 ## SETUP SETTINGS
 cp settings_sample.json settings.json
-sed -i 's/password//' settings.json
-sed -i 's/"mysqlSchema" : "swefreq"/"mysqlSchema" : "swefreq_test"/' settings.json
-sed -i 's/"mysqlPort" : 3306/"mysqlPort" : 3366/' settings.json
+sed -i.tmp 's/"postgresHost" : "postgres host"/"postgresHost" : "127.0.0.1"/' settings.json
+sed -i.tmp 's/"postgresPort" : 5432/"postgresPort" : 5433/' settings.json
+sed -i.tmp 's/"postgresName" : "swefreq"/"postgresName" : ""/' settings.json
 
 echo 'SETTINGS'
 cat settings.json
@@ -13,19 +13,21 @@ echo '/SETTINGS'
 
 echo '>>> Test 1. The SQL Patch'
 LATEST_RELEASE=$(git tag | grep '^v' | sort -V | tail -n 1)
-git show "$LATEST_RELEASE:sql/swefreq.sql" >master-schema.sql
+git show "$LATEST_RELEASE:sql/*_schema.sql" > master-schema.sql
+
+psql -U postgres -h 127.0.0.1 -p 5433 -f master-schema.sql
+psql -U postgres -h 127.0.0.1 -p 5433 -f sql/patch-master-db.sql
 
-mysql -u swefreq -h 127.0.0.1 -P 3366 swefreq_test <master-schema.sql
 
 echo '>>> Test 2. Load the swefreq schema'
-mysql -u swefreq -h 127.0.0.1 -P 3366 swefreq_test <sql/swefreq.sql
 
 echo '>>> Test 3. Check that the backend starts'
@@ -48,12 +50,14 @@ exit_handler () {
     echo 'THE HTTP LOG WAS:'
     cat http_log.txt
 
-    exit $rv
+    exit "$rv"
 }
 
 trap exit_handler EXIT
 
+RETURN_VALUE=0
 python backend/test.py -v
+RETURN_VALUE=$((RETURN_VALUE + $?))
 
 # Quit the app
 curl localhost:4000/developer/quit
@@ -63,3 +67,5 @@ if [ -f .coverage ]; then
     coveralls
     coverage report
 fi
+
+exit "$RETURN_VALUE"
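
Reviewer note (not part of the patch): since the CI script now drives everything through psql, a final sanity query can confirm that the schema files were actually applied before the backend tests run. This is a sketch only; the 'beacon' schema name is an assumption based on sql/beacon_schema.sql, while 'data' and 'users' appear throughout the patch.

    -- Illustrative: confirm the expected schemas exist after loading (names assumed).
    SELECT schema_name
      FROM information_schema.schemata
     WHERE schema_name IN ('data', 'users', 'beacon')
     ORDER BY schema_name;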