From aa323f30dfbb01aee2256c3a6114e47523db527f Mon Sep 17 00:00:00 2001
From: cmungall
Date: Mon, 27 Jun 2022 13:49:08 -0700
Subject: [PATCH] Making gzipped files the default for S3 upload/download

- Makefile: add a stage/%.db.gz rule (gzip of db/%.db) plus the build_all
  and stage_all targets, and make s3-deploy sync stage/ instead of db/.
- builder.py: download_obo_sqlite now fetches {ontology}.db.gz and
  decompresses it into the destination path.
- Add the semsql_similarity LinkML module.
- Bump version to 0.1.7.
---
Review notes (changes relative to the patch as originally posted):
 * builder.py: call r.raise_for_status() so an HTTP error is reported as
   such instead of surfacing later as a gzip decoding failure; write the
   .gz file through a context manager so it is closed before it is read
   back; remove the temporary .gz file in a finally block.
 * similarity.yaml: the sqlview used "FROM entailed_edge AS e1 AND
   entailed_edge AS e2", which is not valid SQL; it is now an explicit
   INNER JOIN, and a GROUP BY over the non-aggregated columns was added so
   the COUNT is computed per (node1, node2, predicate1, predicate2).
 * similarity.yaml: id and default_prefix pointed at the nlp module; they
   now use the semsql_similarity namespace declared under prefixes.
 * TODO(review): node1, node2, num_ancestors, predicate1 and predicate2
   are referenced as slots but are not defined in this file; confirm that
   they are provided by an import or add a slots section.
 * The blob ids on the "index" lines for builder.py and similarity.yaml
   are carried over from the original posting; regenerate the patch with
   git format-patch after applying.

 Makefile                          | 12 ++++++++--
 pyproject.toml                    |  2 +-
 src/semsql/builder/builder.py     | 17 ++++++++++++--
 src/semsql/linkml/similarity.yaml | 46 +++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+), 6 deletions(-)
 create mode 100644 src/semsql/linkml/similarity.yaml

diff --git a/Makefile b/Makefile
index 06e4f28..d139e4f 100644
--- a/Makefile
+++ b/Makefile
@@ -13,11 +13,16 @@ SELECTED_ONTS = obi mondo go envo ro hp mp zfa wbphenotype ecto upheno uberon_cm
 TEST_ONTOLOGIES = go-nucleus robot-example
 
 
-all: $(patsubst %,all-%,$(ALL_OBO_ONTS))
+all: build_all stage_all
+build_all: $(patsubst %,all-%,$(ALL_OBO_ONTS))
+stage_all: $(patsubst %,stage/%.db.gz,$(ALL_OBO_ONTS))
+
 selected: $(patsubst %,all-%,$(SELECTED_ONTS))
 
 all-%: db/%.db
 	sqlite3 $< "SELECT COUNT(*) FROM statements"
+stage/%.db.gz: db/%.db
+	gzip -c $< > $@.tmp && mv $@.tmp $@
 
 # INSTALL
 include install.Makefile
@@ -49,6 +54,7 @@ realclean-%:
 # Prefixes
 # ---
 # TODO: sync with bioregistry
+# NOTE: this is now managed in build folder
 build_prefixes: $(PREFIX_DIR)/prefixes.csv
 
 
@@ -197,5 +203,5 @@ bin/%:
 DATE = $(shell date -u +"%Y-%m-%d")
 
 s3-deploy:
-	aws s3 sync db s3://bbop-sqlite --acl public-read && \
-	aws s3 sync db s3://bbop-sqlite/releases/$(DATE) --acl public-read
+	aws s3 sync stage s3://bbop-sqlite --acl public-read && \
+	aws s3 sync stage s3://bbop-sqlite/releases/$(DATE) --acl public-read
diff --git a/pyproject.toml b/pyproject.toml
index 8e07a7f..6856424 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "semsql"
-version = "0.1.6"
+version = "0.1.7"
 description = ""
 authors = ["cmungall "]
 
diff --git a/src/semsql/builder/builder.py b/src/semsql/builder/builder.py
index c16a4b8..8505b8f 100644
--- a/src/semsql/builder/builder.py
+++ b/src/semsql/builder/builder.py
@@ -1,5 +1,7 @@
+import gzip
 import logging
 import os
+import shutil
 import subprocess
 from dataclasses import field
 from pathlib import Path
@@ -70,9 +72,20 @@ def download_obo_sqlite(ontology: str, destination: str):
     :param destination:
     :return:
     """
-    url = f'https://s3.amazonaws.com/bbop-sqlite/{ontology}.db'
+    db = f'{ontology}.db'
+    url = f'https://s3.amazonaws.com/bbop-sqlite/{db}.gz'
     r = requests.get(url, allow_redirects=True)
-    open(destination, 'wb').write(r.content)
+    # Fail fast on HTTP errors instead of trying to gunzip an error page.
+    r.raise_for_status()
+    destination_gzip = f'{destination}.gz'
+    with open(destination_gzip, 'wb') as f_gz:
+        f_gz.write(r.content)
+    try:
+        with gzip.open(destination_gzip, 'rb') as f_in:
+            with open(destination, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+    finally:
+        os.remove(destination_gzip)
 
 
 def connect(owl_file: str):
diff --git a/src/semsql/linkml/similarity.yaml b/src/semsql/linkml/similarity.yaml
new file mode 100644
index 0000000..98a9bd8
--- /dev/null
+++ b/src/semsql/linkml/similarity.yaml
@@ -0,0 +1,46 @@
+name: semsql_similarity
+description: Module for representing and calculating similarities
+title: Semantic similarity module
+id: https://w3id.org/semsql/similarity
+imports:
+- rdf
+- relation_graph
+license: https://creativecommons.org/publicdomain/zero/1.0/
+prefixes:
+  semsql_similarity: https://w3id.org/semsql/similarity
+  linkml: https://w3id.org/linkml/
+default_curi_maps:
+- semweb_context
+default_prefix: semsql_similarity
+default_range: string
+
+classes:
+  node_pairwise_similarity:
+    abstract: true
+    slots:
+      - node1
+      - node2
+  node_pairwise_graph_similarity:
+    is_a: node_pairwise_similarity
+    abstract: true
+    slots:
+      - num_ancestors
+      - predicate1
+      - predicate2
+  node_pairwise_overlap:
+    is_a: node_pairwise_graph_similarity
+    comments:
+      - |-
+        sqlview>>
+          SELECT
+            e1.subject AS node1,
+            e2.subject AS node2,
+            e1.predicate AS predicate1,
+            e2.predicate AS predicate2,
+            COUNT(DISTINCT e1.object) AS num_ancestors
+          FROM entailed_edge AS e1
+          INNER JOIN entailed_edge AS e2 ON e1.object = e2.object
+          GROUP BY e1.subject, e2.subject, e1.predicate, e2.predicate
+
+
+