# KG-OntoML: Build Embeddings

Get requirements.

In [None]:
!pip install -q grape -U
!pip install -q plot_keras_history seedir silence_tensorflow
# !pip install -q tsnecuda==3.0.0+cu110 -f https://tsnecuda.isx.ai/tsnecuda_stable.html --no-dependencies
!pip install -q faiss

# In order to disable often useless TensorFlow warnings
import silence_tensorflow.auto

from ensmallen import Graph

Retrieve the KG-OntoML graph, decompress it, and check it.

In [None]:
!wget https://kg-hub.berkeleybop.io/kg-ontoml/20220304/KG-OntoML.tar.gz

--2022-03-08 19:23:13--  https://kg-hub.berkeleybop.io/kg-ontoml/20220304/KG-OntoML.tar.gz
Resolving kg-hub.berkeleybop.io (kg-hub.berkeleybop.io)... 13.224.154.67, 13.224.154.88, 13.224.154.116, ...
Connecting to kg-hub.berkeleybop.io (kg-hub.berkeleybop.io)|13.224.154.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44235554 (42M) [application/gzip]
Saving to: ‘KG-OntoML.tar.gz’


2022-03-08 19:23:19 (8.75 MB/s) - ‘KG-OntoML.tar.gz’ saved [44235554/44235554]



In [None]:
!tar -xvzf KG-OntoML.tar.gz

merged-kg_nodes.tsv
merged-kg_edges.tsv


In [None]:
!head merged-kg_nodes.tsv

id	category	name	description	xref	provided_by	synonym	0000052	0000061	0000114	0000178	0000184	0000231	0000233	0000234	0000386	0000399	0000424	0000425	0000426	0000589	0000905	0006012	0037192	0100001	9000002	:http://attempto.ifi.uzh.ch/ace_lexicon#TV_pl	:http://attempto.ifi.uzh.ch/ace_lexicon#TV_sg	:http://attempto.ifi.uzh.ch/ace_lexicon#TV_vbg	:http://geneontology.org/formats/oboInOwl#created_by	:http://purl.obolibrary.org/obo/chebi/charge	:http://purl.obolibrary.org/obo/chebi/formula	:http://purl.obolibrary.org/obo/chebi/inchi	:http://purl.obolibrary.org/obo/chebi/inchikey	:http://purl.obolibrary.org/obo/chebi/mass	:http://purl.obolibrary.org/obo/chebi/monoisotopicmass	:http://purl.obolibrary.org/obo/chebi/smiles	:http://purl.org/spar/cito/citesAsAuthority	:http://usefulinc.com/ns/doap#revision	:http://www.ebi.ac.uk/cellline/definition	:http://www.ebi.ac.uk/cellline/definition_editor	:http://www.ebi.ac.uk/efo/alternative_term	:http://www.ebi.ac.uk/efo/definition	:http://www.ebi.ac.uk/e

In [None]:
!head merged-kg_edges.tsv

id	subject	predicate	object	category	relation	knowledge_source	logical_interpretation
OBO:FBbt_00004566-biolink:subclass_of-OBO:FBbt_00004554	OBO:FBbt_00004566	biolink:subclass_of	OBO:FBbt_00004554		rdfs:subClassOf	Graph	
urn:uuid:a564b8a3-6fee-45f8-8cc8-d0d6093c6c64	OBO:FBbt_00004566	biolink:develops_from	OBO:FBbt_00001781	biolink:Association	RO:0002202	Graph	owlstar:AllSomeInterpretation
urn:uuid:c1ef3711-892c-4470-b9b0-062fc0c95fda	OBO:FBbt_00004566	biolink:part_of	OBO:FBbt_00004565	biolink:Association	BFO:0000050	Graph	owlstar:AllSomeInterpretation
OBO:FBbt_00004554-biolink:subclass_of-OBO:FBbt_00004475	OBO:FBbt_00004554	biolink:subclass_of	OBO:FBbt_00004475		rdfs:subClassOf	Graph	
urn:uuid:1f6a4aa4-abee-4335-acef-cab6be117d77	OBO:FBbt_00004554	biolink:part_of	OBO:FBbt_00004553	biolink:Association	BFO:0000050	Graph	owlstar:AllSomeInterpretation
XPO:0136009-biolink:subclass_of-XPO:0135498	XPO:0136009	biolink:subclass_of	XPO:0135498		rdfs:subClassOf	Graph	
XPO:0136009-biolink:subclas

In [None]:
# Build an undirected graph from the extracted node and edge tables.
g = Graph.from_csv(
    # Node table: identifiers come from `id`, node types from `category`.
    node_path='merged-kg_nodes.tsv',
    nodes_column='id',
    node_list_node_types_column='category',
    default_node_type='biolink:NamedThing',
    # Edge table: endpoints come from `subject` and `object`.
    # NOTE(review): no edge-type column (e.g. `predicate`) is passed here --
    # confirm that `default_edge_type` alone has the intended effect.
    edge_path='merged-kg_edges.tsv',
    sources_column='subject',
    destinations_column='object',
    default_edge_type='biolink:related_to',
    # Graph-level options.
    directed=False,
    verbose=True,
)
# Leave the graph as the last expression so the notebook displays it.
g

Now it's time to build the embeddings.

Even on a GPU, the following may take >16 min per epoch, so it's not ideal for demonstration purposes. A full SkipGram embedding on KG-OntoML with default parameters requires at least 12 epochs (>3 hours).

One option is to pass extra arguments to `compute_node_embedding` through `fit_kwargs`, a dictionary that may contain any or all of the following key-value pairs:

* `epochs: 2` - the default is essentially forever
* `early_stopping_patience: 1` - Stop training after one epoch without sufficient improvement in the loss
* `early_stopping_min_delta: 0.5` - Minimum decrease in loss that counts as an improvement; smaller changes count towards early stopping

See https://github.com/monarch-initiative/embiggen/blob/master/embiggen/embedders/embedder.py#L272 for other params.

In [None]:
from embiggen.pipelines import compute_node_embedding
from plot_keras_history import plot_history

# Name of the embedding model to train; reused in the plot title below.
node_embedding_method_name = "SkipGram"

# Train the embedding on graph `g`. The call yields the node embedding
# together with the training history that is plotted next.
first_order_rw_node_embedding, training_history = compute_node_embedding(
    g, node_embedding_method_name=node_embedding_method_name
)

# Plot the training history, titled with the model and graph names.
plot_history(
    training_history,
    title=str.format(
        "First-order random walk based {} model applied to graph {}",
        node_embedding_method_name,
        g.get_name(),
    ),
)

0epoch [00:00, ?epoch/s]