In [1]:
import os
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher
from gensim.models import KeyedVectors
import tempfile
import pandas as pd
import numpy as np
import h5py, torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, DotComparator, CosComparator
import json

In [2]:
# Parameters

# Folder on the local machine that contains the input graph files
input_path = "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled"
# input_path = "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold-profiled"

# Folder on the local machine where the project's output and temporary folders are created
output_path = "/Volumes/saggu-ssd/projects"
# output_path = "/Users/pedroszekely/Downloads/kypher/projects"

# Name of the project; used as the name of the folder created under output_path
project_name = "tutorial-graph-embeddings"

In [3]:
# Short names of the graph files this notebook uses; each one is later
# loaded into the graph cache under the same alias
files = [
    "all",
    "label",
    "alias",
    "description",
    "item",
    "qualifiers",
    "p31",
    "p279star"
]
# Set up the KGTK environment for this project from the parameters above
ck = ConfigureKGTK(files)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)

User home: /Users/amandeep
Current dir: /Users/amandeep/Github/kgtk-notebooks/tutorial
KGTK dir: /Users/amandeep/Github/kgtk-notebooks
Use-cases dir: /Users/amandeep/Github/kgtk-notebooks/use-cases


In [4]:
# List the environment variables available to the kgtk and shell commands below
ck.print_env_variables()

STORE: /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/wikidata.sqlite3.db
TEMP: /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings
kgtk: kgtk
KGTK_GRAPH_CACHE: /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
OUT: /Volumes/saggu-ssd/projects/tutorial-graph-embeddings
GRAPH: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled
KGTK_LABEL_FILE: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/labels.en.tsv.gz
EXAMPLES_DIR: /Users/amandeep/Github/kgtk-notebooks/examples
kypher: kgtk query --graph-cache /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/wikidata.sqlite3.db
USE_CASES_DIR: /Users/amandeep/Github/kgtk-notebooks/use-cases
all: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/all.tsv.gz
label: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profil

In [5]:
# Import the configured graph files into the graph cache under their short aliases
ck.load_files_into_cache()

kgtk query --graph-cache /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/wikidata.sqlite3.db -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/all.tsv.gz" --as all  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/labels.en.tsv.gz" --as label  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/aliases.en.tsv.gz" --as alias  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/descriptions.en.tsv.gz" --as description  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/claims.wikibase-item.tsv.gz" --as item  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/qualifiers.tsv.gz" --as qualifiers  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/derived.P31.tsv.gz" --as p31  -i "/Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold-profiled/derived.P279star.tsv.gz" --as p279star  --limit 3
node1	label	node2	id	node2;wikidatatype
P10	a

In [6]:
# Number of components requested for each embedding vector
vector_dimension = 30
# Embeddings in KGTK format (output of `kgtk graph-embeddings`)
vector_output_path = f"{os.environ['OUT']}/arnold.embeddings.augmented.{vector_dimension}.tsv"
# The same embeddings converted to word2vec text format for gensim
vector_output_w2v_path = f"{os.environ['OUT']}/arnold.embeddings.augmented.{vector_dimension}.w2v.tsv"
# Exposed as an environment variable so that $VECTOR_DIMENSION can be used in kgtk and shell commands
os.environ['VECTOR_DIMENSION'] = str(vector_dimension)

## Compute ComplEx Graph Embeddings

In this notebook, we will compute graph embeddings for the `arnold` subgraph using the `kgtk graph-embeddings` command and demonstrate a few applications.

The first step is to augment the `claims.wikibase-item.tsv.gz` file with the `derived.P31x.tsv` file, which contains occupations for humans as `instance of (P31)`.

- `claims.wikibase-item.tsv.gz`: KGTK claims file containing non-literal edges only
- `derived.P31x.tsv`: file with additional P31x links, adding occupation as `instance of` (computed)

In [7]:
# Concatenate the item claims with the derived P31x edges into one augmented claims file
!kgtk cat -i $item \
-i $GRAPH/derived.P31x.tsv \
-o $GRAPH/claims.wikibase-item.augmented.tsv.gz

### Run `kgtk graph-embeddings`

The `kgtk graph-embeddings` command takes as input a KGTK edge file and computes graph embeddings of user specified type, producing vectors of user specified dimensions.

The following parameters are used in this instance:

- `-op ComplEx`: compute ComplEx graph embeddings
- `--dimension 30`: desired dimension of the vectors
- `-ot kgtk`: output format - kgtk
- `--retain_temporary_data True`: retain the byproduct files, which we will use in subsequent steps
- `-T <folder path>`: temporary folder where the temporary files will be stored
- `-i <file>`: input file
- `-o <file>`: output file
- `--log <file>`: log file

In [8]:
# Compute ComplEx embeddings for the augmented claims file.
# The temporary data kept under $TEMP is read again in the link-prediction section.
kgtk(f""" graph-embeddings
            -op ComplEx \
            --dimension $VECTOR_DIMENSION \
            -ot kgtk \
            --retain_temporary_data True \
            -T $TEMP \
            -i $GRAPH/claims.wikibase-item.augmented.tsv.gz \
            -o {vector_output_path} \
            --log $TEMP/ge.log.txt
    """)

In Processing, Please go to /Volumes/saggu-ssd/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/ge.log.txt to check details
Processed Finished.



#### Take a peek at the embeddings file.

In [9]:
# Show the first rows of the KGTK embeddings file
kgtk(f"""head -i {vector_output_path}""")

Unnamed: 0,node1,label,node2
0,Q1985349,graph_embeddings,"0.095617205,0.177213788,-0.531302392,0.0149832..."
1,Q3433371,graph_embeddings,"-0.425078452,0.311085939,-0.395117164,-0.20244..."
2,Q317033,graph_embeddings,"-0.037620947,-0.034569997,-0.559339046,-0.0498..."
3,Q61587085,graph_embeddings,"-0.854337215,0.378212988,0.159523264,0.2521645..."
4,Q1788392,graph_embeddings,"-0.079702675,0.017837998,-0.579188764,0.020623..."
5,Q824239,graph_embeddings,"-0.458180159,0.988832235,0.195631877,-0.169714..."
6,Q667414,graph_embeddings,"-0.577477098,-0.201578721,0.198262930,-0.21542..."
7,Q389738,graph_embeddings,"-0.167402312,-0.300356328,0.217027649,0.213924..."
8,Q3521099,graph_embeddings,"0.500424862,-0.188686818,0.364933997,-0.397513..."
9,Q1701463,graph_embeddings,"0.245370045,0.016124398,-0.572334230,-0.147857..."


### The output is in `kgtk` format. Convert it to `word2vec` format for `gensim` similarity computation


For reference: 
- [gensim](https://radimrehurek.com/gensim/)
- [word2vec](https://en.wikipedia.org/wiki/Word2vec)

In [10]:
def _iter_kgtk_vectors(kgtk_path):
    """Yield (qnode, list of value strings) for every embedding row of a KGTK file."""
    with open(kgtk_path, "r", encoding="utf-8") as kgtk_file:
        header = next(kgtk_file, "").rstrip("\r\n").split("\t")
        # Locate the columns by name; fall back to the standard node1/label/node2 positions
        qnode_col = header.index("node1") if "node1" in header else 0
        vector_col = header.index("node2") if "node2" in header else 2
        last_needed_col = max(qnode_col, vector_col)

        for line in kgtk_file:
            # Strip the line ending here so the vector never carries a stray newline,
            # whether or not node2 is the last column
            items = line.rstrip("\r\n").split("\t")
            if len(items) <= last_needed_col or not items[vector_col]:
                continue  # blank or truncated line: not a vector
            yield items[qnode_col], items[vector_col].split(",")


def convert_kgtk_to_w2v(input_path, output_path):
    """
    Convert a KGTK file (node1/label/node2) that contains embeddings to the w2v format.

    The output starts with a "<vector count> <dimension>" line, followed by one
    "<qnode> <v1> <v2> ..." line per vector. The dimension is taken from the first
    vector in the file, so the header always agrees with the data that follows.

    :param input_path: KGTK TSV file with a header row; node2 holds comma-separated vector values
    :param output_path: path of the w2v text file to write
    """
    vector_count = 0
    dimension = 0

    # First pass: the count and the dimension must be known before any vector is written,
    # because they form the first line of the w2v file
    for _, values in _iter_kgtk_vectors(input_path):
        if vector_count == 0:
            dimension = len(values)
        vector_count += 1

    # Second pass: stream the vectors instead of holding the whole file in memory
    with open(output_path, "w", encoding="utf-8") as w2v_file:
        w2v_file.write("{} {}\n".format(vector_count, dimension))
        for qnode, values in _iter_kgtk_vectors(input_path):
            w2v_file.write(qnode + " " + " ".join(values) + "\n")

In [11]:
# Write the word2vec-format copy of the embeddings next to the KGTK file
convert_kgtk_to_w2v(f"{vector_output_path}", f"{vector_output_w2v_path}")

### Load the vectors into `gensim`

Loading the vectors into `gensim` lets us find similar vectors based on cosine similarity.

In [12]:
# Load the text-format (binary=False) word2vec file written by convert_kgtk_to_w2v
ge_vectors = KeyedVectors.load_word2vec_format(f"{vector_output_w2v_path}", binary=False)

Define a function to compute the `topn` similar vectors, and get the labels and descriptions of the matching Qnodes.

In [54]:
def kgtk_most_similar(
    vectors,
    positive,
    relation_label="similarity_score",
    add_label_description=True,
    output_path=None,
    topn=25,
):
    """
    find topn similar Qnodes, add label and decription for the Qnodes
    
    :param vectors: vector space loaded into gensim KeyedVectors model
    :param positive: vector(s) or Qnode(s) to find similar entities for
    :param relation_label: name of the property to be used for the output file
    :param add_label_description: boolean parameter to add label and description for matched entities
    :param output_path: path to store the output file
    :param topn: desirednumber of similar entities
    """
    result = []
    if add_label_description:
        fp = tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False, encoding="utf-8"
        )
        fp.write("node1\tlabel\tnode2\n")
        for (qnode, similarity) in vectors.most_similar(positive=positive, topn=topn):
            fp.write("{}\t{}\t{}\n".format(qnode, relation_label, similarity))
        filename = fp.name
        fp.close()

        os.environ["_temp_file"] = filename

        result = !$kypher -i label -i description -i "$_temp_file" --as sim \
--match 'sim: (n1)-[]->(similarity), label: (n1)-[]->(lab), description: (n1)-[]->(des)' \
--return 'distinct n1 as node1, similarity as node2, "similarity" as label, lab as `node1;label`, des as `node1;description`' \
--order-by 'cast(similarity, float) desc' 
        
        os.remove(filename)
        
    else:
        result.append("node1\tlabel\tnode2\n")
        for (qnode, similarity) in vectors.most_similar(positive=positive, topn=topn):
            result.append("{}\t{}\t{}\n".format(qnode, relation_label, similarity))

    if output_path:
        handle = open(output_path, "w")
        for line in result:
            handle.write(line)
            handle.write("\n")
        handle.close()
    else:
        columns = result[0].split("\t")
        data = []
        for line in result[1:]:
            data.append(line.split("\t"))
        return pd.DataFrame(data, columns=columns)

### Link Prediction

The following code reads the vectors for Qnodes as `head` and Properties as `relation`.

The files used in the code are produced by `kgtk graph-embeddings` code as a byproduct, in the folder specified by the `-T` option

In [21]:
# Relation (property) and entity names kept by `kgtk graph-embeddings` in its temporary folder.
# Context managers make sure the JSON files are closed after reading.
with open(f"{os.environ['TEMP']}/output/dynamic_rel_names.json") as rel_names_file:
    relation_names_list = json.load(rel_names_file)
with open(f"{os.environ['TEMP']}/output/entity_names_all_0.json") as entity_names_file:
    entity_names_list = json.load(entity_names_file)
prop_count = len(relation_names_list)

# operators
operator_lhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
operator_rhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
comparator = DotComparator()
cos_comparator = CosComparator()

# Read the trained operator parameters (real and imaginary parts) for both sides
with h5py.File(f"{os.environ['TEMP']}/output/model/model.v100.h5", "r") as hf:
    operator_state_dict_lhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/lhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/lhs/imag"][...]),
    }
    operator_state_dict_rhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/rhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/rhs/imag"][...]),
    }

operator_lhs.load_state_dict(operator_state_dict_lhs)
operator_rhs.load_state_dict(operator_state_dict_rhs)

# Load the embeddings
with h5py.File(f"{os.environ['TEMP']}/output/model/embeddings_all_0.v100.h5", "r") as hf:
    arnold_embedding = torch.from_numpy(hf["embeddings"][...])

# Position of each entity / relation name in its list.
# NOTE(review): assumes these positions match the rows of the embeddings and the relation
# indices expected by the operators - confirm against the graph-embeddings output format.
entity_to_index = {entity: i for i, entity in enumerate(entity_names_list)}
rel_index = {rel: i for i, rel in enumerate(relation_names_list)}

The following function takes as input a `Qnode` and a `Property`, and outputs a vector which should be similar to the value of the relation.

For example, given the Qnode `Q37079` (Tom Cruise) and the Property `P166` (award received), the function outputs a vector that should be similar to the vectors of awards. We will see this in action in the subsequent examples.

In [56]:
def get_embed(head, relation=None):
    """Return the vector used to search for tail entities.

    With only ``head``, the entity's own row of ``arnold_embedding`` is returned.
    With ``relation`` as well, that row is passed through ``operator_lhs``
    together with the index of the relation.

    :param head: subject Qnode
    :param relation: optional property
    :return: numpy array holding the resulting vector
    """
    head_row = arnold_embedding[entity_to_index[head], :]
    if relation is not None:
        # The operator is called with a batch of one row and a one-element index tensor
        relation_idx = torch.tensor([rel_index[relation]])
        transformed = operator_lhs(head_row.view(1, vector_dimension), relation_idx)
        return transformed.detach().numpy()[0]
    return head_row.detach().numpy()

#### Get the vector for `Q37079` (Tom Cruise) + `P166` (award received), then find most similar entities

In [57]:
# Vector for Tom Cruise (Q37079) combined with the property P166, then its 10 nearest entities
_vector = get_embed('Q37079', 'P166')
kgtk_most_similar(ge_vectors, positive=_vector, topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q1011547,0.1455029994249344,similarity,'Golden Globe Award'@en,'award of the Hollywood Foreign Press Associat...
1,Q37079,0.125749722123146,similarity,'Tom Cruise'@en,'American actor and producer'@en
2,Q640353,0.1202858388423919,similarity,'Golden Globe Cecil B. DeMille Award'@en,'honorary Golden Globe Award'@en
3,Q1790292,0.1190257370471954,similarity,'National Board of Review Award for Best Actor...,'Wikimedia list article'@en
4,Q708135,0.1127644553780555,similarity,'Silver Bear'@en,'film award'@en
5,Q586356,0.1126348748803138,similarity,'Golden Globe Award for Best Director'@en,'award'@en
6,Q1534906,0.1089578047394752,similarity,'Golden Globe Award for New Star of the Year –...,'Golden Globe award'@en
7,Q1535145,0.1072486266493797,similarity,'Golden Raspberry Award for Worst Screen Coupl...,'award'@en


#### Get the vector for `Q170564` (Terminator 2: Judgment Day) + `P161` (cast member), then find most similar entities

In [58]:
# Vector for Q170564 combined with the property P161 (cast member), then its 10 nearest entities
_vector = get_embed('Q170564', 'P161')
kgtk_most_similar(ge_vectors, positive=_vector, topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q471003,0.1324714869260788,similarity,'John Larroquette'@en,"'born 1947; American film, television and stag..."
1,Q511436,0.122238278388977,similarity,'Rick Ducommun'@en,'Canadian actor (1952-2015)'@en
2,Q575795,0.1202331632375717,similarity,'Martin Mull'@en,'American actor'@en
3,Q192165,0.119775503873825,similarity,'Danny Glover'@en,"'American actor, film director and political a..."
4,Q16238721,0.1189576089382171,similarity,'Mackenzie Davis'@en,"'Canadian film, television and stage actress'@en"
5,Q270639,0.1187439486384391,similarity,'John McTiernan'@en,'American film director and producer'@en
6,Q12035731,0.1170358955860138,similarity,'Martin Hub'@en,'Czech actor and stuntman'@en
7,Q521172,0.1167899519205093,similarity,'Franco Columbu'@en,'Italian bodybuilder and actor (1941-2019)'@en
8,Q912103,0.1104367822408676,similarity,'Brian Doyle-Murray'@en,'American actor and comedian'@en
9,Q107933,0.1089400500059127,similarity,'Jim Belushi'@en,"'American actor, comedian, singer, and musicia..."


#### Get the vector for `Q104123` (Pulp Fiction) + `P161` (cast member), then find most similar entities

In [59]:
# Vector for Pulp Fiction (Q104123) combined with the property P161 (cast member), then its 10 nearest entities
_vector = get_embed('Q104123', 'P161')
kgtk_most_similar(ge_vectors, positive=_vector, topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q38111,0.1488519310951233,similarity,'Leonardo DiCaprio'@en,'American actor and film producer'@en
1,Q76478,0.1459016501903534,similarity,'Kirsten Dunst'@en,'American actress'@en
2,Q174843,0.1363454461097717,similarity,'Jeff Bridges'@en,"'American actor, singer-songwriter and produce..."
3,Q106481,0.1264046430587768,similarity,'Alan Rickman'@en,"'English film, television and stage actor, gra..."
4,Q103939,0.1258087307214737,similarity,'Charlie Sheen'@en,'American film and television actor'@en
5,Q42930,0.1236490458250045,similarity,'Dustin Hoffman'@en,'American actor'@en
6,Q160528,0.1206435486674308,similarity,'Burt Lancaster'@en,'American actor and producer (1913-1994)'@en
7,Q167520,0.1202816143631935,similarity,'Jon Voight'@en,'American actor'@en
8,Q102711,0.1198199316859245,similarity,'Dennis Hopper'@en,'American actor and filmmaker (1936–2010)'@en
9,Q108366,0.1165139973163604,similarity,'Gregory Peck'@en,'American actor'@en


#### Get the vector for `Q2685` (Arnold Schwarzenegger), then find most similar entities

In [60]:
# No property given: use the entity's own embedding (Arnold Schwarzenegger, Q2685)
_vector = get_embed('Q2685')
kgtk_most_similar(ge_vectors, positive=_vector, topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q2685,1.0,similarity,'Arnold Schwarzenegger'@en,"'Austrian-American actor, businessman, bodybui..."
1,Q243430,0.8519800901412964,similarity,'Terence Hill'@en,"'Italian actor, film director, screenwriter an..."
2,Q270639,0.8474528789520264,similarity,'John McTiernan'@en,'American film director and producer'@en
3,Q1348071,0.8190168738365173,similarity,'Jan Tříska'@en,'Czech actor (1936-2017)'@en
4,Q352010,0.8146214485168457,similarity,'David S. Goyer'@en,"'American screenwriter, film director, novelis..."
5,Q171758,0.8053231835365295,similarity,'Matthew Perry'@en,'American-Canadian actor'@en
6,Q11975,0.8032246828079224,similarity,'Britney Spears'@en,"'American singer, songwriter, dancer and actre..."
7,Q361208,0.8029232621192932,similarity,'Adolph Zukor'@en,'Hungarian-American film producer and director...
8,Q103157,0.8013789057731628,similarity,'Kurt Russell'@en,'American actor'@en
9,Q262130,0.8010067343711853,similarity,'Richard Donner'@en,'film director'@en


#### Get the vector for `Q103148` (Lahn River), then find most similar entities

In [62]:
# No property given: use the entity's own embedding (Lahn, Q103148)
_vector = get_embed('Q103148')
kgtk_most_similar(ge_vectors, positive=_vector, topn=10)

Unnamed: 0,node1,node2,label,node1;label,node1;description
0,Q103148,0.9999998211860656,similarity,'Lahn'@en,'right tributary of the Rhine River in Germany...
1,Q1673,0.9362663626670836,similarity,'Neckar'@en,'right tributary of Rhine river in Germany'@en
2,Q26727445,0.924948275089264,similarity,'Moersbach'@en,'river in Germany'@en
3,Q701645,0.9240193963050842,similarity,'Düssel'@en,'river in Germany'@en
4,Q326366,0.9154956340789796,similarity,'Leimbach'@en,'river'@en
5,Q319649,0.9132569432258606,similarity,'Möhlin'@en,'river in Germany'@en
6,Q563698,0.9084678292274476,similarity,'Lauter'@en,"'river in Germany and France, left tributary o..."
7,Q153945,0.9084625840187072,similarity,'Lippe'@en,'river in Germany'@en
8,Q1667,0.9077610373497008,similarity,'Moselle'@en,"'river in Germany, France and Luxembourg'@en"
9,Q570612,0.9068459272384644,similarity,'Acher'@en,'river in Germany'@en


## Prepare files for Google Projector

In this section, we will prepare `vectors` and `metadata` files for google projector.

We are focusing on the following types:

- `Q11424` (film)
- `Q33999` (actor)
- `Q4022` (river)
- `Q82955` (politician)

The first step is to create a file with the following information:

1. node1 :- Qnode
2. label :- name of the property
3. node2 :- embedding vector for node1
4. node1;label :- label for node1
5. type :- `instance of` for node1
6. type;label :- label for type

In [28]:
%%time
# Build one row per entity that has an embedding, a label, and a class (reached via a P31x edge
# followed by p279star) among Q11424, Q33999, Q4022 and Q82955. Matching classes and their
# labels are concatenated into the `type` and `type;label` columns.
# NOTE(review): the graph handles `item`, `P31x` and `embeddings` used in --match are not
# declared with --as; presumably kypher resolves them against the input file names - confirm.
kgtk(f""" query -i $GRAPH/claims.wikibase-item.augmented.tsv.gz 
         -i p279star 
         -i label 
         -i {vector_output_path} 
         -i $GRAPH/derived.P31x.tsv 
         --match 'item: (n1)-[]->(), 
             P31x: (n1)-[]->(c), 
             p279star: (c)-[]->(class), 
             label: (n1)-[]->(n1_label), 
             label: (class)-[]->(class_label), embeddings: (n1)-[l]->(embedding)'
        --where 'class in ["Q11424", "Q33999", "Q4022", "Q82955"]' 
        --return 'distinct n1, 
                  l.label as label,
                  embedding as node2,
                  kgtk_lqstring_text(n1_label) as `node1;label`, 
                  group_concat(distinct class) as type, 
                  group_concat(distinct kgtk_lqstring_text(class_label)) as `type;label`'
        -o $TEMP/arnold.embeddings.google.projector.tsv
""")

CPU times: user 6.46 ms, sys: 16.8 ms, total: 23.3 ms
Wall time: 8.91 s


#### Take a peek at the file

In [30]:
# Show the first rows of the combined embeddings + metadata file
kgtk("""head -i $TEMP/arnold.embeddings.google.projector.tsv""")

Unnamed: 0,node1,label,node2,node1;label,type,type;label
0,Q1000881,graph_embeddings,"0.422555715,-0.383779824,-0.226189762,-0.27205...",Erlau,Q4022,river
1,Q1001872,graph_embeddings,"0.324892789,-0.633026063,-0.019749254,-0.79175...",Buersbach,Q4022,river
2,Q1004531,graph_embeddings,"0.064922042,0.046958204,0.367239207,-0.1311449...",Bullets Over Broadway,Q11424,film
3,Q1009788,graph_embeddings,"-0.322945923,0.259725243,-0.434040457,-0.61645...",The Conversation,Q11424,film
4,Q1010099,graph_embeddings,"0.057618629,-0.116545640,0.401964843,-0.335613...",Get Carter,Q11424,film
5,Q1012216,graph_embeddings,"0.203465670,0.048769753,-0.076317504,-0.224987...",Gorillas in the Mist,Q11424,film
6,Q101410,graph_embeddings,"-0.355030477,0.267202079,0.223128796,0.1709822...",François Fillon,Q82955,politician
7,Q101797,graph_embeddings,"0.167383820,0.619198859,-0.416889578,0.3427470...",Winona Ryder,Q33999,actor
8,Q1018487,graph_embeddings,"-0.140163571,0.705846250,0.173184440,-0.254804...",Bye Bye Birdie,Q11424,film
9,Q102124,graph_embeddings,"0.120559528,0.800693333,-0.239737391,0.5290793...",Sigourney Weaver,Q33999,actor


#### Define a function to build the required files for google projector

In [63]:
def build_embedding_projector_metadata(gp_embeddings_path, metadata_path, vectors_path):
    """
    Build the vectors and metadata files required for Google projector.

    Rows are written in the same order to both files, so line i of the vectors file
    corresponds to data row i of the metadata file. Only rows whose first column starts
    with "Q" are written.

    :param gp_embeddings_path: file path which has the embeddings and metadata in kgtk format;
        columns are read by position: 0 = qnode, 2 = comma-separated vector, 3 = qnode label,
        4 = type, 5 = type label
    :param metadata_path: output file path for metadata (tab-separated, with a header row)
    :param vectors_path: output file path for vectors (tab-separated values, no header)
    """
    # Context managers close all three files even if a row fails to parse.
    # UTF-8 is set explicitly because labels contain non-ASCII characters.
    with open(gp_embeddings_path, encoding="utf-8") as qnodes_file, \
            open(metadata_path, "w", encoding="utf-8") as metadata_file, \
            open(vectors_path, "w", encoding="utf-8") as vectors_file:
        metadata_file.write("tag\tqnode\ttype\ttype_label\n")

        next(qnodes_file, None)  # skip the header row
        for line in qnodes_file:
            if not line.strip():
                continue  # ignore blank lines instead of failing on the column lookups

            vals = line.rstrip("\n").split('\t')
            qnode = vals[0]
            if not qnode.startswith("Q"):
                continue

            qnode_label = vals[3]
            _type = vals[4]
            ftype_label = vals[5].strip()
            embeddings = "\t".join(vals[2].strip().split(","))

            metadata_file.write("{}\t{}\t{}\t{}\n".format(qnode_label, qnode, _type, ftype_label))
            vectors_file.write(embeddings)
            vectors_file.write('\n')

In [46]:
# Read the combined file from $TEMP and write the metadata and vectors files to $OUT
build_embedding_projector_metadata(f"{os.environ['TEMP']}/arnold.embeddings.google.projector.tsv",
                                  f"{os.environ['OUT']}/arnold.metadata.{vector_dimension}.tsv",
                                  f"{os.environ['OUT']}/arnold.vectors.{vector_dimension}.tsv")

#### Peek at the metadata file

In [47]:
# Show the first rows of the metadata file
kgtk(f"""head -i $OUT/arnold.metadata.{vector_dimension}.tsv""")

Unnamed: 0,tag,qnode,type,type_label
0,Erlau,Q1000881,Q4022,river
1,Buersbach,Q1001872,Q4022,river
2,Bullets Over Broadway,Q1004531,Q11424,film
3,The Conversation,Q1009788,Q11424,film
4,Get Carter,Q1010099,Q11424,film
5,Gorillas in the Mist,Q1012216,Q11424,film
6,François Fillon,Q101410,Q82955,politician
7,Winona Ryder,Q101797,Q33999,actor
8,Bye Bye Birdie,Q1018487,Q11424,film
9,Sigourney Weaver,Q102124,Q33999,actor


#### Peek at the vectors file

In [52]:
# Show the first two vectors (one tab-separated vector per line)
!head -2 $OUT/arnold.vectors.$VECTOR_DIMENSION.tsv

0.422555715	-0.383779824	-0.226189762	-0.272052914	0.410429627	-0.056848142	-0.298868090	-0.257102609	1.498231173	0.279778272	-0.121273547	0.592083991	0.411273092	-0.401460081	0.472976536	-0.201241225	-0.331334770	-0.513577998	-0.791564167	0.508969486	-0.140303373	0.119090922	-0.167600065	0.056765910	0.385879189	0.341483742	-0.006866499	0.139508441	-0.163095295	0.138541460
0.324892789	-0.633026063	-0.019749254	-0.791751027	0.012587381	-0.247659147	-0.272751868	-0.200660244	1.513035774	0.034612011	0.283569038	-0.079536788	0.399750262	-0.570661962	0.447611094	-0.385112911	0.129972234	-0.560915053	-0.331446439	0.298282146	0.056178253	0.117032155	-0.113007426	-0.021605030	0.384068549	-0.227749139	0.438792169	0.177395970	-0.575673938	0.025991896


## Google embedding projector
- open https://projector.tensorflow.org
- Load the vectors and metadata files using the load button
- configure the visualization

Here we searched on the right for arnold, and we see the closest vectors as well as the cluster where it belongs:
![Google embedding projector](assets/gp-arnold.png "Google embedding projector")

#### PCA visualization of the embeddings, colored by `instance of`

![UMAP Color by Type](assets/gp-color-map-types.png "UMAP Color by Type")