# Generate RDF Knowledge graph from spreadsheet

1. Original input file on Google Docs: https://docs.google.com/spreadsheets/d/11SElScyLIs0RANYcT2MpWa7QWf-gIB13fjNvV9b5ZBE/export?format=xlsx&id=11SElScyLIs0RANYcT2MpWa7QWf-gIB13fjNvV9b5ZBE 

2. We use a GitHub Actions workflow to generate the NeuroDKG: https://github.com/MaastrichtU-IDS/neuro_dkg/blob/master/.github/workflows/generate-rdf.yml


# Use this notebook

To run this notebook, start a `jupyter/all-spark-notebook` container using `docker-compose up` from the `docs/` folder

```
cd docs
docker-compose up
```

Access on http://localhost:8888

In [3]:
!pip install -r requirements.txt



# Trying out the official pyRDF2Vec

Documentation: https://pyrdf2vec.readthedocs.io/en/latest/readme.html#create-a-knowledge-graph-object

In [4]:
import os
from pyrdf2vec.graphs import KG

# Download the NeuroDKG turtle file if not present
os.system("wget -N https://raw.githubusercontent.com/MaastrichtU-IDS/neuro_dkg/6dc99ccc112b1cfcb1ae12494fe474fc7f910ee1/data/rdf/neurodkg.ttl")
label_predicates = ["http://www.w3.org/1999/02/22-rdf-syntax-ns#type"]
kg = KG(location="neurodkg.ttl", label_predicates=label_predicates)

In [27]:
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker
from pyrdf2vec import RDF2VecTransformer

walkers = [RandomWalker(4, 5, UniformSampler())]

# Get entities from the KG
entities = set([])
for v in kg._entities:
    if v.name.startswith('http://www.w3id.org/neurodkg/Instances'):
        entities.add(v.name)
entities = list(entities)
print(entities)

transformer = RDF2VecTransformer(walkers=[walkers])
# "Entities should be a list of URIs that can be found in the Knowledge Graph"
embeddings = transformer.fit_transform(kg, entities)

print(embeddings)

['http://www.w3id.org/neurodkg/Instances/context138', 'http://www.w3id.org/neurodkg/Instances/TargetGroup59', 'http://www.w3id.org/neurodkg/Instances/TargetGroup83C', 'http://www.w3id.org/neurodkg/Instances/context163', 'http://www.w3id.org/neurodkg/Instances/TargetGroup45', 'http://www.w3id.org/neurodkg/Instances/context168', 'http://www.w3id.org/neurodkg/Instances/context97', 'http://www.w3id.org/neurodkg/Instances/TargetGroup85', 'http://www.w3id.org/neurodkg/Instances/context152', 'http://www.w3id.org/neurodkg/Instances/TargetGroup32A', 'http://www.w3id.org/neurodkg/Instances/context18', 'http://www.w3id.org/neurodkg/Instances/context82', 'http://www.w3id.org/neurodkg/Instances/context2', 'http://www.w3id.org/neurodkg/Instances/context33', 'http://www.w3id.org/neurodkg/Instances/context67', 'http://www.w3id.org/neurodkg/Instances/context37', 'http://www.w3id.org/neurodkg/Instances/TargetGroup164', 'http://www.w3id.org/neurodkg/Instances/TargetGroup60', 'http://www.w3id.org/neurodkg

AttributeError: 'list' object has no attribute 'extract'

# Using Remzi's pyRDF2Vec

At https://github.com/MaastrichtU-IDS/pyRDF2Vec/

In [5]:
import random
import os
import requests
import functools
import numpy as np
import rdflib
import pandas as pd
import matplotlib.pyplot as plt
import shutil

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.manifold import TSNE

from rdf2vec.converters import rdflib_to_kg
from rdf2vec.walkers import RandomWalker
from rdf2vec import RDF2VecTransformer

import warnings
warnings.filterwarnings('ignore')

## Import the rdf file (ttl, nt, all other supported by rdflib)

In [7]:
url = 'https://raw.githubusercontent.com/MaastrichtU-IDS/neuro_dkg/master/data/output/neuro_dkg.ttl'
rdf_file ='neurodkg.ttl'
# rdf_file = url.split('/')[-1]
#rdf_file = 'input/covid19-literature-knowledge-graph/sample_kg.nt'
#fileext = '.nq.gz'

# Download the RDF file
with requests.get(url, stream=True) as r:
    with open(rdf_file, 'wb') as f:
        r.raw.read = functools.partial(r.raw.read, decode_content=True)
        shutil.copyfileobj(r.raw, f)

#predicates for Random Walker to follow
label_predicates = ['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']

In [8]:
kg = rdflib_to_kg(rdf_file, filetype='turtle')

100%|██████████| 1421/1421 [00:00<00:00, 24554.67it/s]


In [9]:
# We'll all possible walks of depth 2
random_walker = RandomWalker(2, 4)

# Create embeddings with random walks
transformer = RDF2VecTransformer(walkers=[random_walker], sg=1)


<SparkContext master=local[10] appName=pyspark-shell>


In [10]:
all_entities = kg.get_all_entities()

In [11]:
all_entities[:10]

['http://www.w3id.org/neurodkg/Instances/context169',
 'http://www.w3id.org/neurodkg/Instances/context8',
 'http://purl.bioontology.org/ontology/MEDDRA/10048013',
 'http://www.w3id.org/neurodkg/Instances/TargetGroup74',
 'Lithium',
 'http://www.w3id.org/neurodkg/Instances/context141',
 'http://www.w3id.org/neurodkg/Instances/TargetGroup54',
 'generalized seizures',
 'http://www.w3id.org/drugbank:DB00252',
 'http://www.w3id.org/neurodkg/Instances/context24']

In [12]:
walk_embeddings = transformer.fit_transform(kg, all_entities)

./walks/randwalks_n4_depth2_pagerank_uniform.txt
Time elapsed to generate features: 00:00:05
Extracted 0 walks for 967 instances!
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform


In [13]:
walk_embeddings[:10]

[array([ 1.23020168e-02, -4.67225090e-02, -6.15784526e-02, -2.88769137e-02,
         4.28201519e-02,  3.36477044e-03, -5.46489842e-02,  1.97499562e-02,
         2.23553963e-02,  2.38031764e-02,  2.94231791e-02,  9.18491976e-04,
        -3.66971456e-02,  9.11906138e-02, -2.88504139e-02, -3.31938341e-02,
        -5.32128885e-02,  2.83906888e-02,  1.93762854e-02,  8.44837874e-02,
         7.51941130e-02, -8.91350061e-02, -5.80437435e-03,  4.13249433e-02,
         7.89739415e-02, -4.08709496e-02, -6.06828891e-02,  2.76854243e-02,
        -8.58932287e-02,  4.13404368e-02, -2.07163226e-02,  3.98963168e-02,
        -9.30790976e-02, -1.17831184e-02,  5.38380481e-02, -3.43991034e-02,
        -1.75661184e-02,  3.79525200e-02,  4.47154902e-02, -1.76906474e-02,
         5.93155110e-03,  1.53561030e-02,  7.18699321e-02,  7.27233738e-02,
         6.78872317e-02,  2.52504461e-02,  4.35954109e-02, -4.89797406e-02,
         6.00866880e-03,  2.51026805e-02,  3.71705368e-02,  5.39823286e-02,
        -4.8

In [15]:
len(all_entities)

967

In [16]:
len(walk_embeddings)

967

## Generating a dataframe for entity embeddings

In [17]:
df =pd.DataFrame(zip(all_entities, walk_embeddings), columns=['entity', 'embedding'])
    

In [20]:
# a function for converting entity names
# if you need to provide entity names with CURIE format (e.g. DRUGBANK:DB00012)
def replace_prefix(entity):
    if entity.startswith('http://www.w3id.org/drugbank:'):
        return entity.replace('http://www.w3id.org/drugbank:', 'DRUGBANK:')
    else:
        return entity

df.entity = df.entity.apply(replace_prefix)

### Convert dataframe embeddings to JSON

And store the dataframe in a JSON file, to be imported in the OpenPredict API!

In [21]:
df.to_json('neurodkg_embedding.json',orient='records')

In [22]:
import pandas as pd
import numpy as np
df =pd.read_json('neurodkg_embedding.json',orient='records')

In [23]:
df.head()

Unnamed: 0,entity,embedding
0,http://www.w3id.org/neurodkg/Instances/context169,"[0.012302016800000001, -0.046722509, -0.061578..."
1,http://www.w3id.org/neurodkg/Instances/context8,"[0.0116026448, -0.0446706526, -0.0590210706, -..."
2,http://purl.bioontology.org/ontology/MEDDRA/10...,"[0.0079595456, -0.029062213400000002, -0.03673..."
3,http://www.w3id.org/neurodkg/Instances/TargetG...,"[0.0154349497, -0.0612307899, -0.0794833302000..."
4,Lithium,"[-0.0002488188, -8.322100000000001e-05, 0.0003..."


In [24]:
print(df['entity'])
# np.array(df['embedding'].values)

0      http://www.w3id.org/neurodkg/Instances/context169
1        http://www.w3id.org/neurodkg/Instances/context8
2      http://purl.bioontology.org/ontology/MEDDRA/10...
3      http://www.w3id.org/neurodkg/Instances/TargetG...
4                                                Lithium
                             ...                        
962                                     DRUGBANK:DB00193
963                                     DRUGBANK:DB01086
964                                           ziconotide
965    http://www.w3id.org/neurodkg/Instances/TargetG...
966    http://www.w3id.org/neurodkg/Instances/TargetG...
Name: entity, Length: 967, dtype: object


In [25]:
embedding_mat =[]
for i, row in df.iterrows():
    emb=row['embedding']
    embedding_mat.append(emb)

In [26]:
entities = df.entity.to_list()

### alternatively you can store as csv with X columns (X is the dimension of the emebdding)

In [27]:
df_emb =pd.DataFrame( embedding_mat, columns= ['feature'+str(i) for i in range(len(emb))])

In [28]:
df_emb['entity'] = entities

In [29]:
df_emb.to_csv('neurodkg_embedding.csv', index=False)

In [30]:
df_emb.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature491,feature492,feature493,feature494,feature495,feature496,feature497,feature498,feature499,entity
0,0.012302,-0.046723,-0.061578,-0.028877,0.04282,0.003365,-0.054649,0.01975,0.022355,0.023803,...,0.003829,0.007926,-0.005846,-0.000503,0.016634,-0.061863,0.035371,0.081102,0.001887,http://www.w3id.org/neurodkg/Instances/context169
1,0.011603,-0.044671,-0.059021,-0.026219,0.042432,0.002738,-0.053726,0.017745,0.022102,0.023573,...,0.004447,0.007249,-0.005877,0.000358,0.016213,-0.060918,0.033026,0.078968,0.00173,http://www.w3id.org/neurodkg/Instances/context8
2,0.00796,-0.029062,-0.036739,-0.017166,0.026013,0.003133,-0.033342,0.011258,0.014057,0.014073,...,0.002516,0.00457,-0.004586,-0.000352,0.00932,-0.039472,0.022204,0.050009,0.000899,http://purl.bioontology.org/ontology/MEDDRA/10...
3,0.015435,-0.061231,-0.079483,-0.035929,0.055599,0.004625,-0.07239,0.025209,0.029083,0.030716,...,0.005269,0.010459,-0.007804,0.000284,0.020397,-0.082394,0.045698,0.104531,0.002276,http://www.w3id.org/neurodkg/Instances/TargetG...
4,-0.000249,-8.3e-05,0.000321,0.000216,0.000773,0.000971,0.000285,0.000631,-0.00063,-0.000779,...,-0.000348,-0.000773,0.000499,-0.00013,-0.000326,-0.000675,-0.000712,0.00032,-0.000928,Lithium
