# Use this notebook

To run this notebook, start a `jupyter/all-spark-notebook` container by running `docker-compose up` from the `docs/` folder:

```
cd docs
docker-compose up
```

Then open the notebook at http://localhost:8888

In [2]:
!pip install -r requirements.txt

Collecting pyRDF2vec
  Downloading pyrdf2vec-0.1.1-py3-none-any.whl (29 kB)
Collecting gensim
  Downloading gensim-3.8.3-cp38-cp38-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 2.1 MB/s eta 0:00:011
Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 88 kB/s  eta 0:00:01
[?25hCollecting tomlkit<0.8.0,>=0.7.0
  Downloading tomlkit-0.7.0-py2.py3-none-any.whl (32 kB)
Collecting rdflib<6.0.0,>=5.0.0
  Downloading rdflib-5.0.0-py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 6.4 MB/s eta 0:00:01
[?25hCollecting python-louvain<0.15,>=0.14
  Downloading python-louvain-0.14.tar.gz (19 kB)
Collecting flask<2.0.0,>=1.1.2
  Downloading Flask-1.1.2-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 2.1 MB/s eta 0:00:01
Collecting SPARQLWrapper<2.0.0,>=1.8.5
  Downloading SPARQLWrapper-1.8.5-py3-none-any.whl (26 kB)
Collecting mimeparse<0.2.0,

In [3]:
import random
import os
import requests
import functools
import numpy as np
import rdflib
import pandas as pd
import matplotlib.pyplot as plt
import shutil

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.manifold import TSNE

from rdf2vec.converters import rdflib_to_kg
from rdf2vec.walkers import RandomWalker
from rdf2vec import RDF2VecTransformer

import warnings
warnings.filterwarnings('ignore')

## Import the RDF file (ttl, nt, or any other format supported by rdflib)

In [4]:
url = 'https://raw.githubusercontent.com/MaastrichtU-IDS/neuro_dkg/master/data/output/neuro_dkg.ttl'
rdf_file = 'neurodkg.ttl'
# Alternative inputs kept from the original notebook for reference:
# rdf_file = url.split('/')[-1]
# rdf_file = 'input/covid19-literature-knowledge-graph/sample_kg.nt'
# fileext = '.nq.gz'

# Download the RDF file, streaming it straight to disk.
with requests.get(url, stream=True, timeout=60) as r:
    # Fail fast on 4xx/5xx responses so an error page is never saved as the
    # RDF file (which would only surface later as a confusing parse error).
    r.raise_for_status()
    # Have the raw stream undo any gzip/deflate content encoding while copying.
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    # The output file is opened only after the response is known to be good,
    # so a failed download does not truncate an existing local copy.
    with open(rdf_file, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

# Predicates for the random walker to follow.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
label_predicates = ['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']

In [5]:
# Parse the downloaded Turtle file into the knowledge-graph object `kg`,
# which the following cells query for entities and pass to the transformer.
kg = rdflib_to_kg(rdf_file, filetype='turtle')

100%|██████████| 1421/1421 [00:00<00:00, 34703.03it/s]


In [6]:
# Random-walk strategy. The positional arguments are presumably
# (walk depth = 2, walks per entity = 4), which matches the walk file name
# `randwalks_n4_depth2_...` in the recorded output — TODO confirm against the
# RandomWalker signature. With a cap of 4 this is a sample, not every possible walk.
random_walker = RandomWalker(2, 4)

# Transformer that will turn the walks into embeddings.
# NOTE(review): sg=1 presumably selects the skip-gram word2vec variant — confirm.
transformer = RDF2VecTransformer(walkers=[random_walker], sg=1)


<SparkContext master=local[10] appName=pyspark-shell>


In [7]:
# Collect every entity of the graph; these are the instances that get embedded below.
all_entities = kg.get_all_entities()

In [8]:
# Peek at the first ten entities (a mix of IRIs and plain literal labels).
all_entities[:10]

['clonus',
 'http://www.w3id.org/neurodkg/Instances/context37',
 'limitation of motion',
 'http://www.w3id.org/neurodkg/Instances/TargetGroup91',
 'http://purl.bioontology.org/ontology/MEDDRA/10041552',
 'Infantile Spasms',
 'http://www.w3id.org/drugbank:DB00490',
 'anergetic',
 'http://www.w3id.org/drugbank:DB00843',
 'http://www.w3id.org/drugbank:DB00371']

In [9]:
# Generate the walks and fit the embedding model for every entity.
# The result has one vector per entity (the length checks below confirm 967 == 967);
# the later zip with `all_entities` assumes the two share the same order.
# NOTE(review): the recorded output reports "Extracted 0 walks for 967 instances!" —
# confirm that walks were actually extracted and that this is only a logging issue.
walk_embeddings = transformer.fit_transform(kg, all_entities)

./walks/randwalks_n4_depth2_pagerank_uniform.txt
Time elapsed to generate features: 00:00:04
Extracted 0 walks for 967 instances!
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform
Processing  uniform


In [10]:
# Inspect the first ten embedding vectors.
walk_embeddings[:10]

[array([ 7.41582073e-04,  8.60859640e-04, -4.02746809e-04,  6.44611311e-04,
        -9.48738190e-04,  7.37897004e-04,  6.61185477e-04,  1.74433575e-04,
        -5.91042990e-05,  6.42210885e-04, -8.08407262e-04,  7.46505568e-04,
         2.70771794e-04, -5.99663217e-05,  2.69634183e-04,  3.35388526e-04,
         4.77781781e-04,  8.34209641e-05,  2.87292933e-04, -9.76035313e-04,
         6.52936869e-04, -1.48218824e-04,  5.51302859e-04, -6.41355524e-04,
         6.18040140e-05,  1.66193116e-04,  7.63053249e-04,  7.11607136e-05,
         5.52853802e-04,  8.77468265e-04, -3.54619027e-04, -1.90403098e-05,
         4.84297780e-04,  6.84853876e-04, -2.06527620e-04,  9.11613752e-04,
         3.99925775e-04, -4.53810819e-04,  9.59778845e-04,  2.64298375e-04,
        -5.46904746e-04, -1.09108900e-04,  1.64963581e-04, -7.50181556e-04,
        -2.19578840e-04, -8.25243304e-04,  8.52595374e-04, -7.61630363e-04,
        -7.12842797e-04, -4.20840341e-04,  8.65164853e-04,  4.35255381e-04,
         8.9

In [11]:
# Sanity check: number of entities ...
len(all_entities)

967

In [12]:
# ... must equal the number of embedding vectors returned.
len(walk_embeddings)

967

## Generating a dataframe for entity embeddings

In [13]:
# Pair each entity with its embedding vector, one row per entity.
entity_embedding_pairs = list(zip(all_entities, walk_embeddings))
df = pd.DataFrame(entity_embedding_pairs, columns=['entity', 'embedding'])
    

In [14]:
def replace_prefix(entity, prefix_map=None):
    """Convert an entity IRI to CURIE form (e.g. ``DRUGBANK:DB00012``).

    Only a *leading* namespace is rewritten; any later occurrence of the same
    namespace inside the identifier is left untouched. Entities that match no
    known namespace (other IRIs, plain literal labels) are returned unchanged.

    Args:
        entity: Entity identifier as a string.
        prefix_map: Optional mapping of IRI namespace -> CURIE prefix. When
            omitted, only the DrugBank namespace is converted.

    Returns:
        The CURIE-style identifier, or ``entity`` itself when nothing matches.
    """
    if prefix_map is None:
        prefix_map = {'http://www.w3id.org/drugbank:': 'DRUGBANK:'}
    for namespace, curie_prefix in prefix_map.items():
        if entity.startswith(namespace):
            # Slice instead of str.replace so that only the prefix is swapped.
            return curie_prefix + entity[len(namespace):]
    return entity

# Rewrite the entity column in place with CURIE-style identifiers.
df['entity'] = df['entity'].apply(replace_prefix)

### Convert dataframe embeddings to JSON

Store the dataframe in a JSON file so that it can be imported into the OpenPredict API.

In [15]:
# Write one JSON record per row, each holding the `entity` and `embedding` columns.
# NOTE(review): pandas' to_json rounds floats to `double_precision` decimal places
# (default 10) — confirm that is precise enough for the consumer of this file.
df.to_json('neurodkg_embedding.json',orient='records')

In [16]:
# Reload the embeddings from the JSON file written above.
# NOTE(review): pandas and numpy are already imported in the import cell at the top;
# these repeated imports presumably let the notebook be resumed from this cell after
# a kernel restart — confirm, otherwise they are redundant.
import pandas as pd
import numpy as np
# Rebinds `df`: from here on it holds the data as read back from disk.
df =pd.read_json('neurodkg_embedding.json',orient='records')

In [17]:
# Check that the reloaded frame has the expected `entity` / `embedding` columns.
df.head()

Unnamed: 0,entity,embedding
0,clonus,"[0.0007415821, 0.0008608596, -0.0004027468, 0...."
1,http://www.w3id.org/neurodkg/Instances/context37,"[0.056904450100000004, -0.0100546516, 0.029304..."
2,limitation of motion,"[0.0009773159, -0.0009591056, -0.0008879253, 0..."
3,http://www.w3id.org/neurodkg/Instances/TargetG...,"[0.030828792600000002, -0.0053162156, 0.016528..."
4,http://purl.bioontology.org/ontology/MEDDRA/10...,"[0.0312968642, -0.0045549772, 0.0163740218, -0..."


In [18]:
# Show the entity column; as the cell's last expression it is displayed by the
# notebook itself, so no print() is needed.
df['entity']

0                                                 clonus
1       http://www.w3id.org/neurodkg/Instances/context37
2                                   limitation of motion
3      http://www.w3id.org/neurodkg/Instances/TargetG...
4      http://purl.bioontology.org/ontology/MEDDRA/10...
                             ...                        
962                                     DRUGBANK:DB00788
963                                         Carisoprodol
964                                        Buprenorphine
965                                           spasticity
966                                             Zolpidem
Name: entity, Length: 967, dtype: object


In [19]:
# Collect the embeddings as a list of vectors, one per row of `df`.
# Reading the column directly replaces the slower row-by-row iterrows() loop.
embedding_mat = df['embedding'].tolist()
# Keep `emb` bound to the last embedding, as the original loop left it, for any
# later cell that still reads it; an empty frame yields an empty vector instead
# of leaving the name undefined.
emb = embedding_mat[-1] if embedding_mat else []

In [20]:
# Entity identifiers as a plain Python list, in the same row order as `df`.
entities = df['entity'].tolist()

### Alternatively, you can store the embeddings as a CSV with X feature columns (X is the dimension of the embedding)

In [21]:
# Expand every embedding into its own `feature<i>` column (feature0, feature1, ...).
# The column names are derived from the matrix itself rather than from the length
# of a variable defined in another cell, so this cell also works for an empty matrix.
df_emb = pd.DataFrame(embedding_mat).add_prefix('feature')

In [22]:
# Append the entity identifiers as the last column. This relies on `entities` and
# `embedding_mat` sharing the same row order (both were taken from `df`).
df_emb['entity'] = entities

In [23]:
# Persist the wide table; index=False keeps the row index out of the CSV.
df_emb.to_csv('neurodkg_embedding.csv', index=False)

In [24]:
# Preview the wide table: the feature columns followed by `entity`.
df_emb.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature491,feature492,feature493,feature494,feature495,feature496,feature497,feature498,feature499,entity
0,0.000742,0.000861,-0.000403,0.000645,-0.000949,0.000738,0.000661,0.000174,-5.9e-05,0.000642,...,0.00039,0.000142,-0.000467,-0.000691,-0.000204,0.000385,0.000402,-0.000373,0.000117,clonus
1,0.056904,-0.010055,0.029305,-0.059149,0.034427,0.058296,0.074192,0.032944,0.034265,-0.022276,...,0.024903,-0.017648,0.042013,-0.045887,0.000966,-0.002043,-0.000251,0.048685,0.080681,http://www.w3id.org/neurodkg/Instances/context37
2,0.000977,-0.000959,-0.000888,0.000214,0.000356,-0.000368,0.001,0.000936,-0.000507,0.000175,...,0.000465,-0.000751,0.000357,-0.000366,0.000929,-0.000456,-0.000991,-0.000911,-0.000555,limitation of motion
3,0.030829,-0.005316,0.016529,-0.033095,0.018734,0.030878,0.040578,0.016433,0.017771,-0.01237,...,0.012305,-0.009729,0.023074,-0.024915,-0.000308,-0.002128,0.000889,0.025407,0.04248,http://www.w3id.org/neurodkg/Instances/TargetG...
4,0.031297,-0.004555,0.016374,-0.032615,0.018491,0.031849,0.04098,0.01814,0.018564,-0.01205,...,0.012958,-0.008794,0.023339,-0.025106,0.000765,-0.000757,0.000308,0.027184,0.045494,http://purl.bioontology.org/ontology/MEDDRA/10...
