# Indexing Data into the Open Research Knowledge Graph

This notebook shows how to index new resources into the Open Research Knowledge Graph (ORKG) of the TIB Hannover. The papers we will index in this notebook are retrieved from PubMed.

In [1]:
import os
from getpass import getpass

import pandas as pd
from Bio import Entrez, Medline
from orkg import ORKG
from orkg.utils import NamespacedClient, query_params, dict_to_url_params
from orkg.out import OrkgResponse
from orkg.client.resources import ResourcesClient
from orkg import client
from orkg.client.classes import ClassesClient

# Retrieving Papers

In [None]:
# Identify this client to NCBI. If you don't have an API key, leave the key
# empty and just provide your e-mail address.
Entrez.email = "dummy.email@adress.en"
Entrez.api_key = ""  # optional NCBI API key

In [2]:
# Search term for PubMed, typed interactively when this cell runs; it is
# passed to Entrez.esearch in the following cell.
query = input("Enter a query: ")

Enter a query: microbiome obesity


In [None]:
# Ask PubMed for the IDs of the top 10 results for the query.
handle = Entrez.esearch(db="pubmed", term=query, retmax="10")
try:
    record = Entrez.read(handle)
finally:
    # Release the network connection even if parsing the response fails.
    handle.close()

# pubmed_idlist holds one ID list per search (a single search here);
# flat_pubmed_idlist is the flattened list of PubMed IDs used for fetching.
pubmed_idlist = [record["IdList"]]
flat_pubmed_idlist = [item for sublist in pubmed_idlist for item in sublist]

In [4]:
# Download the MEDLINE records of the papers found above.
# MEDLINE is a plain-text format, so it is requested with retmode="text";
# "json" is not a documented return mode for rettype="medline".
handle = Entrez.efetch(db="pubmed", id=flat_pubmed_idlist, rettype="medline", retmode="text")
try:
    # Medline.parse reads from the handle while it is iterated, so the records
    # are collected into a list before the handle is closed.
    records = Medline.parse(handle)
    data_list = list(records)
finally:
    handle.close()

In [5]:
# Build a table from the fetched records: one row per entry of data_list.
df = pd.DataFrame(data_list)

In [7]:
# Keep only the MEDLINE fields used below: TI (title), AB (abstract) and
# AID (article identifier).
# .copy() makes the selection an independent frame, so adding the "keywords"
# column afterwards does not raise pandas' SettingWithCopyWarning.
df = df[["TI", "AB", "AID"]].copy()

# Adding triples manually
The ORKG works with triples based on the Human Disease Ontology. So we have to add them manually, since there are no Python modules or NLP software to do so.

In [None]:
# Add a "keywords" column, initialised to empty strings; the following cell
# fills it with one hand-written triple per paper.
df["keywords"] = ""

In [None]:
# Hand-written triples, one per paper, listed in row order (row labels 0-9).
# NOTE(review): some labels look misspelled ("dietry", "Trimethylamin-N-Oxid",
# "infuences") and "cause" differs from "causes" used elsewhere; they are left
# unchanged because these strings are written to the exported CSV. Confirm
# before correcting.
keyword_triples = [
    ["microbial colonization", "influences", "liver"],
    ["commensal microbiota", "promoting", "Inflammatory bowel disease"],
    ["high fat-diet", "causes", "weight gain"],
    ["gut microbiota", "influences", "diabetes"],
    ["dietry calcium", "decreases", "obesity"],
    ["low carb diet", "reduces", "Enterobacteriaceae"],
    ["Trimethylamin-N-Oxid", "cause", "arrhythmia"],
    ["probiotic bound ACE-2 delivery", "harms", "SARS-CoV-2"],
    ["gut virome", "infuences", "obesity"],
    ["early pubertal", "causes", "obesity"],
]

# A Python list can only be stored in a single cell of an object-dtype column.
df["keywords"] = df["keywords"].astype(object)

# Write each triple with .at (label-based access to one cell). Chained
# assignment such as df["keywords"][0] = ... may modify a temporary copy
# instead of df and is not supported under pandas Copy-on-Write.
for row_label, triple in enumerate(keyword_triples):
    df.at[row_label, "keywords"] = triple

In [11]:
# Export the annotated table to a CSV file (relative path, so it is written
# to the current working directory).
df.to_csv("papers_about_obesity.csv")

# Adding the file as a resource to the ORKG

In [12]:
# Create the ORKG client.
# The credentials come from the environment variables ORKG_EMAIL and
# ORKG_PASSWORD; if one is not set, it is asked for interactively (the
# password without echo). This keeps real credentials out of the saved notebook.
orkg_email = os.environ.get("ORKG_EMAIL") or input("ORKG e-mail address: ")
orkg_password = os.environ.get("ORKG_PASSWORD") or getpass("ORKG password: ")
orkg = ORKG(host="https://www.orkg.org/orkg", creds=(orkg_email, orkg_password))

In [13]:
# Call get() on the resources client; the response is inspected in the
# following cells to check the connection to the backend.
response = orkg.resources.get()

In [14]:
# Display the URL stored on the response (the resources endpoint in the
# recorded output).
response.url

'https://www.orkg.org/orkg/api/resources/'

In [15]:
# True in the recorded output: the request succeeded, so the ORKG backend
# is reachable.
# NOTE(review): presumably this does not verify the login credentials; confirm.
response.succeeded

True

In [16]:
# Display the status code of the response (200 in the recorded output).
response.status_code

200

In [None]:
# Finally, push the dataset into the ORKG.
# The file name must be the CSV written by df.to_csv earlier in this notebook
# ("papers_about_obesity.csv"); no other file is created here.
datasetID = orkg.resources.save_dataset(file="papers_about_obesity.csv", label="Papers about obesity", dimensions=["AID"])
datasetID