### 3. Wikidata Notebook

##### In this notebook, we:
1. load the data in which we saved the extracted triples, entities, and relations.
2. write a function to get wikidata IDs.
3. make an entity dict: 
    * entity_dict[rebel_extracted_string_in_subject_or_object_position] = wikidata_id_if_any
4. make a relation dict: 
    * relation_dict[rebel_extracted_string_in_predicate_position] = wikidata_id_if_any
5. convert REBEL triples into Wikidata triples.
6. save the relation and entity dictionaries as JSON files, and the data with the added Wikidata triples as a jsonl file.

In [1]:
import requests
import json

In [2]:
# Let's start by loading the data we extracted in the previous notebook.
# Each line holds one JSON object. The earlier loader stripped a trailing
# literal "None" from every line, so the file may carry that stray suffix.
# raw_decode() parses the first JSON value on the line and ignores whatever
# follows, which handles both clean lines and lines with trailing text
# (str.rstrip('None\n') strips a *set of characters*, not a suffix).
data = []
_decoder = json.JSONDecoder()
with open('data/preprocessed_data_with_REBEL_extracted_triples.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing on them
        record, _ = _decoder.raw_decode(line)
        data.append(record)
print('Number of data instances: ', len(data))

Number of data instances:  917


In [3]:
#function to get wikidata IDs
def call_wiki_api(item, item_type='entity'):
  """Search Wikidata for `item` and return the ID of the first hit.

  Args:
    item: The label to search for (e.g. 'Barack Obama').
    item_type: 'entity' to search items, or 'property' to search properties.

  Returns:
    The ID of the first search result (e.g. 'Q76' or 'P31'), or the
    sentinel string 'no-wikiID' when nothing is found, the request fails,
    the response cannot be parsed, or `item_type` is not recognised.
    This function never raises for those cases.
  """
  params = {
    'action': 'wbsearchentities',
    'search': item,
    'language': 'en',
    'format': 'json',
  }
  if item_type == 'property':
    params['type'] = 'property'
  elif item_type != 'entity':
    # Unknown search type: keep the best-effort contract and report "no ID".
    return 'no-wikiID'
  try:
    # Passing `params` lets requests URL-encode the search term, so labels
    # containing '&', '#', '+' etc. no longer corrupt the query string.
    # The timeout stops a stalled connection from hanging the notebook.
    response = requests.get('https://www.wikidata.org/w/api.php', params=params, timeout=30)
    # Return the first id (Could upgrade this in the future)
    return response.json()['search'][0]['id']
  except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
    # Network failure, non-JSON body, API error payload or empty result list.
    return 'no-wikiID'

In [4]:
# Quick sanity check of the lookup helper: an entity search, a property
# search, and the same label searched as an entity for comparison.
ent = call_wiki_api('Barack Obama')
prop = call_wiki_api('instance of', item_type='property')
ent2 = call_wiki_api('instance of')
print(ent, prop, ent2, sep='\n')

Q76
P31
Q21503252


In [5]:
# Gather every distinct entity and relation string across all records.
# Sets drop duplicates, so each string is sent to the API only once.
all_entities = set()
all_relations = set()
for record in data:
    record_entities = record["Extracted Entities"]
    record_relations = record["Extracted Relations"]
    all_entities.update(record_entities)
    all_relations.update(record_relations)
print("Total number of entities: {}".format(len(all_entities)))
print("Total number of relations: {}".format(len(all_relations)))

Total number of entities: 3738
Total number of relations: 157


In [6]:
# Resolve every extracted relation string to a Wikidata property ID
# (one API call per distinct relation, in sorted order).
rel_dict = {
    relation: call_wiki_api(relation, item_type='property')
    for relation in sorted(all_relations)
}

In [7]:
# Let's see what it looks like: the number of relations, then one sample lookup.
relation_count = len(rel_dict)
print(relation_count)
has_part_id = rel_dict['has part']
print(has_part_id)

157
P527


In [8]:
# Persist the relation -> Wikidata ID mapping as pretty-printed JSON.
with open('data/rebel_wiki_relations_dict.json', 'w') as out_file:
  serialized = json.dumps(rel_dict, indent=4)
  out_file.write(serialized)

In [9]:
# Same lookup for entities: one default (entity) search per distinct string,
# in sorted order.
ent_dict = {entity: call_wiki_api(entity) for entity in sorted(all_entities)}

In [10]:
# Persist the entity -> Wikidata ID mapping as pretty-printed JSON.
with open('data/rebel_wiki_entities_dict.json', 'w') as out_file:
  serialized = json.dumps(ent_dict, indent=4)
  out_file.write(serialized)

In [11]:
# Translate each REBEL triple (subject, relation, object) into Wikidata IDs.
# A triple is kept only if all three parts resolved to an ID; the surviving
# triples are stored, sorted, under the record's 'Wikidata Triples' key.
for record in data:
    candidates = [
        [ent_dict[triple[0]], rel_dict[triple[1]], ent_dict[triple[2]]]
        for triple in record['Extracted Triples']
    ]
    resolved = [ids for ids in candidates if 'no-wikiID' not in ids]
    resolved.sort()
    record['Wikidata Triples'] = resolved

In [12]:
# Let's save the data as JSON Lines: exactly one JSON object per line.
with open('data/preprocessed_data_with_REBEL_extracted_triples_plus_wikidata.jsonl', 'w', encoding='utf-8') as f:
    for line in data:
        # json.dump() writes to the file and returns None, so writing its
        # return value afterwards appended the text "None" to every line.
        # Serialise to a string first and write it together with the newline.
        f.write(json.dumps(line, ensure_ascii=False) + '\n')