### 5. Extraction Notebook - IBM Knowgl-large

##### In this notebook, we:
1. use the same recent versions of Python, PyTorch and Transformers as in the Extraction Notebook - REBEL.
2. load the data that we preprocessed in the previous notebooks.
3. set the generation parameters for the model
4. define a post-processing function to shape the triples from the IBM/knowgl-large output
5. execute the extraction
6. use the function to get wikidata IDs.
7. make an entity dict: 
    * entity_dict[KnowGL_extracted_string_in_subject_or_object_position] = wikidata_id_if_any
8. make a relation dict: 
    * relation_dict[KnowGL_extracted_string_in_predicate_position] = wikidata_id_if_any
9. make a type dict: 
    * type_dict[KnowGL_extracted_string_in_subject_or_object_position_with_type_details] = wikidata_id_if_any
10. convert KnowGL triples into Wikidata triples.
11. save the extracted triples in a jsonl file, and the entity, relation and type dicts in separate json files.

In [1]:
#KnowGL environment details:
!python --version
import torch
print('Pythorch version: ', torch.__version__)
import transformers
print('Transformers version: ', transformers.__version__)
import json
import requests

Python 3.10.9
Pythorch version:  2.0.1
Transformers version:  4.30.1


In [2]:
# Read the chunks preprocessed in the previous notebook: one JSON object per line.
# rstrip('None\n') removes any trailing run of the characters N, o, n, e and newline
# before parsing -- presumably the input file carries a stray "None" after each
# record (verify against the notebook that wrote it).
with open('data/preprocessed_data_for_extraction.jsonl', 'r', encoding='utf-8') as infile:
    data = [json.loads(raw_line.rstrip('None\n')) for raw_line in infile]
print('Number of data instances: ', len(data))

Number of data instances:  917


In [3]:
# Fetch the KnowGL tokenizer and seq2seq model from the HuggingFace hub,
# then move the model onto the GPU.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

KNOWGL_CHECKPOINT = "ibm/knowgl-large"
tokenizer = AutoTokenizer.from_pretrained(KNOWGL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(KNOWGL_CHECKPOINT)
model = model.to("cuda")

In [4]:
# Generation settings handed to model.generate() for every chunk.
gen_kwargs = dict(
    max_length=1024,
    length_penalty=0,
    # 10 beams is not the default; chosen to obtain more diverse results.
    num_beams=10,
    # 10 returned sequences is not the default; chosen to capture long-tail triples.
    num_return_sequences=10,
)

In [5]:
# Post-processing function that shapes the KnowGL output into triples.
def extract_triples(decoded_preds):
    """Parse decoded KnowGL generations into triples, entity facts, entities and relations.

    Each generation is read as ``$``-separated facts of the form
    ``subject|relation|object`` where subject and object are each
    ``mention#label#type``. Special tokens (``<s>``, ``<pad>``, ``</s>``) and
    all brackets/parentheses are removed first (this also removes parentheses
    that occur inside a mention). Surrounding whitespace is trimmed from the
    whole prediction and from every field, so strings such as ``" Nile"`` and
    ``"Nile"`` are not treated as different entities.

    Args:
        decoded_preds: list of strings, each a decoded prediction from KnowGL
            (len(decoded_preds) == num_return_sequences).

    Returns:
        A 4-tuple of sets ``(triples, entity_triples, entity_set, relation_set)``:
        * triples: ``(subject_mention, relation, object_mention)``
        * entity_triples: ``(mention, "label", label)`` and ``(mention, "type", type)``
        * entity_set: every subject/object mention that was recorded
        * relation_set: every relation string that was recorded
    """
    triples = set()
    entity_set = set()
    entity_triples = set()
    relation_set = set()

    for pred in decoded_preds:
        # Drop special tokens and the bracket/parenthesis wrappers around each fact.
        for token in ("<s>", "<pad>", "</s>", "[", "]", "(", ")"):
            pred = pred.replace(token, "")
        # Trim the whitespace left behind by the removed tokens.
        pred = pred.strip()
        if not pred:
            continue

        # str.split returns the whole string as a single item when '$' is absent.
        for fact in pred.split('$'):
            triple = [field.strip() for field in fact.split('|')]
            if len(triple) != 3 or '' in triple:
                continue

            sbj = [part.strip() for part in triple[0].split('#')]
            if len(sbj) != 3:
                continue
            entity_set.add(sbj[0])
            entity_triples.add((sbj[0], "label", sbj[1]))
            entity_triples.add((sbj[0], "type", sbj[2]))

            rel = triple[1]
            relation_set.add(rel)

            # A malformed object drops the triple, but the well-formed subject
            # and the relation recorded above are kept.
            obj = [part.strip() for part in triple[2].split('#')]
            if len(obj) != 3:
                continue
            entity_set.add(obj[0])
            entity_triples.add((obj[0], "label", obj[1]))
            entity_triples.add((obj[0], "type", obj[2]))

            triples.add((sbj[0], rel, obj[0]))

    return triples, entity_triples, entity_set, relation_set

In [6]:
# Run KnowGL on every chunk and attach the parsed results to its record (in place).
for record in data:
    encoded = tokenizer(
        record["Chunk text"],
        max_length=1024,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoded["input_ids"].to('cuda')
    attention_mask = encoded["attention_mask"].to('cuda')
    beams = model.generate(input_ids, attention_mask=attention_mask, **gen_kwargs)
    # Special tokens are kept while decoding; extract_triples removes them itself.
    decoded_preds = tokenizer.batch_decode(beams, skip_special_tokens=False)
    triples, entity_triples, entity_set, relation_set = extract_triples(decoded_preds)
    # Sets are turned into lists so that the record can be serialised as JSON.
    record.update({
        "Extracted Triples": list(triples),
        "Entity Triples": list(entity_triples),
        "Extracted Entities": list(entity_set),
        "Extracted Relations": list(relation_set),
    })

In [7]:
# Save the data as JSON Lines: exactly one JSON object per line.
# json.dumps() returns the serialised string; json.dump() writes to the file and
# returns None, so formatting its return value into the line would append the
# literal text "None" to every record.
with open('data/preprocessed_data_with_KnowGL_extracted_triples.jsonl', 'w', encoding='utf-8') as f:
    for line in data:
        f.write(json.dumps(line, ensure_ascii=False) + '\n')

Let's get Wikidata IDs for KnowGL extraction:

In [8]:
# Pool the extracted strings over all records; sets remove duplicates and so
# reduce the number of API calls made later.
all_entities = set()
all_relations = set()
for record in data:
    all_entities.update(record["Extracted Entities"])
    all_relations.update(record["Extracted Relations"])
print(f"Total number of entities: {len(all_entities)}")
print(f"Total number of relations: {len(all_relations)}")

Total number of entities: 3109
Total number of relations: 193


In [9]:
# Gather every distinct (whitespace-stripped) entity type; a set removes
# duplicates and so reduces the number of API calls.
all_types = {
    fact[2].strip()
    for record in data
    for fact in record["Entity Triples"]
    if fact[1] == "type"
}
print(f"Total number of entity types: {len(all_types)}")

# From here on all_types is a sorted list, which gives a consistent order.
all_types = sorted(all_types)

Total number of entity types: 1092


In [10]:
# Function to get Wikidata IDs.
def call_wiki_api(item, item_type='entity'):
    """Look up the Wikidata ID of a string via the ``wbsearchentities`` API.

    Args:
        item: the text to search for.
        item_type: 'entity' searches with the API's default entity type;
            'property' restricts the search to properties.

    Returns:
        The ID of the first search hit, or the string 'no-wikiID' when the
        search term is empty, the request fails, the response cannot be
        parsed, or there are no hits.

    Raises:
        ValueError: if item_type is neither 'entity' nor 'property'.
    """
    if item_type not in ('entity', 'property'):
        raise ValueError(f"item_type must be 'entity' or 'property', got {item_type!r}")
    # Nothing to search for: skip the network round-trip.
    if not str(item).strip():
        return 'no-wikiID'

    # Passing the query as params lets requests URL-encode the search term, so
    # characters such as '&', '#' or '+' cannot corrupt the query string.
    params = {
        'action': 'wbsearchentities',
        'search': item,
        'language': 'en',
        'format': 'json',
    }
    if item_type == 'property':
        params['type'] = 'property'

    try:
        # The timeout keeps one stalled connection from hanging a long lookup loop.
        response = requests.get('https://www.wikidata.org/w/api.php', params=params, timeout=30)
        results = response.json()
        # Return the first id (Could upgrade this in the future)
        return results['search'][0]['id']
    except (requests.RequestException, KeyError, IndexError, ValueError):
        # Best-effort lookup: network errors, non-JSON bodies, API error payloads
        # (no 'search' key) and empty result lists all map to the sentinel.
        return 'no-wikiID'

In [11]:
# Relations first: map every distinct relation string to its Wikidata property ID.
rel_dict = {
    relation: call_wiki_api(relation, 'property')
    for relation in sorted(all_relations)
}

In [12]:
# Write the relation -> Wikidata ID mapping to disk as indented JSON.
with open('data/knowGL_wiki_relations_dict.json', 'w') as relations_file:
    json.dump(rel_dict, relations_file, indent=4)

In [13]:
# Entities next: map every distinct entity string to its Wikidata ID.
ent_dict = {entity: call_wiki_api(entity) for entity in sorted(all_entities)}

In [14]:
# Write the entity -> Wikidata ID mapping to disk as indented JSON.
with open('data/knowGL_wiki_entities_dict.json', 'w') as entities_file:
    json.dump(ent_dict, entities_file, indent=4)

In [15]:
# Finally the predicted types: map every distinct type string to its Wikidata ID.
type_dict = {type_name: call_wiki_api(type_name) for type_name in sorted(all_types)}

In [16]:
# Write the type -> Wikidata ID mapping to disk as indented JSON.
with open('data/knowGL_wiki_types_dict.json', 'w') as types_file:
    json.dump(type_dict, types_file, indent=4)

In [17]:
# Triples' turn: rewrite the string triples of every record with Wikidata IDs.
# A triple is kept only when all of its members resolved to an ID.
for line in data:
    wikidata_triple_list = []

    # Relation triples: subject and object via ent_dict, predicate via rel_dict.
    for subj_str, rel_str, obj_str in line['Extracted Triples']:
        wiki_triple = [ent_dict[subj_str], rel_dict[rel_str], ent_dict[obj_str]]
        if 'no-wikiID' not in wiki_triple:
            wikidata_triple_list.append(wiki_triple)

    # Type facts become "instance of" (P31) triples; label facts are skipped.
    for ent_str, fact_kind, fact_value in line['Entity Triples']:
        if fact_kind == 'label':
            continue
        # type_dict keys were whitespace-stripped when the types were collected,
        # so the lookup key must be stripped too; otherwise a type with
        # surrounding whitespace is wrongly treated as unknown.
        type_id = type_dict.get(fact_value.strip(), 'no-wikiID')
        wiki_triple = [ent_dict[ent_str], 'P31', type_id]
        if 'no-wikiID' not in wiki_triple:
            wikidata_triple_list.append(wiki_triple)

    line['Wikidata Triples'] = sorted(wikidata_triple_list)

In [18]:
# Save the data as JSON Lines: exactly one JSON object per line.
# json.dumps() returns the serialised string; json.dump() writes to the file and
# returns None, so formatting its return value into the line would append the
# literal text "None" to every record.
with open('data/preprocessed_data_with_KnowGL_extracted_triples_plus_wikidata.jsonl', 'w', encoding='utf-8') as f:
    for line in data:
        f.write(json.dumps(line, ensure_ascii=False) + '\n')