In [1]:
#environment details:
!python --version
import torch
print('Pythorch version: ', torch.__version__)
import transformers
print('Transformers version: ', transformers.__version__)
import json
import requests
import numpy as np

Python 3.10.9
Pythorch version:  2.0.1+cu117
Transformers version:  4.30.2


In [2]:
# Load the records preprocessed in the previous notebook (one JSON object per line).
data = []
with open('data/preprocessed_data_for_extraction.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Some lines presumably end with a stray literal "None" left by the
        # upstream writer (TODO confirm against the previous notebook).
        # Remove it as an exact suffix: str.rstrip('None\n') treats its argument
        # as a *set of characters* and strips any trailing run of them.
        record = line.strip().removesuffix('None').rstrip()
        if not record:
            continue  # tolerate blank lines instead of failing in json.loads
        data.append(json.loads(record))
print('Number of data instances: ', len(data))

Number of data instances:  917


In [3]:
# It is a post-processing function to shape the triples from the KnowGL output
def extract_triples(pred):
    """Parse one decoded sequence into triples, entity facts, entities and relations.

    After special tokens, brackets and parentheses are removed, the sequence is
    read as ``subject|relation|object`` records separated by ``$``, where
    subject and object each have the form ``mention#label#type``.

    Whitespace around every field is stripped so that ``"Paris # Paris # city"``
    and ``"Paris#Paris#city"`` yield the same strings.

    Args:
        pred: A single decoded sequence (str).

    Returns:
        A 4-tuple of sets ``(triples, entity_triples, entity_set, relation_set)``:
        * triples: ``(subject_mention, relation, object_mention)``
        * entity_triples: ``(mention, "label", label)`` and ``(mention, "type", type)``
        * entity_set: all mentions that were recorded
        * relation_set: all relations that were recorded

    Records that do not have exactly three non-empty ``|`` fields are skipped.
    If the subject is well formed but the object is not, the subject facts and
    the relation are still recorded while no triple is emitted.
    """
    triples = set()
    entity_set = set()
    entity_triples = set()
    relation_set = set()

    # Remove special tokens and the bracket/parenthesis decoration.
    for token in ("<s>", "<pad>", "</s>", "[", "]", "(", ")"):
        pred = pred.replace(token, "")
    pred = pred.strip()

    # str.split returns a one-element list when '$' is absent, so a single
    # record needs no special case.
    for record in pred.split('$'):
        fields = [field.strip() for field in record.split('|')]
        if len(fields) != 3 or not all(fields):
            continue

        sbj = [part.strip() for part in fields[0].split('#')]
        if len(sbj) != 3:
            continue
        entity_set.add(sbj[0])
        entity_triples.add((sbj[0], "label", sbj[1]))
        entity_triples.add((sbj[0], "type", sbj[2]))

        rel = fields[1]
        relation_set.add(rel)

        obj = [part.strip() for part in fields[2].split('#')]
        if len(obj) != 3:
            continue
        entity_set.add(obj[0])
        entity_triples.add((obj[0], "label", obj[1]))
        entity_triples.add((obj[0], "type", obj[2]))

        triples.add((sbj[0], rel, obj[0]))

    return triples, entity_triples, entity_set, relation_set

In [4]:
# Load the KnowGL tokenizer and seq2seq model from the Hugging Face Hub.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Prefer the GPU, but fall back to the CPU so this cell does not fail on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("ibm/knowgl-large")
model = AutoModelForSeq2SeqLM.from_pretrained("ibm/knowgl-large").to(device)
# Alternative checkpoint kept for reference:
#tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
#model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large").to("cuda")

In [5]:
# Generation parameters handed to model.generate().
gen_kwargs = dict(
    max_length=1024,
    length_penalty=0,
    # Not the default: 10 beams were chosen to obtain more diverse results.
    num_beams=10,
    # Not the default: 10 returned sequences were chosen for long-tail triple extraction.
    num_return_sequences=10,
    return_dict_in_generate=True,
    output_scores=True,
)

In [6]:
# Extraction time!
# Corpus-wide vocabularies, filled while iterating over the records.
all_entities = set()
all_relations = set()
all_types = set()

# The sequence scores must be reconstructed with the length penalty that
# generate() actually used. gen_kwargs is what is passed to generate(), so its
# value takes precedence; the model's generation config is only the fallback.
length_penalty = gen_kwargs.get("length_penalty", model.generation_config.length_penalty)

for line in data:
    inputs = line["Chunk text"]
    model_inputs = tokenizer(inputs, max_length=1024, padding=True, truncation=True, return_tensors='pt')
    # Send the inputs to wherever the model lives instead of assuming CUDA.
    outputs = model.generate(
        model_inputs["input_ids"].to(model.device),
        attention_mask=model_inputs["attention_mask"].to(model.device),
        **gen_kwargs,
    )
    transition_scores = model.compute_transition_scores(
        outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
    )

    # `inputs` is the raw text, so the prompt length for decoder-only models
    # has to be read from the tokenised batch.
    input_length = 1 if model.config.is_encoder_decoder else model_inputs["input_ids"].shape[1]
    output_length = input_length + np.sum(transition_scores.cpu().numpy() < 0, axis=1)
    reconstructed_scores = transition_scores.cpu().sum(axis=1) / (output_length**length_penalty)

    all_triples_dict = dict()
    all_entity_triples_dict = dict()
    entity_set = set()
    relation_set = set()
    type_set = set()
    for seq, seq_score, prob in zip(outputs.sequences, reconstructed_scores, np.exp(reconstructed_scores)):
        pred = tokenizer.decode(seq, skip_special_tokens=False)
        triples, entity_triples, entities, relations = extract_triples(pred)
        score_info = {'prob': float(f'{prob:.2f}'), 'log_prob': float(f'{seq_score:.2f}')}

        # A triple can occur in several returned sequences. Keep the score of
        # the best sequence containing it rather than letting whichever
        # sequence is processed last overwrite it.
        for triple in triples:
            best = all_triples_dict.get(triple)
            if best is None or score_info['log_prob'] > best['log_prob']:
                all_triples_dict[triple] = dict(score_info)

        for entity_triple in entity_triples:
            if entity_triple[1] == 'type':
                type_set.add(entity_triple[2])
            best = all_entity_triples_dict.get(entity_triple)
            if best is None or score_info['log_prob'] > best['log_prob']:
                all_entity_triples_dict[entity_triple] = dict(score_info)

        entity_set.update(entities)
        relation_set.update(relations)

    all_entities.update(entity_set)
    all_relations.update(relation_set)
    all_types.update(type_set)

    # Attach the per-record results to the record itself.
    line["Extracted Triples and Probabilities"] = list(all_triples_dict.items())
    line["Entity Triples and Probabilities"] = list(all_entity_triples_dict.items())
    line["Extracted Entities"] = list(entity_set)
    line["Extracted Relations"] = list(relation_set)


In [7]:
# Report the size of each vocabulary gathered during extraction.
summary_counts = (
    ('Number of entities: ', all_entities),
    ('Number of relations: ', all_relations),
    ('Number of types: ', all_types),
)
for caption, collected in summary_counts:
    print(caption, len(collected))

Number of entities:  3109
Number of relations:  193
Number of types:  1097


In [8]:
#function to get wikidata IDs
def call_wiki_api(item, item_type='entity'):
  """Return the Wikidata id of the first search hit for ``item``.

  Args:
    item: Text to search for.
    item_type: 'entity' (default) for a plain search, or 'property' to
      restrict the search to properties.

  Returns:
    The id of the first result, or the sentinel 'no-wikiID' when the request
    fails, the response cannot be parsed, there is no result, or
    ``item_type`` is neither 'entity' nor 'property'.
  """
  params = {
    'action': 'wbsearchentities',
    'search': item,
    'language': 'en',
    'format': 'json',
  }
  if item_type == 'property':
    params['type'] = 'property'
  elif item_type != 'entity':
    # Unknown kinds have always produced the sentinel; keep that contract.
    return 'no-wikiID'
  try:
    # Passing the query as `params` lets requests URL-encode the search term,
    # so labels containing '&', '#', '+' or spaces cannot corrupt the query.
    # The timeout keeps one stalled request from blocking the whole loop.
    response = requests.get('https://www.wikidata.org/w/api.php', params=params, timeout=30)
    # Return the first id (Could upgrade this in the future)
    return response.json()['search'][0]['id']
  except (requests.RequestException, ValueError, LookupError, TypeError):
    # Network failure, invalid JSON, or missing/empty 'search' results.
    return 'no-wikiID'

In [9]:
# Resolve every extracted relation (in sorted order) to a Wikidata property id.
rel_dict = {
    relation: call_wiki_api(relation, 'property')
    for relation in sorted(all_relations)
}

In [10]:
# Persist the relation lookup as indented JSON.
with open('data/knowGL_with_probabilities_wiki_relations_dict.json', 'w') as out_file:
  out_file.write(json.dumps(rel_dict, indent=4))

In [11]:
# Resolve every extracted entity (in sorted order) to a Wikidata id.
ent_dict = {
    entity: call_wiki_api(entity)
    for entity in sorted(all_entities)
}

In [12]:
# Persist the entity lookup as indented JSON.
with open('data/knowGL_with_probabilities_wiki_entities_dict.json', 'w') as out_file:
  out_file.write(json.dumps(ent_dict, indent=4))

In [13]:
# Resolve every extracted entity type (in sorted order) to a Wikidata id.
type_dict = {
    entity_type: call_wiki_api(entity_type)
    for entity_type in sorted(all_types)
}

In [14]:
# Persist the entity-type lookup as indented JSON.
with open('data/knowGL_with_probabilities_wiki_types_dict.json', 'w') as out_file:
  out_file.write(json.dumps(type_dict, indent=4))

In [15]:
# All Wikidata ids are resolved; the next step builds triples out of those ids.
# The saved per-kind dictionaries will be used later to add sameAs triples;
# at this stage a single combined lookup is built.
# When the same key appears in more than one dictionary, the rightmost one
# wins (types over relations over entities).
one_lookup_dict = {**ent_dict, **rel_dict, **type_dict}

In [32]:
# Translate every extracted triple into Wikidata ids, keeping its scores.
# Each position is looked up in the dictionary of its own kind (entity,
# relation, type): the same label can be, e.g., both a relation and an entity
# type, and a merged lookup would return whichever id was stored last.
for line in data:
    wiki_triples = []

    # Relational triples: (subject mention, relation, object mention).
    for (head, relation, tail), scores in line["Extracted Triples and Probabilities"]:
        wiki_triple = (ent_dict[head], rel_dict[relation], ent_dict[tail]), scores
        wiki_triples.append(wiki_triple)

    # Entity-attribute triples: (mention, "label" | "type", value).
    # Handling them in their own loop means a model-produced relation that
    # happens to be called "type" or "label" is not mistaken for one of these.
    for (head, attribute, value), scores in line["Entity Triples and Probabilities"]:
        if attribute != 'type':
            continue  # "label" triples have no Wikidata counterpart here
        # "type" is expressed with Wikidata property P31.
        wiki_triple = (ent_dict[head], 'P31', type_dict[value]), scores
        wiki_triples.append(wiki_triple)

    line["Wikidata Triples and Probabilities"] = wiki_triples
    

In [33]:
# Save all the enriched records as JSON Lines (one JSON object per line).
with open('data/preprocessed_data_with_KnowGL_extracted_triples_wikidataIDs_and_probabilities.jsonl', 'w', encoding='utf-8') as f:
    for line in data:
        # json.dump() writes to the file and returns None, so formatting its
        # return value appended the text "None" to every record. Serialise to
        # a string first and terminate the record with a single newline.
        f.write(json.dumps(line, ensure_ascii=False) + '\n')