In [1]:
#environment details:
!python --version
import torch
print('Pythorch version: ', torch.__version__)
import transformers
print('Transformers version: ', transformers.__version__)
import json
import requests
import numpy as np

Python 3.10.9
Pythorch version:  2.0.1+cu117
Transformers version:  4.30.2


In [2]:
# Let's start by loading the data we preprocessed in the previous notebook.
data = []
with open('data/preprocessed_data_for_extraction.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.rstrip('None\n')))
print('Number of data instances: ', len(data))

Number of data instances:  917


In [3]:
# It is a post-processing function to shape the triples from the KnowGL output
def extract_triples(pred):
    """ Just one sequence is passed as argument """
    triples = set()
    entity_set = set()
    entity_triples = set()
    relation_set = set()
    pred = pred.replace("<s>", "").replace("<pad>", "").replace("</s>", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "") # we remove the brackets and parenthesisstrip() # remove special tokens
        
    if '$' in pred:
        pred = pred.split('$')
        
    else:
        pred = [pred]
            
    pred = [triple.split('|') for triple in pred]   
        
    for triple in pred:
        if len(triple) != 3:
            continue
        if triple[0] == '' or triple[1] == '' or triple[2] == '':
            continue
            
        sbj = triple[0].split('#')
        if len(sbj) != 3:
            continue
        entity_set.add(sbj[0])
        entity_triples.add((sbj[0], "label", sbj[1]))
        entity_triples.add((sbj[0], "type", sbj[2]))
            
        rel = triple[1]
        relation_set.add(rel)
            
        obj = triple[2].split('#')
        if len(obj) != 3:
            continue
        entity_set.add(obj[0])
        entity_triples.add((obj[0], "label", obj[1]))
        entity_triples.add((obj[0], "type", obj[2]))
            
        triples.add((sbj[0], rel, obj[0]))

    return triples, entity_triples, entity_set, relation_set

In [4]:
# we call the tokenizer and the model from the HuggingFace library
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ibm/knowgl-large")
model = AutoModelForSeq2SeqLM.from_pretrained("ibm/knowgl-large").to("cuda")
#tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
#model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large").to("cuda")

In [5]:
# we set the generation parameters for the model
gen_kwargs = {
    "max_length": 1024,
    "length_penalty": 0,
    "num_beams": 10, # 10 beams is NOT the default value but we opted for it to get more diverse results
    "num_return_sequences": 10, # 10 sequences is NOT the default value but we opted for it to get long tail triple extraction
    "return_dict_in_generate": True, 
    "output_scores": True,
}

In [16]:
# Extraction time!
for line in data[:5]:
    inputs = line["Chunk text"] 
    model_inputs = tokenizer(inputs, max_length=1024, padding=True, truncation=True, return_tensors = 'pt')
    #print(model_inputs['input_ids'].size())
    outputs = model.generate(
                            model_inputs["input_ids"].to('cuda'),
                            attention_mask=model_inputs["attention_mask"].to('cuda'),
                            **gen_kwargs,
                            )
    #decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
    transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, 
                                                        outputs.beam_indices, normalize_logits=False)
    
    input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
    output_length = input_length + np.sum(transition_scores.cpu().numpy() < 0, axis=1)
    length_penalty = model.generation_config.length_penalty
    reconstructed_scores = transition_scores.cpu().sum(axis=1) / (output_length**length_penalty)
    
    all_triples_dict = dict()
    all_entity_triples_dict = dict()
    entity_set = set()
    relation_set = set()
    for seq, seq_score, prob in zip(outputs.sequences, reconstructed_scores, np.exp(reconstructed_scores)):
        pred = tokenizer.decode(seq, skip_special_tokens=False)
        triples, entity_triples, entities, relations = extract_triples(pred)
        triple_dict = dict()
        entity_triple_dict = dict()
        for triple in triples:
            triple_dict[triple] = {'prob': float(f'{prob:.2f}'), 'log_prob': float(f'{seq_score:.2f}')}
        all_triples_dict.update(triple_dict)
            
        for entity_triple in entity_triples:
            entity_triple_dict[entity_triple] = {'prob': float(f'{prob:.2f}'), 'log_prob': float(f'{seq_score:.2f}')}
        all_entity_triples_dict.update(entity_triple_dict)
            
        for entity in entities:
            entity_set.add(entity)
            
        for relation in relations:  
            relation_set.add(relation)
    
    line["Extracted Triples and Probabilities"] = list(all_triples_dict.items())
    line["Entity Triples and Probabilities"] = list(all_entity_triples_dict.items())
    line["Extracted Entities"] = list(entity_set)
    line["Extracted Relations"] = list(relation_set)
    print(line)    


{'Document id': 1, 'Document url': 'https://en.wikipedia.org/wiki/Adidas', 'Document name': 'Adidas', 'Chunk id': 1, 'Chunk text': 'Adidas AG (German: [ˈʔadiˌdas] (listen); stylized as adidas since 1949) is a German athletic apparel and footwear corporation headquartered in Herzogenaurach, Bavaria, Germany. Adidas AG is the largest sportswear manufacturer in Europe, and the second largest in the world, after Nike.', 'Extracted Triples and Probabilities': [(('Adidas', 'headquarters location', 'Herzogenaurach'), {'prob': 0.91, 'log_prob': -0.09}), (('Germany', 'contains administrative territorial entity', 'Bavaria'), {'prob': 0.91, 'log_prob': -0.09}), (('German', 'contains administrative territorial entity', 'Bavaria'), {'prob': 0.93, 'log_prob': -0.07}), (('Adidas', 'headquarters location', 'Herzogenaurach, Bavaria'), {'prob': 0.9, 'log_prob': -0.11}), (('Adidas AG', 'headquarters location', 'Herzogenaurach'), {'prob': 0.92, 'log_prob': -0.08})], 'Entity Triples and Probabilities': [((

In [17]:
# Let's save the data in a jsonl file
with open('data/preprocessed_data_with_KnowGL_extracted_triples_and_probabilities.jsonl', 'w', encoding='utf-8') as f:
    for line in data:
        line = json.dump(line, f, ensure_ascii=False)
        f.write(f'{line}\n')