In [1]:
import torch
import json
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
import os
import re
import json
import copy
from datasets import load_dataset
from torch.utils.data import DataLoader
import accelerate
from evaluate_utils import *
from tqdm.notebook import tqdm
from evaluate import load


# Dispatch loading

In [3]:
device = torch.device("cuda:7")
tokenizer = AutoTokenizer.from_pretrained("/harddisk/user/keminglu/bigscience_tokenizer")
model = AutoModelForCausalLM.from_pretrained("/data/home/keminglu/workspace/devcloud/finetune_7b_data_v1_epoch_1", torch_dtype=torch.float16).to(device)

TypeError: __init__() got an unexpected keyword argument 'dtype'

## Online Inference

In [3]:
dataset = load_dataset("conll2003", cache_dir="/harddisk/user/keminglu/evaluation_corpus")

Found cached dataset conll2003 (/harddisk/user/keminglu/evaluation_corpus/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
ner_idx_map = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
ner_idx_map_inv = {item:key for key, item in ner_idx_map.items()}
ner_type_map = {"B-PER": "Human", "B-ORG": "Organization", "B-LOC": "Location", "B-MISC": "MISC"}

def process(record):
    tokens = record['tokens']
    ner_tags = record['ner_tags']
    
    ner = []
    tmp = None
    for token, tag in zip(tokens, ner_tags):
        if tag != 0:
            if not tmp:
                tmp = [[token], [ner_idx_map_inv[tag]]]
            else:
                tmp[0].append(token)
                tmp[1].append(ner_idx_map_inv[tag])
        else:
            if tmp:
                x = copy.deepcopy(tmp)
                x[0] = " ".join(x[0])
                x[1] = ner_type_map[x[1][0]]
                ner.append(x)
                tmp = None
    text = " ".join(tokens)
    return {"text": text, "label": ner}
        
data = dataset['test'].map(process)

Loading cached processed dataset at /harddisk/user/keminglu/evaluation_corpus/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-d11d703ce4b96364.arrow


In [13]:
index = 2000
data[index]['text'], data[index]['label']

('A woman was charged on Friday with terrorist offences after three Irish Republican Army mortar bombs were found in a Belfast house , police said .',
 [['Irish Republican Army', 'Organization'], ['Belfast', 'Location']])

In [17]:
result = infer(data[1000]['text'], "", model, tokenizer, device)[0]

entities = json.loads(result)['entities']
outputs = [(item["mention"], item["type"]) for item in entities]
print(outputs)

[('BELFAST', ['Astronomical survey'])]


## Data-to-text Inference

In [4]:
device = torch.device("cuda:7")
tokenizer = AutoTokenizer.from_pretrained("/harddisk/user/keminglu/bigscience_tokenizer")
model = AutoModelForCausalLM.from_pretrained("/data/home/keminglu/workspace/devcloud/finetune_7b_data_v1_epoch_1").to(device)

Loading checkpoint shards:   0%|          | 0/32 [00:00<?, ?it/s]

In [3]:
dataset = load_dataset("web_nlg", "release_v2", cache_dir="/harddisk/user/keminglu/evaluation_corpus/")

Found cached dataset web_nlg (/harddisk/user/keminglu/evaluation_corpus/web_nlg/release_v2/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import re

def transform(text):
    text = text.strip()
    if "_" in text:
        return text.replace("_", " ")
    else:
        return re.sub('(?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])', ' \g<0>', text).lower()

In [5]:
records = []
references = []
for each in tqdm(dataset['test']):
    record = {"entities": [], "triplets": []}
    ref = each['lex']['text']
    each = each['modified_triple_sets']['mtriple_set']
    entities = set()
    relations = defaultdict(list)
    for group in each:
        for triplet in group:
            h, r, t = triplet.split("|")
            h = transform(h)
            r = transform(r)
            t = transform(t)
            entities.add(h)
            entities.add(t)
            relations[(h, t)].append(r)
    for ent in entities:
        record["entities"].append({"mention": ent, "title": ent})
    for h, t in relations:
        record["triplets"].append({"head": h, "tail": t, "relations": [{"title": r} for r in relations[(h, t)]]})
    records.append(record)
    references.append(ref)

  0%|          | 0/1600 [00:00<?, ?it/s]

In [106]:
records[60]

{'entities': [{'mention': 'Rhythm and blues', 'title': 'Rhythm and blues'},
  {'mention': 'Andra (singer)', 'title': 'Andra (singer)'}],
 'triplets': [{'head': 'Andra (singer)',
   'tail': 'Rhythm and blues',
   'relations': [{'title': 'genre'}]}]}

In [10]:
context = json.dumps(records[60])

def generate(context):
    inputs = tokenizer(context, return_tensors="pt")['input_ids'].to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs=inputs,
            num_beams=4,
            do_sample=False,
            length_penalty=1,
            max_new_tokens=256)
    
    prompt_length = len(
        tokenizer.decode(
            inputs[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )
    )
    texts = tokenizer.batch_decode(
        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    text = [text[prompt_length:].strip() for text in texts]
    return text

In [13]:
outputs = []
for record in tqdm(records):
    context = json.dumps(record)
    output = generate(context)
    outputs.append(output)

  0%|          | 0/1600 [00:00<?, ?it/s]

In [14]:
predictions = [output[0] for output in outputs]
bleu = load("bleu")
results = bleu.compute(predictions=predictions, references=references[:len(predictions)])
print(results)

{'bleu': 0.1035299838676711, 'precisions': [0.2648573692551506, 0.13361513687600643, 0.075439852700491, 0.0430324459234609], 'brevity_penalty': 1.0, 'length_ratio': 3.042613465131698, 'translation_length': 100960, 'reference_length': 33182}


In [17]:
evaluated_outputs = []
for inp, pred, ref in zip(records, outputs, references):
    report = {"input": inp, "output": pred, "annotation": ref, "source": "webnlg"}
    evaluated_outputs.append(report)

In [18]:
from pymongo import MongoClient
mongodb_config = {"host": '10.12.192.31', "port": 27017}
client = MongoClient(**mongodb_config)
evaluation_collection = client['structure_lm']['evaluation_inverse']

In [19]:
evaluation_collection.insert_many(evaluated_outputs, ordered=True)

<pymongo.results.InsertManyResult at 0x7fc0c217b430>

In [52]:
## Wiki evaluation
with open("/harddisk/user/keminglu/evaluation_corpus/wiki_eval_output.txt") as f:
    inputs = [json.loads(line) for line in f.readlines()]

In [53]:
def transform(ann):
    record = {"entities": [], "triplets": []}
    for ent in sum(ann['ents'], []):
        item = {"mention": ent["text"], "title": ent["title"]}
        if "description" in ent:
            item["description"] = ent["description"]
        if "aliases" in ent and len(ent["aliases"]) > 0:
            item["aliases"] = ent["aliases"]
        if "type" in ent and len(ent["type"]) > 0:
            item["type"] = ent["type"]
        record["entities"].append(item)
    for rel in ann["relations"]:
        item = {
            "head": rel["subject"]["name"],
            "tail": rel["object"]["name"],
            "relations": [{
                "title": each["name"],
                "description": {"english": each["description"]["english"]},
                "aliases": {"english": each["aliases"]["english"]}
                } for each in rel["property"]]}
        record["triplets"].append(item)
        
    return record

In [54]:
records = []
for each in inputs:
    records.append(transform(each["annotates"]))

In [55]:
outputs = []
for record in tqdm(records):
    context = json.dumps(record)
    output = generate(context)
    outputs.append(output)

  0%|          | 0/100 [00:00<?, ?it/s]

In [73]:
evaluated_outputs = []
for inp, pred, annotate in zip(records, outputs, inputs):
    annotate = annotate["input"]
    report = {"input": inp, "output": pred, "annotate": annotate, "source": "wiki"}
    evaluated_outputs.append(report)

In [74]:
from pymongo import MongoClient
mongodb_config = {"host": '10.12.192.31', "port": 27017}
client = MongoClient(**mongodb_config)
evaluation_collection = client['structure_lm']['evaluation_inverse']

In [75]:
evaluation_collection.insert_many(evaluated_outputs, ordered=True)

<pymongo.results.InsertManyResult at 0x7fc0c21d9520>

## Online Inference

In [4]:
device = torch.device("cuda:7")
tokenizer = AutoTokenizer.from_pretrained("/harddisk/user/keminglu/bigscience_tokenizer")
model = AutoModelForCausalLM.from_pretrained("/data/home/keminglu/workspace/devcloud/finetune_7b_data_v1_epoch_1").to(device)

Loading checkpoint shards:   0%|          | 0/32 [00:00<?, ?it/s]

In [8]:
context = "百度3月16号即将发布中国第一个大模型文言一心。"
infer(context, "", model, tokenizer, device, length_penalty=3)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 7; 31.75 GiB total capacity; 30.14 GiB already allocated; 11.75 MiB free; 30.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
corpus_file_path = "/harddisk/user/keminglu/evaluation_corpus/wiki_eval.txt"
ann_file_path = "/harddisk/user/keminglu/evaluation_corpus/wiki_eval.ann"
data = open(corpus_file_path).readlines()
annotates = open(ann_file_path).readlines()

In [10]:
all_res = []
data = data[:100]
pbar = tqdm(total=len(data))
for each, ann in zip(data, annotates):
    res = infer(each, "", model, tokenizer, device, num_beams=1)
    all_res.append({"input": each, "output": res, "annotates": json.loads(ann)})
    pbar.update(1)

  0%|          | 0/100 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 7; 31.75 GiB total capacity; 30.16 GiB already allocated; 11.75 MiB free; 30.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [26]:
with open(corpus_file_path.replace(".txt", "_output.txt"), "w") as f:
    for res in all_res:
        f.write(json.dumps(res) + "\n")

# Evaluation

In [11]:
property_mapping_file_path = "/harddisk/data/nlp_data/kb/wikidata/20210520/mapping/property_names.json"
entity_mapping_file_path = "/harddisk/data/nlp_data/kb/wikidata/20210520/mapping/qid2sitelinks.enwiki.title.json"
mongodb_config = {"host": '10.12.192.31', "port": 27017}
extractor = KGExtractor(mongodb_config, entity_mapping_file_path, property_mapping_file_path)

In [12]:
evaluator = Evaluator(extractor)

In [33]:
with open("/harddisk/user/keminglu/evaluation_corpus/wiki_eval_output.txt") as f:
    outputs = [json.loads(line) for line in f.readlines()]
print(outputs[0]['annotates'])

{'id': '29900074', 'title': 'Alfa Romeo Tipo 312', 'ents': [[{'end': 162, 'id': '10386732', 'start': 143, 'text': 'Alfa Romeo Tipo 308', 'title': 'Alfa Romeo Tipo 308', 'qid': 'Q785879', 'description': {'english': 'car'}, 'aliases': {}, 'type': []}, {'end': 209, 'id': '29901461', 'start': 190, 'text': 'Alfa Romeo Tipo 316', 'title': 'Alfa Romeo Tipo 316', 'qid': 'Q3611023', 'description': {}, 'aliases': {}, 'type': []}, {'end': 227, 'id': '199965', 'start': 217, 'text': 'V16 engine', 'title': 'V16 engine', 'qid': 'Q2263764', 'description': {'english': 'piston engine with 16 cylinders in vee configuration'}, 'aliases': {'english': ['V16 engine', 'V-16']}, 'type': ['Engine configuration']}], [{'end': 51, 'id': '10578247', 'start': 34, 'text': 'Alfa Romeo 12C-37', 'title': 'Alfa Romeo 12C', 'qid': 'Q2306794', 'description': {'english': 'motor vehicle'}, 'aliases': {}, 'type': ['Car model']}], []], 'n_ents': 4, 'n_mapped_ent': 4, 'relations': [], 'n_rel_pair': 0, 'n_rel': 0}


In [32]:
with open("/harddisk/user/keminglu/evaluation_corpus/wiki_eval_output_with_evaluation.txt", "w") as f:
    for output in outputs:
        report = evaluator(output)
        output.update({"evaluation": report})
        f.write(json.dumps(output) + "\n")

## Push evaluation results into DB

In [34]:
from pymongo import MongoClient

In [35]:
mongodb_config = {"host": '10.12.192.31', "port": 27017}
client = MongoClient(**mongodb_config)
evaluation_collection = client['structure_lm']['evaluation']

In [36]:
with open("/harddisk/user/keminglu/evaluation_corpus/wiki_eval_output_with_evaluation.txt") as f:
    data = [json.loads(line) for line in f.readlines()]
for i in range(len(data)):
    data[i]["output"][0] = json.loads(data[i]["output"][0])
    data[i].update({"source": "wiki"})

In [37]:
evaluation_collection.insert_many(data, ordered=True)

<pymongo.results.InsertManyResult at 0x7f3b45ee9fa0>