In [8]:
## multilingual

import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from nltk.tokenize import sent_tokenize
import json

In [22]:
with open("../data/sentences.csv", "r", encoding="utf-8") as f:
    data = list(csv.DictReader(f=f, delimiter=","))

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
tagger = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp = pipeline("ner", model=tagger, tokenizer=tokenizer, aggregation_strategy="simple")

In [7]:
text = data[0]["sentence"]
sentences = sent_tokenize(text)

for sentence in sentences:
    entities = nlp(sentence)
entities

[{'entity_group': 'LOC',
  'score': 0.9993399,
  'word': 'Florence',
  'start': 3,
  'end': 11},
 {'entity_group': 'PER',
  'score': 0.59895015,
  'word': 'Frate',
  'start': 30,
  'end': 35},
 {'entity_group': 'PER',
  'score': 0.9934189,
  'word': 'Michelagnolo',
  'start': 224,
  'end': 236},
 {'entity_group': 'PER',
  'score': 0.9826374,
  'word': 'Fra Giovanni Agnolo',
  'start': 304,
  'end': 323}]

In [24]:
output = []

pbar = tqdm(total=len(data))
for sample in data:
    doc_id = sample["id"]
    text = sample["sentence"]
    sentences = [s for s in sent_tokenize(text)]
    curr_pos = 0
    for sentence_id, sentence in enumerate(sentences):
        if sentence_id>0:
            curr_pos = curr_pos + len(sentences[sentence_id-1])+1
        ner = nlp(sentence)
        for ent in ner:
            if len(output)>0:
                if output[-1]["sent_end_pos"]==ent["start"]:
                    output[-1]={
                        "doc_id":doc_id,
                        "doc_start_pos":output[-1]["doc_start_pos"],
                        "doc_end_pos":curr_pos+ent["end"],
                        "sentence_id":sentence_id,
                        "sent_start_pos":output[-1]["sent_start_pos"],
                        "sent_end_pos":ent["end"],
                        "surface":output[-1]["surface"]+ent["word"][2:],
                        "type":ent["entity_group"],
                        "score":ent["score"]
                    }
                else:
                    output.append({
                        "doc_id":doc_id,
                        "doc_start_pos":curr_pos+ent["start"],
                        "doc_end_pos":curr_pos+ent["end"],
                        "sentence_id":sentence_id,
                        "sent_start_pos":ent["start"],
                        "sent_end_pos":ent["end"],
                        "surface":ent["word"],
                        "type":ent["entity_group"],
                        "score":ent["score"]
                    })
            else:
                output.append({
                    "doc_id":doc_id,
                    "doc_start_pos":ent["start"],
                    "doc_end_pos":ent["end"],
                    "sentence_id":sentence_id,
                    "sent_start_pos":ent["start"],
                    "sent_end_pos":ent["end"],
                    "surface":ent["word"],
                    "type":ent["entity_group"],
                    "score":ent["score"]
                })
    pbar.update(1)
pbar.close()

keys = output[0].keys()
a_file = open("../results/wikineural/output.csv", "w", encoding="utf-8")
dict_writer = csv.DictWriter(a_file, keys)
dict_writer.writeheader()
dict_writer.writerows(output)
a_file.close()
    

 62%|██████████████████████████████████████████████████▋                               | 34/55 [01:05<00:40,  1.92s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [01:36<00:00,  1.75s/it]


## this is only for DECENT

In [14]:
output = []

pbar = tqdm(total=len(data))
for sample in data:
    doc_id = sample["id"]
    text = sample["sentence"]
    sentences = [s for s in sent_tokenize(text)]
    curr_pos = 0
    for sentence_id, sentence in enumerate(sentences):
        if sentence_id>0:
            curr_pos = curr_pos + len(sentences[sentence_id-1])+1
        ner = nlp(sentence)
        for ent in ner:
            if len(output)>0:
                if output[-1]["end_pos"]==curr_pos+ent["start"]:
                    output[-1]={
                        "y_category":[ent["entity_group"]],
                        "ex_id":doc_id,
                        "start_pos":output[-1]["start_pos"],
                        "end_pos":curr_pos+ent["end"],
                        "word":output[-1]["word"]+ent["word"][2:],
                        "left_context_text":output[-1]["left_context_text"],
                        "right_context_text":sentence[ent["end"]:]
                    }
                else:
                    output.append({
                        "y_category":[ent["entity_group"]],
                        "ex_id":doc_id,
                        "start_pos":curr_pos+ent["start"],
                        "end_pos":curr_pos+ent["end"],
                        "word":ent["word"],
                        "left_context_text":sentence[:ent["start"]],
                        "right_context_text":sentence[ent["end"]:]
                    })
            else:
                output.append({
                    "y_category":[ent["entity_group"]],
                    "ex_id":doc_id,
                    "start_pos":ent["start"],
                    "end_pos":ent["end"],
                    "word":ent["word"],
                    "left_context_text":sentence[:ent["start"]],
                    "right_context_text":sentence[ent["end"]:]
                })
    pbar.update(1)
pbar.close()

with open("../results/wikineural/output.json", 'w', encoding="utf-8") as f:
    for d in output:
        json.dump(d, f)
        f.write('\n')
    

100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [01:31<00:00,  1.67s/it]


In [25]:
import json

data = []
with open("../results/wikineural_ufet/123-output-best_ufet-t_9.json", "r", encoding="utf-8") as f:
    data = json.load(f)
        
print(len(output))
print(len(data))
          
output_new = []
for wiki_res, ufet_res in zip(output, data):
          output_new.append(
          {
            "doc_id":wiki_res["doc_id"],
            "doc_start_pos":wiki_res["doc_start_pos"],
            "doc_end_pos":wiki_res["doc_end_pos"],
            "sent_start_pos":wiki_res["sent_start_pos"],
            "sent_end_pos":wiki_res["sent_end_pos"],
            "surface":wiki_res["surface"],
            "type":wiki_res["type"],
            "labels":ufet_res["pred"]
          }
          )

1198
1198


In [28]:
output_new[0]

{'doc_id': '0',
 'doc_start_pos': 14,
 'doc_end_pos': 21,
 'sent_start_pos': 14,
 'sent_end_pos': 21,
 'surface': 'Clement',
 'type': 'PER',
 'labels': ['person',
  'politician',
  'religious leader',
  'adult',
  'bishop',
  'cleric',
  'emperor',
  'father',
  'leader',
  'male',
  'man',
  'master',
  'official',
  'pope',
  'president',
  'ruler',
  'spiritual leader',
  'father figure',
  'world leader',
  'clergyman']}

In [56]:
import json


with open('../results/ufet/art_types.txt') as f:
    data_txt = f.read()
    types = data_txt.split("\n")

In [57]:
print(types)

['art', 'art collection', 'artwork', 'art form', 'cartoon', 'canvas', 'concept art', 'creation', 'expression', 'design', 'drawing', 'fine art', 'figure', 'illustration', 'image', 'life story', 'model', 'modern art', 'monument', 'oil painting', 'paint', 'painting', 'performance art', 'perspective', 'picture', 'portrait', 'representation', 'scene', 'sculpture', 'statue', 'symbol', 'traditional art', 'trinity', 'visual art', 'art glass', 'madonna', 'oil paint', 'artifact', 'action painting', 'painted lady', 'museum piece']


In [58]:
works_t_1 = []
works_t_2 = []
works_t_3 = []
works_t_4 = []
works_t_5 = []

for row in output_new:
    if len(set(types).intersection(set(row["labels"])))==1:
            works_t_1.append(row)
    elif len(set(types).intersection(set(row["labels"])))==2:
            works_t_1.append(row)
            works_t_2.append(row)
    elif len(set(types).intersection(set(row["labels"])))==3:
            works_t_1.append(row)
            works_t_2.append(row)
            works_t_3.append(row)
    elif len(set(types).intersection(set(row["labels"])))==4:
            works_t_1.append(row)
            works_t_2.append(row)
            works_t_3.append(row)
            works_t_4.append(row)
    elif len(set(types).intersection(set(row["labels"])))==5:
            works_t_1.append(row)
            works_t_2.append(row)
            works_t_3.append(row)
            works_t_4.append(row)
            works_t_5.append(row)
             
           

In [64]:
keys = works_t_5[0].keys()
a_file = open("../results/wikineural_ufet_t5/output.csv", "w", encoding="utf-8")
dict_writer = csv.DictWriter(a_file, keys)
dict_writer.writeheader()
dict_writer.writerows(works_t_5)
a_file.close()