In [59]:
import pandas as pd
import json
import numpy as np
import spacy
from spacy.training.example import Example #holds information for one training instance
import random
from tqdm import tqdm
from spacy.scorer import Scorer
import warnings
warnings.filterwarnings('ignore')

In [60]:
# Authenticate
from google.colab import drive
drive.mount('/content/drive')
from google.colab import auth
auth.authenticate_user()

# Get email of current Colab user
import requests
gcloud_token = !gcloud auth print-access-token
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
email = gcloud_tokeninfo['email']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [61]:
# Define filepath: each known user keeps the annotation export at a different Drive location
if email == 'tamaraexterkate93@gmail.com':
  filename = "/content/drive/MyDrive/TUe/TM/Exports/export_41675_project-41675-at-2023-10-04-09-37-9bbbec63.json"
elif email == 'n.v.diermen@student.tue.nl':
  filename = "/content/drive/MyDrive/Text Mining/export_41675_project-41675-at-2023-10-04-12-08-05f5e3f5.json"
else:
  # Stop here instead of hitting a NameError (or silently reusing a `filename`
  # left over from an earlier kernel run) when the file is opened.
  raise ValueError(f"No export path configured for Colab user '{email}'")

In [62]:
# Articles with exactly this many annotations are flagged as benchmark articles (N=5)
BENCHMARK_ANNOTATION_COUNT = 5


def _parse_article(article):
  """Convert one exported article into the dictionary layout used in this notebook.

  Only annotations whose ``ground_truth`` flag is true contribute entities and
  relations. Result items of any type other than 'labels' or 'relation' are
  reported on stdout and skipped.

  Args:
    article: one element of the decoded export; read via its 'data' and
      'annotations' keys.

  Returns:
    dict with keys 'text', 'entities' (list of ``(start, end, label)``),
    'relations' (list of ``(from_id, to_id, relation)``) and 'benchmark' (bool).
  """
  annotations = article.get('annotations')
  entities = []
  relations = []
  for annotation in annotations:
    if annotation.get('ground_truth') == True: # only include ground truth
      for result in annotation.get('result'):
        result_type = result.get('type')
        if result_type == 'labels':
          value = result.get('value')
          # note: cannot deal with multiple labels, only the first one is kept
          entities.append((value.get('start'), value.get('end'), value.get('labels')[0]))
        elif result_type == 'relation':
          # note: cannot deal with multiple relations, only the first one is kept
          relations.append((result.get('from_id'), result.get('to_id'), result.get('labels')[0]))
        else:
          print("found unknown label type (no label or relation)")

  return {
    'text': article.get('data').get('text'),
    'entities': entities,
    'relations': relations,
    'benchmark': len(annotations) == BENCHMARK_ANNOTATION_COUNT,
  }


# read json file; the handle is only needed while decoding, so it is closed before parsing
with open(filename, 'rb') as fp:
  training_data = json.load(fp)

# get text, labels, relations, benchmark for each article
DATA = [_parse_article(article) for article in training_data]

In [63]:
# Remove entities with smallest span in case of overlapping entities
def remove_overlap(entities):
  """Resolve overlapping entities in favour of the longest span.

  Spans are treated as half-open ``[start, end)``: an entity that starts exactly
  where another one ends does not overlap it. When two overlapping entities have
  the same length, the one that starts first is kept.

  Args:
    entities: list of ``(start, end, label)`` tuples. The list is not modified.

  Returns:
    A new list of mutually non-overlapping entities, ordered by start position.
  """
  # Longest spans get to claim their place first; ties go to the earlier start.
  # sorted() builds a new list, so the caller's list keeps its order.
  by_length = sorted(entities, key=lambda entity: (entity[0] - entity[1], entity[0]))

  final_entities = []
  for entity in by_length:
    # Two half-open spans are disjoint iff one ends before (or where) the other starts
    if all(entity[1] <= kept[0] or entity[0] >= kept[1] for kept in final_entities):
      final_entities.append(entity)

  # Hand the survivors back in document order
  final_entities.sort(key=lambda entity: entity[0])
  return final_entities

# Swap every article's entity list for the result of remove_overlap
for article in DATA:
  article.update(entities=remove_overlap(article['entities']))

In [64]:
# TODO: deal with overlapping entities
# NOTE(review): the previous cell already runs remove_overlap over DATA, so this
# TODO may be stale — confirm and remove.
# Display the second parsed article as a sanity check
DATA[1]

{'text': 'Max Karl Ernst Ludwig Planck (23 April 1858 – 4 October 1947) was a German theoretical physicist whose discovery of energy quanta won him the Nobel Prize in Physics in 1918.\nPlanck made many substantial contributions to theoretical physics, but his fame as a physicist rests primarily on his role as the originator of quantum theory, which revolutionized human understanding of atomic and subatomic processes. In 1948, the German scientific institution Kaiser Wilhelm Society (of which Planck was twice president) was renamed Max Planck Society (MPG). The MPG now includes 83 institutions representing a wide range of scientific directions.\nLife and career.\nPlanck came from a traditional, intellectual family. His paternal great-grandfather and grandfather were both theology professors in Göttingen; his father was a law professor at the University of Kiel and Munich. One of his uncles was also a judge.',
 'entities': [(0, 28, 'Winner'),
  (30, 43, 'Date'),
  (46, 60, 'Date'),
  (68

In [65]:
# Start from a saved pipeline when `model` names one, otherwise from an empty English pipeline
model = None
if model is None:
  nlp = spacy.blank('en')
  print("Created blank 'en' model")
else:
  nlp = spacy.load(model)
  print(f"Loaded model '{model}'")

# set up the pipeline: reuse the NER component if there is one, otherwise add it
ner = nlp.get_pipe('ner') if 'ner' in nlp.pipe_names else nlp.add_pipe('ner')

Created blank 'en' model


In [66]:
def train_test_split(data,test_size):
  """Split ``data`` by position into a leading training part and a trailing test part.

  No shuffling is done: the first ``floor(len(data) * (1 - test_size))`` items
  form the training part and the remainder the test part.

  Args:
    data: any sliceable sequence.
    test_size: fraction of ``data`` to hold out, between 0 and 1 inclusive.

  Returns:
    Tuple ``(train, test)`` of slices of ``data``.

  Raises:
    ValueError: if ``test_size`` is outside [0, 1].
  """
  if not 0 <= test_size <= 1:
    raise ValueError(f"test_size must be between 0 and 1, got {test_size}")
  # 1 - test_size is often inexact in binary floating point (1 - 0.9 is slightly
  # below 0.1), so strip that noise before truncating; otherwise 10 items at
  # test_size=0.9 would yield 0 training items instead of 1.
  train_end = int(round(len(data) * (1 - test_size), 9))
  return data[:train_end], data[train_end:]

In [67]:
# Hold out the second half of DATA for evaluation. The split is positional (no
# shuffling), so which articles land in `test` depends on their order in DATA.
train, test =  train_test_split(DATA,0.5)

In [68]:
def train_ner_model(train_data, n_iter=1):
  """Train the module-level ``ner`` component of ``nlp`` on annotated articles.

  Args:
    train_data: list of dicts, each with a 'text' string and an 'entities' list
      of ``(start, end, label)`` tuples. The list itself is left in its
      original order.
    n_iter: number of passes over ``train_data``.

  Returns:
    The module-level ``nlp`` pipeline, which is updated in place.
  """
  # Register every label that occurs in the training data with the NER component
  for article in train_data:
    for ent in article['entities']:
      ner.add_label(ent[2])

  # Shuffle a private copy so the caller's list is not reordered as a side effect
  shuffled = list(train_data)

  # Only the NER component is trained; select_pipes replaces the deprecated disable_pipes
  with nlp.select_pipes(enable=['ner']):
    # initialize() replaces the deprecated begin_training() and returns the optimizer
    optimizer = nlp.initialize()
    for itn in range(n_iter):
      random.shuffle(shuffled)
      losses = {}  # accumulated over this pass and printed once per pass
      for article in tqdm(shuffled):
        doc = nlp.make_doc(article['text'])
        example = Example.from_dict(doc, {'entities': article['entities']})
        nlp.update([example], sgd=optimizer, losses=losses, drop=0.2)
      print(losses)

  return nlp

nlp_model = train_ner_model(train, n_iter=50)

100%|██████████| 20/20 [00:03<00:00,  6.41it/s]


{'ner': 2685.040282539646}


100%|██████████| 20/20 [00:02<00:00,  6.92it/s]


{'ner': 973.4237232905207}


100%|██████████| 20/20 [00:01<00:00, 10.07it/s]


{'ner': 783.5939331435125}


100%|██████████| 20/20 [00:02<00:00,  9.30it/s]


{'ner': 424.76798531107625}


100%|██████████| 20/20 [00:02<00:00,  9.55it/s]


{'ner': 405.30327285028955}


100%|██████████| 20/20 [00:02<00:00,  9.73it/s]


{'ner': 333.2957408739247}


100%|██████████| 20/20 [00:02<00:00,  8.31it/s]


{'ner': 377.5617239390766}


100%|██████████| 20/20 [00:03<00:00,  6.15it/s]


{'ner': 160.66694000371288}


100%|██████████| 20/20 [00:02<00:00,  8.29it/s]


{'ner': 266.9853231458209}


100%|██████████| 20/20 [00:02<00:00,  9.97it/s]


{'ner': 190.8130326186776}


100%|██████████| 20/20 [00:02<00:00,  9.71it/s]


{'ner': 194.08020921531661}


100%|██████████| 20/20 [00:02<00:00,  9.54it/s]


{'ner': 110.93850665638664}


100%|██████████| 20/20 [00:02<00:00,  9.82it/s]


{'ner': 85.89829778587638}


100%|██████████| 20/20 [00:03<00:00,  6.26it/s]


{'ner': 102.57038124515704}


100%|██████████| 20/20 [00:02<00:00,  7.05it/s]


{'ner': 81.03081482992502}


100%|██████████| 20/20 [00:02<00:00,  9.17it/s]


{'ner': 52.90190515596842}


100%|██████████| 20/20 [00:02<00:00,  9.21it/s]


{'ner': 69.02997573406431}


100%|██████████| 20/20 [00:02<00:00,  9.86it/s]


{'ner': 54.56632618911274}


100%|██████████| 20/20 [00:02<00:00,  8.95it/s]


{'ner': 48.77621714795192}


100%|██████████| 20/20 [00:03<00:00,  6.63it/s]


{'ner': 37.3993439100205}


100%|██████████| 20/20 [00:03<00:00,  6.18it/s]


{'ner': 32.43742185030183}


100%|██████████| 20/20 [00:02<00:00,  9.27it/s]


{'ner': 25.402170302287452}


100%|██████████| 20/20 [00:02<00:00,  9.69it/s]


{'ner': 20.022595856405275}


100%|██████████| 20/20 [00:02<00:00,  9.16it/s]


{'ner': 31.89085794361296}


100%|██████████| 20/20 [00:02<00:00,  9.24it/s]


{'ner': 17.082704600340694}


100%|██████████| 20/20 [00:03<00:00,  6.16it/s]


{'ner': 23.883103405610942}


100%|██████████| 20/20 [00:02<00:00,  7.10it/s]


{'ner': 32.21812297071684}


100%|██████████| 20/20 [00:02<00:00,  9.68it/s]


{'ner': 17.014296664122938}


100%|██████████| 20/20 [00:02<00:00,  9.60it/s]


{'ner': 21.38583337385714}


100%|██████████| 20/20 [00:02<00:00,  9.88it/s]


{'ner': 14.825050250717181}


100%|██████████| 20/20 [00:02<00:00,  9.83it/s]


{'ner': 22.75092482521414}


100%|██████████| 20/20 [00:02<00:00,  7.47it/s]


{'ner': 17.852218625719406}


100%|██████████| 20/20 [00:03<00:00,  6.44it/s]


{'ner': 18.557612130495514}


100%|██████████| 20/20 [00:02<00:00,  9.01it/s]


{'ner': 10.24742319331877}


100%|██████████| 20/20 [00:02<00:00,  9.11it/s]


{'ner': 11.251659151188095}


100%|██████████| 20/20 [00:02<00:00,  8.68it/s]


{'ner': 6.066713264514894}


100%|██████████| 20/20 [00:02<00:00,  8.93it/s]


{'ner': 15.563904144341576}


100%|██████████| 20/20 [00:02<00:00,  7.12it/s]


{'ner': 5.960341316809275}


100%|██████████| 20/20 [00:03<00:00,  6.06it/s]


{'ner': 13.356872176994905}


100%|██████████| 20/20 [00:02<00:00,  9.82it/s]


{'ner': 11.179845771651479}


100%|██████████| 20/20 [00:02<00:00,  9.02it/s]


{'ner': 18.71956198259903}


100%|██████████| 20/20 [00:02<00:00,  8.99it/s]


{'ner': 27.3269049162576}


100%|██████████| 20/20 [00:01<00:00, 10.08it/s]


{'ner': 7.206189941240821}


100%|██████████| 20/20 [00:02<00:00,  6.96it/s]


{'ner': 7.19402338415167}


100%|██████████| 20/20 [00:03<00:00,  6.50it/s]


{'ner': 20.145863039022633}


100%|██████████| 20/20 [00:02<00:00,  9.16it/s]


{'ner': 7.536869504096421}


100%|██████████| 20/20 [00:02<00:00,  9.86it/s]


{'ner': 7.683405850240364}


100%|██████████| 20/20 [00:02<00:00,  9.24it/s]


{'ner': 6.427298835525169}


100%|██████████| 20/20 [00:02<00:00,  9.81it/s]


{'ner': 11.005749802328081}


100%|██████████| 20/20 [00:02<00:00,  8.27it/s]

{'ner': 6.880598471882337}





In [69]:
# Run the trained pipeline over each held-out article and list the (text, label) pairs it found
for article in test:
  doc = nlp_model(article['text'])
  found = []
  for ent in doc.ents:
    found.append((ent.text, ent.label_))
  print('Entities', found)


Entities [('12, 2020', 'Date'), ('German', 'Nationality'), ('American', 'Nationality'), ('work with neutrinos, the subatomic particles considered to be elementary constituents of matter.', 'Reason'), ('He', 'Winner'), ('1988', 'Date'), ('Nobel Prize in Physics', 'Prizetype'), ('the discovery of the muon neutrino.', 'Reason'), ('Through', 'Winner'), ('he', 'Winner'), ('1950', 'Date'), ('1968', 'Date'), ('1968', 'Date'), ('1986', 'Date'), ('He', 'Winner'), ('the United States', 'Nationality'), ('1988', 'Date'), ('1990', 'Date'), ('Steinberger', 'Winner'), ('him', 'Winner'), ('Steinberger', 'Winner'), ('the United States', 'Nationality'), ('the U.S.', 'Nationality'), ('him', 'Winner'), ('Steinberger', 'Winner'), ('He', 'Winner'), ('1938', 'Date')]
Entities [('Kullmann Five', 'Winner'), ('13 April 1951', 'Date'), ('19 February 2017', 'Date'), ('1997', 'Date'), ('1994', 'Date'), ('1997', 'Date'), ('2003', 'Date'), ('Nobel Foundation\xa0', 'Prizetype'), ('2009', 'Date'), ('Nobel Peace Prize'

In [70]:
def evaluate(nlp, test):
  """Score a pipeline's predictions against the annotated entities.

  Args:
    nlp: callable pipeline that turns a text into a processed document.
    test: iterable of dicts with a 'text' string and an 'entities' list.

  Returns:
    The dictionary produced by ``Scorer.score`` for all articles in ``test``.
  """
  # Each Example pairs the pipeline's prediction with the reference entities
  examples = [
    Example.from_dict(nlp(article['text']), {'entities': article['entities']})
    for article in test
  ]
  return Scorer().score(examples)

results = evaluate(nlp_model,test)
print(results)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.6658097686375322, 'ents_r': 0.6540404040404041, 'ents_f': 0.6598726114649682, 'ents_per_type': {'Date': {'p': 0.831081081081081, 'r': 0.924812030075188, 'f': 0.8754448398576512}, 'Nationality': {'p': 0.4931506849315068, 'r': 0.42857142857142855, 'f': 0.45859872611464964}, 'Reason': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'Winner': {'p': 0.6183206106870229, 'r': 0.54, 'f': 0.5765124555160143}, 'Prizetype': {'p': 0.8636363636363636, 'r': 0.95, 'f': 0.9047619047619048}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, '

In [71]:
# Render the predictions for the first held-out article, one highlight colour per label
doc = nlp_model(test[0]['text'])
colors = {
  'Winner': '#F67DE3',
  'Date': '#7DF6D9',
  'Prizetype': '#DC143C',
  'Reason': '#00FF00',
  # Must be a 6-digit hex value; the previous '#E287433' had 7 digits and is not a valid colour
  'Nationality': '#E28743',
}
options = {'colors':colors}
spacy.displacy.render(doc,style='ent', options=options, jupyter=True)