<a href="https://colab.research.google.com/github/IvanDePivan/2AMM30-groep-2-component-1/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
import numpy as np
import spacy
from spacy.training.example import Example #holds information for one training instance
import random
from tqdm import tqdm
from spacy.scorer import Scorer
import warnings
warnings.filterwarnings('ignore')
import os
import glob
import re

In [None]:
# Authenticate
# Mount Google Drive under /content/drive and authenticate the current Colab user.
from google.colab import drive
drive.mount('/content/drive')
from google.colab import auth
auth.authenticate_user()

# Get email of current Colab user
# The `!` capture yields the command's output lines; the first line is sent to
# Google's tokeninfo endpoint, whose JSON response contains the account email.
# `email` is used further down to pick the per-user export path.
import requests
gcloud_token = !gcloud auth print-access-token
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
email = gcloud_tokeninfo['email']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Define filepath
# Location of the annotation export in each collaborator's Google Drive, keyed by Colab account email
EXPORT_PATHS = {
  'tamaraexterkate93@gmail.com': "/content/drive/MyDrive/TUe/TM/Exports/export_41675_project-41675-at-2023-10-04-09-37-9bbbec63.json",
  'n.v.diermen@student.tue.nl': "/content/drive/MyDrive/Text Mining/export_41675_project-41675-at-2023-10-04-12-08-05f5e3f5.json",
}

if email not in EXPORT_PATHS:
  # Fail here with a clear message rather than with a NameError on `filename` when the file is opened
  raise ValueError(f"No export path configured for Colab user {email!r}; add an entry to EXPORT_PATHS")
filename = EXPORT_PATHS[email]

In [None]:
def _parse_article(article):
  """Flatten one exported article into the dictionary format used by this notebook.

  Only annotations whose 'ground_truth' field equals True contribute results.
  Every result of type 'labels' becomes a (start, end, label) tuple and every
  result of type 'relation' becomes a (from_id, to_id, relation) tuple; any
  other result type is reported on stdout and skipped.

  Args:
    article: one element of the exported JSON list; read via its 'data' and
      'annotations' keys (the layout looks like a Label Studio export - confirm).

  Returns:
    dict with the keys 'text', 'entities', 'relations' and 'benchmark'.
    'benchmark' is True when the article has exactly five annotations.
  """
  entities = []
  relations = []
  annotations = article.get('annotations')
  for annotation in annotations:
    if annotation.get('ground_truth') != True: # only include ground truth
      continue
    for result in annotation.get('result'):
      result_type = result.get('type')
      if result_type == 'labels':
        value = result.get('value')
        # note: cannot deal with multiple labels, only the first one is kept
        entities.append((value.get('start'), value.get('end'), value.get('labels')[0]))
      elif result_type == 'relation':
        # note: cannot deal with multiple relations, only the first one is kept
        relations.append((result.get('from_id'), result.get('to_id'), result.get('labels')[0]))
      else:
        print("found unknown label type (no label or relation)")

  return {
    'text': article.get('data').get('text'),
    'entities': entities,
    'relations': relations,
    'benchmark': len(annotations) == 5, # N=5
  }

# read json file (the handle is only needed while parsing the JSON itself)
with open(filename, 'rb') as fp:
  training_data = json.load(fp)

# get text, labels, relations, benchmark for each article
DATA = [_parse_article(article) for article in training_data]

In [None]:
# Remove entities with smallest span in case of overlapping entities
def remove_overlap(entities):
  """Return a non-overlapping subset of `entities`, preferring longer spans.

  Spans are considered from longest to shortest and a span is kept only when
  it does not overlap a span that was already kept. Ties in length go to the
  span with the earlier start. Spans that merely touch (one ends exactly where
  the next one starts) do not count as overlapping.

  Args:
    entities: iterable of (start, end, label) tuples. It is not modified.

  Returns:
    A new list with the kept tuples, sorted by start position.
  """
  # Negative length as primary key puts the longest span first; sorted() is stable
  by_length = sorted(entities, key=lambda entity: (entity[0] - entity[1], entity[0]))

  final_entities = []
  for entity in by_length:
    # Two spans are disjoint when one of them ends before the other begins
    if all(entity[0] >= kept[1] or entity[1] <= kept[0] for kept in final_entities):
      final_entities.append(entity)

  final_entities.sort(key=lambda entity: entity[0])
  return final_entities

# Swap each article's entity list for its overlap-free version
for article in DATA:
  article.update(entities=remove_overlap(article['entities']))

In [None]:
# Name or path of an existing pipeline to continue from; None starts from a blank English pipeline
model = None
if model is None:
  nlp = spacy.blank('en')
  print("Created blank 'en' model")
else:
  nlp = spacy.load(model)
  print(f"Loaded model '{model}'")

# set up the pipeline: reuse the NER component when present, otherwise add a new one
ner = nlp.get_pipe('ner') if 'ner' in nlp.pipe_names else nlp.add_pipe('ner')

Created blank 'en' model


In [None]:
def train_test_split(data,test_size):
  """Split `data` sequentially into a leading train part and a trailing test part.

  No shuffling is done: the first items go to the train part, the rest to the
  test part. The train part gets floor(len(data) * (1 - test_size)) items.

  Args:
    data: sliceable sequence (e.g. a list) to split.
    test_size: fraction of the items, between 0 and 1, assigned to the test part.

  Returns:
    Tuple (train, test) of two slices of `data`.

  Raises:
    ValueError: if `test_size` lies outside [0, 1].
  """
  if not 0 <= test_size <= 1:
    raise ValueError(f"test_size must be between 0 and 1, got {test_size}")
  # The small epsilon absorbs float error: 10 * (1 - 0.9) evaluates to
  # 0.9999999999999998, which int() would otherwise truncate to 0
  train_end = int(len(data) * (1 - test_size) + 1e-9)
  return data[:train_end], data[train_end:]

In [None]:
# First half of the articles for training, second half held out for testing
train, test = train_test_split(DATA, test_size=0.5)

In [None]:
def train_ner_model(train_data, n_iter=1):
  """Train the NER component of the module-level `nlp` pipeline.

  Uses the module-level `nlp` and `ner` objects. Calling this (re)initializes
  the pipeline before training.

  Args:
    train_data: list of article dicts with a 'text' string and an 'entities'
      list of (start, end, label) tuples. The list itself is left untouched.
    n_iter: number of passes over `train_data`.

  Returns:
    The module-level `nlp` object, which was updated in place.
  """
  # Register every label that occurs in the training data with the NER component
  for article in train_data:
    for ent in article['entities']:
      ner.add_label(ent[2])

  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
  # Shuffle a private copy so the caller's list keeps its order
  shuffled = list(train_data)

  # select_pipes/initialize are the spaCy v3 names for disable_pipes/begin_training
  with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    for _ in range(n_iter):
      random.shuffle(shuffled)
      losses = {}
      for article in tqdm(shuffled):
        doc = nlp.make_doc(article['text'])
        example = Example.from_dict(doc, {'entities': article['entities']})
        # Pass the optimizer explicitly instead of relying on the pipeline's internal default
        nlp.update([example], sgd=optimizer, losses=losses, drop=0.2)
      print(losses)

  return nlp

# Train for a single pass over the training articles
nlp_model = train_ner_model(train_data=train, n_iter=1)

100%|██████████| 20/20 [00:01<00:00, 11.78it/s]

{'ner': 2992.163251189742}





In [None]:
# Print the (text, label) pairs predicted for every held-out article
for article in test:
  doc = nlp_model(article['text'])
  predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]
  print('Entities', predicted_entities)


Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []


In [None]:
def evaluate(nlp, test):
  """Score the predictions of `nlp` against the annotated entities in `test`.

  Args:
    nlp: pipeline that is called on each article's text to get a predicted doc.
    test: list of article dicts with the keys 'text' and 'entities'.

  Returns:
    The scores returned by `Scorer.score` for all articles.
  """
  # Pair every predicted doc with its reference annotations
  examples = [
    Example.from_dict(nlp(article['text']), {'entities': article['entities']})
    for article in test
  ]
  return Scorer().score(examples)

# Score the trained model on the held-out articles and show every metric
results = evaluate(nlp=nlp_model, test=test)
print(results)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'ents_per_type': {'Date': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'Nationality': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'Winner': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'Reason': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'Prizetype': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}


In [None]:
# Visualise the entities that `nlp_model` predicts for the first test article
doc = nlp_model(test[0]['text'])
# One colour per entity label; 'Nationality' previously had a 7-digit (invalid) hex code
colors = {'Winner': '#F67DE3', 'Date':'#7DF6D9', 'Prizetype': '#DC143C', 'Reason': '#00FF00', 'Nationality': '#E28743' }
options = {'colors':colors}
spacy.displacy.render(doc,style='ent', options=options, jupyter=True)

In [None]:
!pip install -U spacy[transformers]



In [None]:
from spacy.tokens import DocBin

nlp = spacy.blank('en')
db = DocBin()

# Convert every training article into one Doc that carries its annotated entity spans
for article in tqdm(train):
  doc = nlp.make_doc(article['text'])
  ents = []
  for start, end, label in article['entities']:
    # char_span gives None when the offsets cannot be mapped to tokens; such entities are skipped
    span = doc.char_span(start, end, label=label, alignment_mode='contract')
    if span is not None:
      ents.append(span)
  # Set the entities and store the Doc once per article (not once per entity),
  # so no Doc is duplicated and articles without entities are stored as well
  doc.ents = ents
  db.add(doc)

db.to_disk('./train.spacy')

100%|██████████| 20/20 [00:01<00:00, 18.47it/s]


In [None]:
nlp = spacy.blank('en')
db = DocBin()

# Convert every test article into one Doc that carries its annotated entity spans
for article in tqdm(test):
  doc = nlp.make_doc(article['text'])
  ents = []
  for start, end, label in article['entities']:
    # char_span gives None when the offsets cannot be mapped to tokens; such entities are skipped
    span = doc.char_span(start, end, label=label, alignment_mode='contract')
    if span is not None:
      ents.append(span)
  # Set the entities and store the Doc once per article (not once per entity),
  # so no Doc is duplicated and articles without entities are stored as well
  doc.ents = ents
  db.add(doc)

db.to_disk('./test.spacy')

100%|██████████| 20/20 [00:01<00:00, 19.32it/s]


In [None]:
# Copy the spaCy training config from Google Drive into the working directory.
# NOTE(review): this Drive folder matches only one of the per-user export paths
# defined earlier in the notebook - confirm config.cfg is available to every user.
!cp "/content/drive/MyDrive/Text Mining/config.cfg" ./

In [None]:
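# Disabled: `spacy init fill-config` only takes BASE_PATH [OUTPUT_FILE] and rejects an `--n-iter` option.
# !python -m spacy init fill-config base_config.cfg config.cfg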

[33mUsage: [0mpython [1;32m-m[0m spacy init fill-config 
           [OPTIONS] BASE_PATH [OUTPUT_FILE]
[2mTry [0m[2;34m'python [0m[1;2;34m-m[0m[2;34m spacy init fill-config [0m[1;2;34m-[0m[1;2;34m-help[0m[2;34m'[0m[2m for help.[0m
[31m╭─[0m[31m Error [0m[31m─────────────────────────────────────────────────────────────────────[0m[31m─╮[0m
[31m│[0m No such option: [1;36m-[0m[1;36m-n[0m[1;36m-iter[0m                                                     [31m│[0m
[31m╰──────────────────────────────────────────────────────────────────────────────╯[0m


In [None]:
# Train with the spaCy CLI using config.cfg on GPU 0; results are written to ./output.
# NOTE(review): test.spacy is passed as the dev set, so the scores reported during
# training come from the same articles that are used for testing - confirm this is intended.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./test.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------

[31mAborted.[0m


In [None]:
# NOTE(review): spacy is already imported at the top of the notebook and was installed
# with the transformers extra in an earlier cell; this upgrade looks redundant and
# presumably does not affect the module that is already loaded - confirm and consider removing.
!pip install -U spacy
# Load the best checkpoint written by the `spacy train` run into ./output
best_nlp =  spacy.load('./output/model-best')



In [None]:
# Visualise the entities that `best_nlp` predicts for the first test article
doc = best_nlp(test[0]['text'])
# One colour per entity label; 'Nationality' previously had a 7-digit (invalid) hex code
colors = {'Winner': '#F67DE3', 'Date':'#7DF6D9', 'Prizetype': '#DC143C', 'Reason': '#00FF00', 'Nationality': '#E28743' }
options = {'colors':colors}
spacy.displacy.render(doc,style='ent', options=options, jupyter=True)