<a href="https://colab.research.google.com/github/IvanDePivan/2AMM30-groep-2-component-1/blob/main/Model_RE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import pandas as pd
import json
import numpy as np
import spacy
from spacy.training.example import Example #holds information for one training instance
import random
from tqdm import tqdm
from spacy.scorer import Scorer
import warnings
warnings.filterwarnings('ignore')
import os
import glob
import re
! pip install -U accelerate
! pip install -U transformers
! pip install evaluate



In [35]:
# Authenticate: mount Google Drive (the export paths used below live under
# /content/drive) and sign the Colab user in to Google Cloud.
from google.colab import drive
drive.mount('/content/drive')
from google.colab import auth
auth.authenticate_user()

# Get email of current Colab user; the next cell uses it to pick a per-user file path.
import requests
# `!cmd` is IPython shell capture (not plain Python): the result is indexable,
# and element 0 is used as the access token below.
gcloud_token = !gcloud auth print-access-token
# Look up the token's metadata; the JSON response is expected to carry an 'email' key.
# NOTE(review): the access token travels in the URL query string — confirm this is acceptable.
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
email = gcloud_tokeninfo['email']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Define filepath: every collaborator keeps the annotation export at a different
# location on their own Drive, so the path is chosen from the signed-in account.
if email == 'tamaraexterkate93@gmail.com':
  filename = "/content/drive/MyDrive/TUe/TM/Exports/export_41675_project-41675-at-2023-10-04-09-37-9bbbec63.json"
elif email == 'n.v.diermen@student.tue.nl':
  filename = "/content/drive/MyDrive/Text Mining/export_41675_project-41675-at-2023-10-04-12-08-05f5e3f5.json"
else:
  # Without this branch `filename` would stay undefined (or keep a stale value
  # from an earlier run) and the failure would only surface in a later cell.
  raise ValueError(
      f"No export path configured for Colab user {email!r}; "
      "add a branch with the location of the annotation export."
  )

In [37]:
DATA = []

# read json file; the handle is only needed while parsing, so close it right after
with open(filename, 'rb') as fp:
  training_data = json.load(fp)

# get text, labels, relations, benchmark for each article
for article in training_data:
  entities = []      # (start, end, label) triples
  id_entities = []   # (start, end, label, id, text): same spans plus annotation id and surface text
  relations = []     # (from_id, to_id, relation) triples referring to the ids above
  original_text = article.get('data').get('text')
  annotations = article.get('annotations')
  # an article with exactly 5 annotations is flagged as a benchmark article (N=5)
  benchmark = len(annotations) == 5

  for annotation in annotations:
    if annotation.get('ground_truth') == True: # only include ground truth
      for result in annotation.get('result'):
        result_type = result.get('type')
        if result_type == 'labels':
          value = result.get('value')
          start = value.get('start')
          end = value.get('end')
          # named entity_id so the builtin `id` is not shadowed for the rest of the notebook
          entity_id = result.get('id')
          entity_text = value.get('text')
          entity_label = value.get('labels')[0] # note: cannot deal with multiple labels
          id_entities.append((start, end, entity_label, entity_id, entity_text))
          entities.append((start, end, entity_label))
        elif result_type == 'relation':
          from_id = result.get('from_id')
          to_id = result.get('to_id')
          relation = result.get('labels')[0] # note: cannot deal with multiple relations
          relations.append((from_id, to_id, relation))
        else:
          print("found unknown label type (no label or relation)")

  # append article to training data as dictionary
  DATA.append({
      'text': original_text,
      'entities': entities,
      'id_entities': id_entities,
      'relations': relations,
      'benchmark': benchmark
  })

In [38]:
# Remove entities with smallest span in case of overlapping entities
def remove_overlap(entities):
  """Return a non-overlapping subset of `entities`, keeping the longest spans.

  When two spans overlap, the shorter one is dropped; if they are equally
  long, the one that starts first wins. Spans that merely touch (one ends
  exactly where the next starts) are not treated as overlapping.

  Args:
    entities: list of tuples whose first two items are the (start, end)
      offsets of a span; any further items (e.g. the label) are carried along.

  Returns:
    A new list of the surviving tuples, sorted by start offset. The input
    list is left untouched.
  """
  # Visit the longest spans first so that a short span can never evict a
  # longer one; ties are broken by the earlier start position.
  candidates = sorted(entities, key=lambda entity: (-(entity[1] - entity[0]), entity[0]))

  final_entities = []
  for entity in candidates:
    # Two spans are disjoint when one ends before (or where) the other begins
    is_free = all(
        entity[1] <= kept[0] or entity[0] >= kept[1]
        for kept in final_entities
    )
    if is_free:
      final_entities.append(entity)

  # Hand the result back in reading order
  final_entities.sort(key=lambda entity: entity[0])
  return final_entities

# Replace every article's entity list with its overlap-free version
for record in DATA:
  record.update(entities=remove_overlap(record['entities']))

In [39]:
def train_test_split(data, test_size):
  """Cut `data` into a leading train part and a trailing test part.

  No shuffling takes place: the first int(len(data) * (1 - test_size)) items
  form the train part and everything after them forms the test part.

  Args:
    data: sliceable sequence of examples.
    test_size: fraction of the data to put in the test part.

  Returns:
    Tuple (train_part, test_part).
  """
  cut = int(len(data) * (1 - test_size))
  train_part = data[:cut]
  test_part = data[cut:]
  return train_part, test_part

In [40]:
# Unshuffled 50/50 split: first half of DATA for training, second half for testing
train, test = train_test_split(DATA, test_size=0.5)

In [41]:
def insert_markers(text, pair):
  """Wrap each entity of `pair` in its type marker and return the new text.

  An entity spanning text[start:end] with label 'Date' is rewritten as
  '[DATE] <entity text> [DATE]'. All insert positions are taken from the
  offsets in the ORIGINAL text, so the result is well defined even when the
  spans are nested, overlapping, adjacent, or not given in reading order.

  Args:
    text: the original article text.
    pair: iterable of entity tuples; item 0 is the start offset, item 1 the
      end offset and item 2 the label. Further items are ignored.

  Returns:
    A copy of `text` with an opening and a closing marker around every entity.

  Raises:
    KeyError: if an entity carries a label without a marker.
  """
  label_to_marker = {
      'Winner': '[WINR]',
      'Date': '[DATE]',
      'Prizetype': '[PRZT]',
      'Reason': '[REAS]',
      'Nationality': '[NTLY]'
  }

  # (offset in the original text, string to insert), listed in pair order
  insertions = []
  for ent in pair:
    start, end, label = ent[0], ent[1], ent[2]
    marker = label_to_marker[label]
    insertions.append((start, marker + ' '))
    insertions.append((end, ' ' + marker))

  # The sort is stable: insertions sharing an offset keep their pair order, so
  # a span that ends where the next begins closes before the next one opens.
  insertions.sort(key=lambda insertion: insertion[0])

  # Stitch the original text and the markers together in one left-to-right pass
  pieces = []
  cursor = 0
  for offset, marker_text in insertions:
    pieces.append(text[cursor:offset])
    pieces.append(marker_text)
    cursor = offset
  pieces.append(text[cursor:])

  return ''.join(pieces)

In [42]:
def get_relation(pair, relations):
  """Look up the annotated relation between the two entities of `pair`.

  Args:
    pair: two entity tuples whose item 3 is the annotation id (falsy, e.g.
      None, when the entity has no counterpart in the annotations).
    relations: iterable of (from_id, to_id, relation) tuples.

  Returns:
    The relation label of the first entry linking both ids, in either
    direction; 'no_relation' if there is none or if an id is missing.
  """
  first_id = pair[0][3]
  if not first_id:
    return 'no_relation'
  second_id = pair[1][3]
  if not second_id:
    return 'no_relation'

  # direction is ignored: a relation counts whichever entity it starts from
  forward = (first_id, second_id)
  backward = (second_id, first_id)
  for candidate in relations:
    endpoints = (candidate[0], candidate[1])
    if endpoints == forward or endpoints == backward:
      return candidate[2]
  return 'no_relation'

In [43]:
def mark_data(data, NER_output=True):
  """Build marked texts and relation labels for every (winner, other) entity pair.

  For each article, every 'Winner' entity is paired with every non-winner
  entity. Each pair yields one copy of the article text with markers around
  both entities (see insert_markers) and one relation label (see get_relation).

  Args:
    data: list of article dicts with keys 'text', 'id_entities' and 'relations'.
    NER_output: if True, entities come from running `best_nlp` on the text and
      are matched to annotated entities by identical character offsets; if
      False, the annotated 'id_entities' are used directly.

  Returns:
    Tuple (data_output, relations_output) of equally long lists: the marked
    texts and the relation label belonging to each of them.
  """
  # init data output and relations list
  data_output = []
  relations_output = []

  # Iterate over each article in data
  for article in data:
    text = article['text']

    # get separate lists of winner entities and other entities in fixed format (start,end,label,id)
    if NER_output:
      # NOTE(review): `best_nlp` is not defined in this part of the notebook;
      # presumably a trained spaCy NER pipeline — confirm it is loaded first.
      doc = best_nlp(text)

      winners = []
      others = []
      for ent in doc.ents:
        # Reset for EVERY predicted entity, before searching: otherwise an
        # article without annotated entities would leave ent_id unbound, or
        # carry over the id found for the previous entity.
        ent_id = None
        for original_ent in article['id_entities']:
          # an annotated entity only counts as the same one on an exact span match
          if ent.start_char == original_ent[0] and ent.end_char == original_ent[1]:
            ent_id = original_ent[3]
            break
        candidate = (ent.start_char, ent.end_char, ent.label_, ent_id)
        if ent.label_ == 'Winner':
          winners.append(candidate)
        else:
          others.append(candidate)

    else:
      ents = article['id_entities']
      winners = [ent for ent in ents if ent[2] == 'Winner']
      others = [ent for ent in ents if ent[2] != 'Winner']

    # make pairs for possible relations (winner + other), earlier-starting entity first
    pairs = []
    for winner in winners:
      for other in others:
        if winner[0] < other[0]:
          pairs.append((winner, other))
        else:
          pairs.append((other, winner))

    # create new text with markers for each possible relation with winner
    data_output.extend(insert_markers(text, pair) for pair in pairs)
    relations_output.extend(get_relation(pair, article['relations']) for pair in pairs)

  return data_output, relations_output

In [44]:
# Start from train[1] for now, because train[0] has an odd ordering in its id_entities
X_train, y_train = mark_data(train[1:], NER_output=False)
X_test, y_test = mark_data(test, NER_output=False)

# Preview up to 6 marked examples; slicing keeps this from raising an
# IndexError when fewer than 6 pairs were generated.
for marked_text, relation in zip(X_train[:6], y_train[:6]):
  print(marked_text)
  print(relation + '\n')

id2label = {0: "no_relation", 1: "received_nobelprize_for", 2: "has_won", 3: "received_nobelprize_in", 4: "born_on", 5: "died_on", 6: "has_nationality"}
# Derived from id2label so the two mappings cannot drift apart
label2id = {label: idx for idx, label in id2label.items()}

# Encode the relation names as integer class ids
y_train = [label2id[label] for label in y_train]
y_test = [label2id[label] for label in y_test]

[WINR] Max Karl Ernst Ludwig Planck [WINR] ([DATE] 23 April 1858 [DATE] – 4 October 1947) was a German theoretical physicist whose discovery of energy quanta won him the Nobel Prize in Physics in 1918.
Planck made many substantial contributions to theoretical physics, but his fame as a physicist rests primarily on his role as the originator of quantum theory, which revolutionized human understanding of atomic and subatomic processes. In 1948, the German scientific institution Kaiser Wilhelm Society (of which Planck was twice president) was renamed Max Planck Society (MPG). The MPG now includes 83 institutions representing a wide range of scientific directions.
Life and career.
Planck came from a traditional, intellectual family. His paternal great-grandfather and grandfather were both theology professors in Göttingen; his father was a law professor at the University of Kiel and Munich. One of his uncles was also a judge.
born_on

[WINR] Max Karl Ernst Ludwig Planck [WINR] (23 April 185

In [45]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the 'bert-base-cased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# add our entity markers as tokens to the vocabulary
# (these are the same marker strings that insert_markers writes into the text)
special_tokens = ["[WINR]", "[DATE]", "[REAS]", "[NTLY]", "[PRZT]"]
num_added_toks = tokenizer.add_tokens(special_tokens)
print(f'We have added {num_added_toks} tokens')

We have added 5 tokens


In [46]:
def preprocess_function(examples):
    """Tokenize the "text" entry of `examples` with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [47]:
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text per item on access.

    Each item is a dict with 'input_ids' and 'attention_mask' (the tokenizer
    output, flattened to 1-D) and 'labels' (the label as a long tensor).
    Texts are truncated / padded to `max_length` by the tokenizer call.
    """

    def __init__(self, input_data, labels, tokenizer, max_length):
        self.input_data, self.labels = input_data, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        sample_text = self.input_data[idx]
        sample_label = self.labels[idx]
        tokenized = self.tokenizer(
            sample_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # the tokenizer output is flattened so every item is a 1-D tensor
        item = {
            field: tokenized[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Hyperparameters (adjust as needed)
# NOTE(review): CustomDataset truncates every text to max_seq_length tokens, so
# markers placed further into an article may be cut off — confirm 128 is enough.
max_seq_length = 128
batch_size = 2

# Wrap the marked texts and their integer labels in torch datasets
train_dataset, test_dataset = (
    CustomDataset(texts, targets, tokenizer, max_seq_length)
    for texts, targets in ((X_train, y_train), (X_test, y_test))
)

# # Create data loaders
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [48]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below; built around the same tokenizer as the datasets.
# NOTE(review): CustomDataset already pads every item to max_length, so extra
# padding here is presumably a no-op — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [49]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() on predictions vs. references
accuracy = evaluate.load("accuracy")

In [50]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the notebook-level accuracy metric.

    Args:
        eval_pred: pair (predictions, labels); the predicted class of each
            row is the argmax of `predictions` along axis 1.

    Returns:
        Whatever `accuracy.compute` returns for the predicted classes
        against `labels`.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [51]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence classifier on top of 'bert-base-cased', one output class per relation label
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Resize the embedding matrix to the tokenizer's vocabulary, which now includes the added marker tokens
model.resize_token_embeddings(len(tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(29001, 768)

In [None]:
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; since the install cell uses an unpinned `pip install -U`,
# confirm this argument name against the installed version.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): eval_dataset is the test split, so the checkpoint kept by
# load_best_model_at_end is selected on test data — consider a separate
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
