# libraries

In [None]:
%%capture
! pip install transformers
! pip install hazm
from hazm import *
import transformers
from transformers import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import seaborn as sn
import pandas as pd
from operator import itemgetter
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import json
from copy import deepcopy
import numpy as np
import csv
import requests
import random
import re
import string
import codecs
from shutil import copyfile
random.seed(12345)

# Download & Loading Data

In [6]:
# a function to fetch csv data using url
def fetch_data(url):
  with requests.Session() as s:
    download = s.get(url)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    my_list = list(cr)
    return my_list

In [7]:
# fetching train & test
en_train = fetch_data('https://raw.githubusercontent.com/language-ml/4-token-classification/main/Multilingual-NER/en_train.csv')
en_test = fetch_data('https://raw.githubusercontent.com/language-ml/4-token-classification/main/Multilingual-NER/en_test.csv')
fa_train = fetch_data('https://raw.githubusercontent.com/language-ml/4-token-classification/main/Multilingual-NER/fa_train.csv')
fa_test = fetch_data('https://raw.githubusercontent.com/language-ml/4-token-classification/main/Multilingual-NER/fa_test.csv')

In [None]:
def stack_data(data):
  stacked_data = []
  last_id = 0
  data = data[1:]
  for i, row in enumerate(data):
    if row[1].startswith('# id'):
      if last_id != 0:
        stacked_data.append(data[last_id:i])
        last_id = i
      else:
        last_id = i
  return stacked_data

In [None]:
en_train_stacked = stack_data(en_train)
en_test_stacked = stack_data(en_test)
fa_train_stacked = stack_data(fa_train)
fa_test_stacked = stack_data(fa_test)

In [None]:
def sanitize_data(data, type_):
  sanitized_data = []
  for sample in data:
    temp = []
    for word in sample:
      if type_ == 'train':
        temp.append([word[1].strip(),word[2].strip()])
      elif type_ == 'test':
        temp.append([word[1].strip()])
    sanitized_data.append(temp)
  return sanitized_data

In [None]:
en_train_sanitized = sanitize_data(en_train_stacked, type_='train')
en_test_sanitized = sanitize_data(en_test_stacked, type_='test')
fa_train_sanitized = sanitize_data(fa_train_stacked, type_='train')
fa_test_sanitized = sanitize_data(fa_test_stacked, type_='test')

In [None]:
en_train_sanitized[0]

[['# id ff57d715-80cc-4fb2-9444-901a55ad5dc1\tdomain=train', ''],
 ['it', 'O'],
 ['is', 'O'],
 ['a', 'O'],
 ['series', 'O'],
 ['of', 'O'],
 ['badminton', 'O'],
 ['tournaments', 'O'],
 [',', 'O'],
 ['sanctioned', 'O'],
 ['by', 'O'],
 ['badminton', 'B-GRP'],
 ['world', 'I-GRP'],
 ['federation', 'I-GRP'],
 ['(', 'O'],
 ['bwf', 'O'],
 [')', 'O'],
 ['since', 'O'],
 ['2007', 'O'],
 ['.', 'O']]

In [None]:
fa_train_sanitized[0]

[['# id 72ea36b1-9107-4d0e-bea0-05b4282dd303\tdomain=train', ''],
 ['دانیلیان', 'O'],
 ['،', 'O'],
 ['مکرتچیان', 'O'],
 ['،', 'O'],
 ['هلقاتیان', 'O'],
 ['،', 'O'],
 ['گالویان', 'B-PER']]

In [None]:
all_labels = []
for sample in fa_train_sanitized:
  for word in sample[1:]:
    if word[1] not in all_labels:
      all_labels.append(word[1])

In [None]:
all_labels

['O',
 'B-PER',
 'B-CW',
 'I-CW',
 'B-LOC',
 'B-PROD',
 'I-PROD',
 'B-CORP',
 'I-CORP',
 'I-PER',
 'I-LOC',
 'B-GRP',
 'I-GRP',
 '_ O']

In [None]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

In [None]:
le = LabelEncoder()
le.fit(all_labels)

LabelEncoder()

In [None]:
def train_val_split(data, val_size=.2):
  data_len = len(data)
  order = np.random.permutation(data_len)
  data = np.array(data)[order]
  split_num = int(data_len * (1 - val_size))
  return data[:split_num], data[split_num:]

In [69]:
en_train_ner, en_val_ner = train_val_split(en_train_sanitized)
fa_train_ner, fa_val_ner = train_val_split(fa_train_sanitized)
en_test_ner = en_test_sanitized
fa_test_ner = fa_test_sanitized

  after removing the cwd from sys.path.


In [70]:
def tokenize_data(data, type_='train'):
  new_data = []
  for sample in tqdm(data, position=0, leave=True):
    tokens = []
    attention_mask = []
    labels = []
    token_mapping = []
    for i, word in enumerate(sample):
      if i != 0:
        w = tokenizer(word[0], return_tensors='pt',add_special_tokens=False)
        word_tokens = list(w['input_ids'].view(-1).numpy())
        tokens += word_tokens
        attention_mask += list(w['attention_mask'].view(-1).numpy())
        if type_ == 'test':
          token_mapping.append(len(word_tokens))
        # handeling labels
        # sweden: B-LOC -> swe: B-LOC, den: I-LOC
        if type_ == 'train':
          if word[1].startswith('B') and len(list(w['input_ids'].view(-1).numpy())) > 1:
            labels += [word[1]]
            labels += ['I' + word[1][1:]] * (len(word_tokens) - 1)
          else:
            labels += [word[1]] * len(word_tokens)
      else:
        sample_id = word[0].split(' ')[2].split('\t')[0]
    new_data.append([sample_id, tokens, attention_mask, len(tokens), labels, token_mapping])
  new_data = sorted(new_data, key=itemgetter(2))
  return new_data

In [71]:
# train data tokenization
en_train_ner = tokenize_data(en_train_ner, 'train')
fa_train_ner = tokenize_data(fa_train_ner, 'train')
train_ner = sorted(en_train_ner + fa_train_ner, key=itemgetter(2))
# validation data tokenization
en_val_ner = tokenize_data(en_val_ner, 'train')
fa_val_ner = tokenize_data(fa_val_ner, 'train')
val_ner = sorted(en_val_ner + fa_val_ner, key=itemgetter(2))
# test data tokenization
en_test_ner = tokenize_data(en_test_ner, 'test')
fa_test_ner = tokenize_data(fa_test_ner, 'test')
test_ner = sorted(en_test_ner + fa_test_ner, key=itemgetter(2))

100%|██████████| 12238/12238 [00:17<00:00, 714.16it/s]
100%|██████████| 12238/12238 [00:20<00:00, 605.44it/s]
100%|██████████| 3060/3060 [00:04<00:00, 697.70it/s]
100%|██████████| 3060/3060 [00:04<00:00, 613.18it/s]
100%|██████████| 798/798 [00:01<00:00, 711.59it/s]
100%|██████████| 798/798 [00:01<00:00, 627.65it/s]


In [72]:
class DataLoder:

  def __init__(self, dataset, batch_size):
    self.dataset = dataset
    self.batch_size = batch_size
    self.epoch_finished = False
    self.idx = 0

  def make_tensor(self, batch):
    data_ids = []
    input_ids = []
    attention_masks = []
    labels = []
    token_mappings = []
    for data in batch:
      data_ids.append(data[0])
      input_ids.append(list(data[1].numpy()))
      attention_masks.append(list(data[2].numpy()))
      labels.append(data[4])
      token_mappings.append(data[5])
    return [data_ids, torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels).view(-1), token_mappings]

  def get_data(self, idx):
    data_id = self.dataset[idx][0] 
    tokens = torch.tensor([0] + self.dataset[idx][1] + [2])
    attention_mask = torch.tensor(self.dataset[idx][2] + [1] + [1])
    size = self.dataset[idx][3] 
    label = le.transform(self.dataset[idx][4])
    token_mapping = self.dataset[idx][5] 
    return [data_id, tokens, attention_mask, size, label, token_mapping]


  def get_batch(self, idx):
    if idx >= len(self.dataset):
      return True, None, None
    batch_data = []
    first_data = self.get_data(idx)
    first_data_size = first_data[3]
    batch_data.append(first_data)
    for i in range(1, self.batch_size):
      if idx + i == len(self.dataset):
        self.idx += i
        return True, batch_data
      temp = self.get_data(idx + i)
      if temp[3] != first_data_size:
        self.idx += i
        return False, batch_data
      else:
        batch_data.append(temp)
    self.idx += self.batch_size
    return False, batch_data


  def load_data(self):
    while(not self.epoch_finished):
      self.epoch_finished, batch = self.get_batch(self.idx)
      return self.make_tensor(batch)

# Defining Model

In [None]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base", output_hidden_states = True)

In [None]:
class NER(nn.Module):

  def __init__(self,transformer):
    super(NER, self).__init__()
    self.xlm_roberta = transformer
    self.num_class = le.classes_.size
    self.classifier_head = nn.Parameter(torch.randn(768, self.num_class))

  def forward(self, x1, x2):
    # main task
    x = self.xlm_roberta(input_ids=x1, attention_mask=x2)
    token_repr = torch.einsum('ijk,kt->ijt', x.hidden_states[-1][:,1:-1,:].reshape(1, -1, 768), self.classifier_head).reshape(-1,14)
    logit = F.log_softmax(token_repr, dim=1)
    return logit

# device = 'cuda:0'
device = 'cuda:0'
ner = NER(model).to(device)

# Training

In [None]:
def print_evaluation(pred, real):
    f1 = f1_score(pred, real, average='weighted')
    acc = accuracy_score(pred, real)
    recall = recall_score(pred, real, average= 'weighted',zero_division=1)
    precision = precision_score(pred, real, average= 'macro',zero_division=1)
    print(f'f1 score: {f1}')
    print(f'accuracy score: {acc}')
    print(f'recall score: {recall}')
    print(f'precision score: {precision}')


def eval_data():
  val_loader = DataLoder(val_ner, batch_size=10)
  all_preds = []
  lbls = []
  while(True):
    data = val_loader.load_data()
    if data is None:
      break
    input_ids = data[1].to(device)
    attention_mask = data[2].to(device)
    labels = data[3].view(-1).to(device)
    labels = labels.cpu().numpy()
    for i in labels:
      lbls.append(i)
    output = ner(input_ids, attention_mask)
    idx = output[:,None].argmax(-1)
    idx = idx.cpu().numpy()
    for i in idx:
      all_preds.append(i[0])
  
  print_evaluation(all_preds , lbls)

In [None]:
# Transformer Training
epochs = 4
lr = 1e-5
k_step_loss = 0
k = 100
batch_size = 10

loss_fn = nn.CrossEntropyLoss(weight=torch.tensor([1,1,1,1,1,1,1,1,1,1,1,1,.1,1])).to(device)
optimizer = AdamW(model.parameters(), lr=lr)
loss_collection = []

for epoch in range(epochs):
  train_loader = DataLoder(train_ner, batch_size)
  step = 0
  print(f'############### EPOCH {epoch + 1} START ###############')
  while(True):
    data = train_loader.load_data()
    if data is None:
      k_step_loss = 0
      break
  
    input_ids = data[1].to(device)
    attention_mask = data[2].to(device)
    labels = data[3].view(-1).to(device)
    output = ner(input_ids, attention_mask)
    loss = loss_fn(output, labels)
    k_step_loss += loss.item()
    if (step + 1) % k == 0:
      print(f'EPOCH {epoch + 1} | STEP {step + 1} | LOSS {k_step_loss}')
      loss_collection.append(k_step_loss)
      k_step_loss = 0
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    step += 1

  print(f'############### EPOCH {epoch + 1} Evaluation ###############')
  eval_data()
  print()
  print()

############### EPOCH 1 START ###############
EPOCH 1 | STEP 100 | LOSS 770.5746170282364
EPOCH 1 | STEP 200 | LOSS 137.29697716236115
EPOCH 1 | STEP 300 | LOSS 96.58927012979984
EPOCH 1 | STEP 400 | LOSS 70.21778629720211
EPOCH 1 | STEP 500 | LOSS 72.632075317204
EPOCH 1 | STEP 600 | LOSS 63.60607452690601
EPOCH 1 | STEP 700 | LOSS 58.93165780603886
EPOCH 1 | STEP 800 | LOSS 49.94273281097412
EPOCH 1 | STEP 900 | LOSS 53.94074726104736
EPOCH 1 | STEP 1000 | LOSS 51.51620499789715
EPOCH 1 | STEP 1100 | LOSS 44.176193840801716
EPOCH 1 | STEP 1200 | LOSS 39.67821581661701
EPOCH 1 | STEP 1300 | LOSS 38.91696944087744
EPOCH 1 | STEP 1400 | LOSS 42.03956326842308
EPOCH 1 | STEP 1500 | LOSS 50.63298259675503
EPOCH 1 | STEP 1600 | LOSS 45.52160008251667
EPOCH 1 | STEP 1700 | LOSS 49.23795388638973
EPOCH 1 | STEP 1800 | LOSS 44.2594438791275
EPOCH 1 | STEP 1900 | LOSS 45.66712674498558
EPOCH 1 | STEP 2000 | LOSS 47.50389660522342
EPOCH 1 | STEP 2100 | LOSS 43.65423549711704
EPOCH 1 | STEP 2200

## Saving Model & Plotting Training Loss

In [None]:
# save model
from google.colab import drive  
drive.mount('/content/drive')
torch.save(ner, 'drive/MyDrive/HW5_checkpoint.pth')
# plot loss
plt.figure(figsize=(12,12))
plt.plot(list(range(len(loss_collection))), loss_collection)
plt.xlabel(f'every {k} steps')
plt.ylabel(f'every {k} steps total loss')

# Testing

In [None]:
test_loader = DataLoder(test_ner, 10)
while(True):
  data = test_loader.load_data()
  if data is None:
    break
  # your code
  # data is a list -> first-element: ids, second-element: input_ids, third-element: attention masks, fourth-element: labels

In [73]:
test_result = {}
test_loader = DataLoder(test_ner, batch_size=10)
all_preds = []
while(True):
  data = test_loader.load_data()
  if data is None:
    break
  break

In [75]:
data[4]

[[2], [1, 1], [1, 1]]

In [40]:
test_result = {}
test_loader = DataLoder(test_ner, batch_size=10)
all_preds = []
while(True):
  data = test_loader.load_data()
  if data is None:
    break
  ids = data[0]
  batch_num = data[1].shape[0]
  input_ids = data[1].to(device)
  attention_mask = data[2].to(device)
  token_mappings = data[4]
  output = ner(input_ids, attention_mask)
  idx = output[:,None].argmax(-1)
  idx = idx.cpu().numpy().reshape(batch_num, -1)

  for i in range(batch_num):
    test_result[ids[i]] = idx[i]
    token_mapping = token_mappings[i]
    

In [41]:
test_result

{'b7a55e29-a802-4971-870d-f9bf7c97124c': array([1, 7]),
 'b548c51f-bcd0-41cc-a42b-dc405fe5970c': array([12,  1]),
 '33c76679-179a-4ec6-a7da-1ff02a464690': array([1, 7]),
 '7a189021-1a8d-4939-9545-7057b02b8136': array([12,  1,  7,  7]),
 '8f87554c-92b1-4946-a0e5-5f7686fe8db3': array([12, 12,  2,  8,  8]),
 'bd69e0a4-c8d3-45fc-aed9-5b50aea443fe': array([12,  1,  7,  7,  7]),
 'cc1c68ae-afde-4fa5-93af-3a547683c784': array([12, 12, 12, 12,  5, 12]),
 '2445195a-30ef-4ed6-aadc-0a49947a3240': array([12, 12, 12,  3,  9,  9]),
 '075328cc-a807-4e12-8a5b-e163952321d6': array([ 4, 10, 10, 12, 12,  7]),
 '27318878-d732-4ac9-8446-8cc58cc507cd': array([12, 12, 12,  0, 12, 12]),
 '31e8f8e9-d870-41cc-8e0b-f8fd71607932': array([ 3,  9, 12, 12, 12, 12, 12]),
 '20264c8b-eac4-4bff-8786-68e05b3e74b3': array([ 1,  7,  7, 12, 12, 12, 12]),
 '6c10f839-73b0-4180-a2bf-d30ba35e65cc': array([12, 12, 12, 12,  3,  9, 12, 12]),
 'ff6d4a0d-0fa3-4247-bc1a-8bcae10d53be': array([ 0, 12, 12,  0,  6, 12, 12, 12]),
 '0913a7

In [37]:
data[1].shape

torch.Size([3, 4])