In [1]:
import os
import itertools
import pandas as pd
import numpy as np
import csv
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

In [2]:
label_list = ['O','B-MISC','I-MISC','B-PER','I-PER','B-ORG','I-ORG','B-LOC','I-LOC']
label_encoding_dict = {
    "B-LOC": 0,
    "B-MISC": 1,
    "B-ORG": 2,
    "B-PER": 3,
    "I-LOC": 4,
    "I-MISC": 5,
    "I-ORG": 6,
    "I-PER": 7,
    "O": 8
}

In [3]:
task = "ner" 
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [15]:
def convert(path):
    tokens = []
    entities = []
    with open(path, 'r',) as f:
        reader = csv.reader(f, delimiter='\t')
        temp_t = []
        temp_e = []
        next(reader)
        for line in enumerate(reader):
            if not line[1]:
                tokens.append(temp_t)
                entities.append(temp_e)
                temp_t = []
                temp_e = []
            else:
                t = [ '{}'.format(x) for x in list(csv.reader([line[1][0]], delimiter=',', quotechar='"'))[0] ]
                t[3] = int(t[3])
                temp_t.append(t[1])
                temp_e.append(t[2])
        tokens.append(temp_t)
        entities.append(temp_e)
    return Dataset.from_pandas(pd.DataFrame({'tokens': tokens, 'ner_tags': entities}))

In [16]:
train = convert("/Users/johnsonchan/Documents/Coding/Class/CMPSC 190I W/HW 4/HW4_spanish_ner_data/train.csv")

In [17]:
validate = convert("/Users/johnsonchan/Documents/Coding/Class/CMPSC 190I W/HW 4/HW4_spanish_ner_data/validation.csv")

In [18]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [19]:
train_tokenized = train.map(tokenize_and_align_labels, batched=True)
validate_tokenized = validate.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/8323 [00:00<?, ? examples/s]

Map:   0%|          | 0/1915 [00:00<?, ? examples/s]

In [20]:
device = torch.device("mps" if torch.cuda.is_available() else "cpu")

In [21]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [22]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [23]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

In [24]:
trainer = Trainer(
    model.to(device),
    args,
    train_dataset=train_tokenized,
    eval_dataset=validate_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()
trainer.evaluate()
trainer.save_model('spanish-ner.model')

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: Placeholder storage has not been allocated on MPS device!