In [1]:
from datasets import load_from_disk, ClassLabel, DatasetDict
from transformers import AutoModel, AutoTokenizer, DataCollatorForTokenClassification, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch.nn as nn
import torch
import evaluate
from transformers import get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [26]:
PROPERTY = 'Gender'
LAYER_NUMBER = 1

### Loading the dataset

In [4]:
Lince_feature_dataset = load_from_disk(
    '../Datasets/Lince_' + "Number" + "_spaeng")

Lince_feature_dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'lid', 'Number'],
        num_rows: 27893
    })
    validation: Dataset({
        features: ['words', 'lid', 'Number'],
        num_rows: 4298
    })
    test: Dataset({
        features: ['words', 'lid', 'Number'],
        num_rows: 10720
    })
})

In [28]:
labels_list = set()

for split in Lince_feature_dataset:
    dataset = Lince_feature_dataset[split]
    for datapoint in dataset:
        labels_list.update(datapoint['Gender'])

labels_list = list(labels_list)

In [29]:
labels_list

['masc', '_', 'fem']

In [30]:
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-multilingual-uncased', return_tensors='pt')
label_all_tokens = True
labels_encoder = ClassLabel(names=labels_list)

# Tokenizing and aligning the tags


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["words"], truncation=True, is_split_into_words=True)
    examples["Gender"] = [labels_encoder.str2int(
        tag) for tag in examples["Gender"]]

    labels = []
    for i, label in enumerate(examples["Gender"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [31]:
labels_encoder.str2int('_')

1

In [32]:
preprocessed_dataset = Lince_feature_dataset.map(
    tokenize_and_align_labels, batched=True)
preprocessed_dataset.set_format(
    type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Loading cached processed dataset at /home/chaitanya/Acads/Thesis_Work/Experiments/Code-Switching/Datasets/Lince_Gender_spaeng/train/cache-58d1c04b515ee7ee.arrow
Loading cached processed dataset at /home/chaitanya/Acads/Thesis_Work/Experiments/Code-Switching/Datasets/Lince_Gender_spaeng/validation/cache-f84ed1e995aa71e1.arrow
Loading cached processed dataset at /home/chaitanya/Acads/Thesis_Work/Experiments/Code-Switching/Datasets/Lince_Gender_spaeng/test/cache-7f94ee2876a5a2c9.arrow


In [33]:
class mBERTLayerModel(nn.Module):
    def __init__(self, checkpoint, num_labels, layer_number):
        super(mBERTLayerModel, self).__init__()
        self.num_labels = num_labels
        self.layer_number = layer_number
        self.model = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint,
                                                                                             output_attention=True,
                                                                                             output_hidden_states=True))
        # self.dropout = nn.Dropout(0.1)
#        self.classifier = nn.LSTM(input_size=768, hidden_size=self.num_labels, num_layers=1, batch_first=True)
        self.classifier = nn.Linear(
            in_features=768, out_features=num_labels, bias=True)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask)

        # last_hidden_state = outputs[0]

        bert_embeddings = outputs.hidden_states[self.layer_number]
        # print(bert_embeddings.shape)
        # sequence_outputs = self.dropout(bert_embeddings)
        # print("dropout layer output: ", sequence_outputs.shape)
        logits = self.classifier(bert_embeddings)
        loss = None

        if labels is not None:
            loss_func = nn.CrossEntropyLoss()
            # print("Classifier output sent as is shape: ", logits[0].shape)
            loss = loss_func(torch.reshape(
                logits, (-1, self.num_labels)), labels.view(-1))
            # exit()
            return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states[self.layer_number], attentions=outputs.attentions)

In [34]:
POS_tagger = mBERTLayerModel(checkpoint='bert-base-multilingual-uncased',
                             num_labels=len(labels_list), layer_number=LAYER_NUMBER)
# POS_tagger.model.requires_grad(False) # Freezing a layer
# OR

for param in POS_tagger.model.parameters():
    param.requires_grad = False

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
data_collator = DataCollatorForTokenClassification(tokenizer)

train_dataloader = DataLoader(
    preprocessed_dataset["train"].select(range(6)), shuffle=True, batch_size=3, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    preprocessed_dataset["validation"].select(range(6)), batch_size=3, collate_fn=data_collator
)

In [36]:
optimizer = AdamW(POS_tagger.parameters(), lr=5e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

metric = evaluate.load("seqeval")


progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


for epoch in range(num_epochs):
    POS_tagger.train()
    for batch in train_dataloader:
        batch = {k: v for k, v in batch.items()}
        outputs = POS_tagger(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    POS_tagger.eval()
    for batch in eval_dataloader:
        batch = {k: v for k, v in batch.items()}
        with torch.no_grad():
            outputs = POS_tagger(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        # print(predictions.shape, "batch:", batch['labels'].shape)
        labels = batch['labels']
        true_predictions = [
            [labels_encoder.int2str(p.item()) for (
                p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [labels_encoder.int2str(l.item()) for (
                p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        # for sentence_true_labels, sentence_true_predictions in zip(true_labels, true_predictions):
        metric.add_batch(predictions=true_predictions, references=true_labels)
        progress_bar_eval.update(1)
    print("===================== Epoch: ", epoch, " =====================")
    print(metric.compute())

2


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'_': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'asc': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'em': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.631578947368421}


  _warn_prf(average, modifier, msg_start, len(result))
