In [1]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import logging

import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
print(device)

cuda


In [3]:
logging.set_verbosity_error()

In [4]:
MODEL_CKPT = "DeepPavlov/rubert-base-cased"

DATA_PATH = "data"
NERS_PATH = os.path.join(DATA_PATH, "ners.txt")

TRAIN_PATH = os.path.join(DATA_PATH, "public_data", "train.jsonl")
TEST_PATH = os.path.join(DATA_PATH, "public_data", "test.jsonl")

In [5]:
MAX_LENGTH = 128

LR = 1e-3
BATCH_SIZE = 32
DR = 0.1
EPOCHS = 10

In [6]:
NULL_ENTITY = "O"
B_PREFIX = "B-"
I_PREFIX = "I-"

In [7]:
model = BertForTokenClassification.from_pretrained(MODEL_CKPT)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_CKPT)

In [8]:
with open(NERS_PATH, "r") as f:
    ner_types = [line.strip() for line in f.readlines()]

ner_labels = [NULL_ENTITY] + [f"{B_PREFIX}{ner}" for ner in ner_types] + [f"{I_PREFIX}{ner}" for ner in ner_types]
label_map = {label: i for i, label in enumerate(ner_labels)}

In [9]:
train_data = []
with open(TRAIN_PATH, 'r') as f:
    for line in f.readlines():
        data = json.loads(line)
        text = data['sentences']
        ners = data['ners']
        ners = sorted(ners, key=lambda x: x[0])
        train_data.append({"text": text, "ners": ners})

In [10]:
test_data = []
with open(TEST_PATH, 'r') as f:
    for line in f.readlines():
        data = json.loads(line)
        text = data['senences']
        id = data['id']
        test_data.append({"id": id, "text":  text})

In [11]:
def get_encoding(text):
    encoding = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
        return_offsets_mapping=True
    )
    return encoding

In [12]:
def get_encoding_and_labels(text, ners):
    encoding = get_encoding(text)

    token_ids = encoding["input_ids"][0]
    offset_mapping = encoding["offset_mapping"][0]

    ner_i = 0
    labels = []

    for offset, token_id in zip(offset_mapping, token_ids):

        start, end = offset

        if start == end:
            labels.append(label_map[NULL_ENTITY])
            continue

        try:
            ner_start, ner_end, ner_type = ners[ner_i]
        except:
            break
        while ner_end < start:
            ner_i += 1
            try:
                ner_start, ner_end, ner_type = ners[ner_i]
            except:
                break

        if ner_start > end:
            labels.append(label_map[NULL_ENTITY])
        elif ner_start == start:
            labels.append(label_map[B_PREFIX + ner_type])
        else:
            labels.append(label_map[I_PREFIX + ner_type])

    return encoding, labels

In [13]:
train_tokenized_data = []
test_tokenized_data = []

for sample in train_data:
    text = sample["text"]
    ners = sample["ners"]
    train_tokenized_data.append(get_encoding_and_labels(text, ners))

for sample in test_data:
    text = sample['text']
    id = sample['id']
    test_tokenized_data.append((get_encoding(text), id))

In [14]:
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset):
        self.tokenized_dataset = tokenized_dataset

    def __getitem__(self, idx):
        inputs, label = self.tokenized_dataset[idx]
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(label)
        }

    def __len__(self):
        return len(self.tokenized_dataset)

In [15]:
class NERTestDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset):
        self.tokenized_dataset = tokenized_dataset

    def __getitem__(self, idx):
        inputs, id = self.tokenized_dataset[idx]
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'offset_mapping': inputs["offset_mapping"][0],
            'id' : id
        }

    def __len__(self):
        return len(self.tokenized_dataset)

In [29]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    max_len = max(len(x) for x in input_ids)
    padded_input_ids = torch.zeros(len(input_ids), max_len, dtype=torch.long)
    padded_attention_mask = torch.zeros(len(attention_mask), max_len, dtype=torch.long)

    for i, (input_id, mask) in enumerate(zip(input_ids, attention_mask)):
        padded_input_ids[i, : len(input_id)] = input_id.clone().detach()
        padded_attention_mask[i, : len(mask)] = mask.clone().detach()

    max_label_len = max(len(label) for label in labels)
    padded_labels = torch.zeros(
        len(labels), max_label_len, len(ner_labels), dtype=torch.float32
    )

    for i, label in enumerate(labels):
        for j, label_id in enumerate(label):
            padded_labels[i, j, label_id] = 1.0

    return {
        "input_ids": padded_input_ids,
        "attention_mask": padded_attention_mask,
        "labels": padded_labels,
    }

In [30]:
def collate_fn_test(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    offset_mapping = [item["offset_mapping"] for item in batch]
    ids = torch.tensor([item['id'] for item in batch])

    # Find the maximum length in the batch for input_ids
    max_len = max(len(x) for x in input_ids)

    # Create padded tensors for input_ids and attention_mask
    padded_input_ids = torch.zeros(len(input_ids), max_len, dtype=torch.long)
    padded_attention_mask = torch.zeros(len(attention_mask), max_len, dtype=torch.long)
    padded_offset_mapping = torch.zeros(len(offset_mapping), max_len, 2, dtype=torch.long)

    # Fill the padded tensors with actual data
    for i, (input_id, mask, mapping) in enumerate(zip(input_ids, attention_mask, offset_mapping)):
        padded_input_ids[i, :len(input_id)] = input_id.clone().detach()
        padded_attention_mask[i, :len(mask)] = mask.clone().detach()
        padded_offset_mapping[i, :len(mapping)] = mapping.clone().detach()

    return {
        "input_ids": padded_input_ids,
        "attention_mask": padded_attention_mask,
        "offset_mapping": padded_offset_mapping,
        "ids": ids
    }

In [31]:
train_dataset = NERDataset(train_tokenized_data)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

test_dataset = NERTestDataset(test_tokenized_data)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn_test)

In [46]:
class NERModel(torch.nn.Module):
    def __init__(self, model):
        super(NERModel, self).__init__()
        self.model = model
        self.model.dropout = torch.nn.Dropout(DR)
        self.model.classifier = torch.nn.Linear(model.config.hidden_size, len(ner_labels))

        # Freeze the pretrained model weights
        for param in self.model.parameters():
            param.requires_grad = False

        for para in self.model.bert.encoder.layer[11].parameters():
            param.requires_grad = True

        # Set the classifier layer weights to be trainable
        for param in self.model.classifier.parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        return outputs.logits

In [47]:
ner_model = NERModel(model).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters(), lr=LR)

In [48]:
for epoch in range(EPOCHS):
    ner_model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)  # Move labels to GPU device

        optimizer.zero_grad()

        outputs = ner_model(input_ids, attention_mask)

        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")

Epoch 1, Loss: 10.096461408278522
Epoch 2, Loss: 9.244666323942297
Epoch 3, Loss: 8.76241050047033
Epoch 4, Loss: 8.480520024019128
Epoch 5, Loss: 8.336179649128633
Epoch 6, Loss: 8.249768032747156
Epoch 7, Loss: 8.179402716019574
Epoch 8, Loss: 8.138147943160114
Epoch 9, Loss: 8.097515723284554
Epoch 10, Loss: 8.060415520387537


In [49]:
ner_model.eval()

results = []

for batch in test_dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    offset_mapping = batch["offset_mapping"].to(device)
    ids = batch["ids"].to(device)

    outputs = ner_model(input_ids, attention_mask)

    predictions = torch.argmax(outputs, -1)

    results += list(zip(ids, predictions, offset_mapping))

In [50]:
labeled_result = []
for id, predictions, offset_mapping  in results:
  ners = []
  prev_label = NULL_ENTITY
  start_index = 0
  end_index = 0
  for p, m in zip(predictions, offset_mapping):
    label = ner_labels[p]
    if label == NULL_ENTITY:
      if prev_label != NULL_ENTITY and start_index != end_index:
        ners.append([start_index, end_index, prev_label[2:]])
      prev_label = label
    elif label[:2] == B_PREFIX:
      if prev_label != NULL_ENTITY and start_index != end_index:
        ners.append([start_index, end_index, prev_label[2:]])
      prev_label = label
      start_index = m[0].item()
      end_index = m[1].item()
    elif label[:2] == I_PREFIX:
      if prev_label[2:] != label[2:]:
        if prev_label != NULL_ENTITY and start_index != end_index:
          ners.append([start_index, end_index, prev_label[2:]])
        start_index = m[0].item()
      prev_label = label
      end_index = m[1].item()
  res = {"id" : id.item(), "ners": ners}
  labeled_result.append(res)

In [51]:
with open('test.jsonl', 'w') as f:
    for item in labeled_result:
        f.write(json.dumps(item) + '\n')

In [52]:
import zipfile

# Create a new ZIP file
with zipfile.ZipFile('test.zip', 'w') as zip_file:
    # Add a single file to the ZIP file
    zip_file.write('test.jsonl')