In [1]:
# !pip install numpy==1.23.5 scikit-learn==1.2.2 tensorboard==2.14.1 torch==2.1.0 tqdm==4.66.1 transformers==4.34.1
# !pip install scikit-learn==1.4 torch==2.5.1 tqdm==4.66.1 transformers==4.41.1 huggingface-hub>23.0

In [2]:
import re
import random
from collections import Counter, defaultdict, namedtuple
from typing import Tuple, List, Dict, Any

from os import listdir
from os.path import isfile, join
from pathlib import Path

import torch
import numpy as np

from tqdm import tqdm, trange
import warnings

warnings.filterwarnings("ignore")

In [3]:
def set_global_seed(seed: int) -> None:
    """
    Set global seed for reproducibility.
    """

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_global_seed(42)

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
def handle_text(
    text: str
) -> Tuple[List[str], List[Tuple[int, int]]]:
    pattern = r'\b\w+\b'
    matches = re.finditer(pattern, text.lower())

    tokens = []
    pos = []
    for match in matches:
        tokens += [match.group(0)]
        pos += [(match.start(), match.end())]
    return tokens, pos

In [6]:
def handle_nerel(
    txt_path: str,
    ann_path: str,
) -> Tuple[List[List[str]], List[List[str]]]:

    if not (isfile(txt_path) and isfile(ann_path)):
        return [], []
    
    with open(txt_path, "r", encoding="utf-8") as reader:
      txt_lines = reader.readlines()

    with open(ann_path, "r", encoding="utf-8") as reader:
      ann_lines = reader.readlines()

    # Create named entities list
    ne_list = {}
    for ann in ann_lines:
        parts = ann.strip().split()
        if not(len(parts) >= 5 and parts[0].startswith("T")):
            continue
            
        entity_type = parts[1].strip()

        ne_text = " ".join(parts[4:])
        ne_parts = " ".join(list(map(str.strip, re.sub(r'[^\w]', ' ', ne_text).lower().strip().split())))
        ne_list.setdefault(ne_parts, entity_type)

    max_ne_len = max(map(lambda ne: len(ne.split()), ne_list.keys()))

    # Handle text lines
    cur_tokens = []
    cur_labels = []
    token_seq = []
    label_seq = []
    for line in txt_lines:
        if not line.strip():
            if not (cur_tokens and cur_labels):
                continue
            token_seq += [cur_tokens]
            label_seq += [cur_labels]
            cur_tokens = []
            cur_labels = []
        else:
            clear_line = line.strip()
            cur_tokens, cur_pos = handle_text(clear_line)

            cur_labels = ["0" for i in range(len(cur_tokens))]
            for start in range(len(cur_tokens)):
                for end in range(start + 1, min(len(cur_tokens) + 1, start + max_ne_len + 1)):
                    substr = " ".join(cur_tokens[start:end])
                    if substr in ne_list:
                        for label_idx in range(start, end):
                            cur_labels[label_idx] = ne_list[substr]
                            
            # print(f"{cur_tokens}\t{cur_labels}")
            

    if cur_tokens and cur_labels:
        token_seq += [cur_tokens]
        label_seq += [cur_labels]
       
    return token_seq, label_seq

In [7]:
def read_nerel(
    path: str,
    lower: bool = True,
) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Prepare data in CoNNL like format.

    Args:
        path:   The path to the files dir (str).
        lower:  Reduce text to lowercase (bool).

    Returns:
        Function returns pair (token_seq, label_seq).
        token_seq: The list of lists. Each internal list is
            a sentence converted into tokens.
        label_seq: The list of lists. All internal lists
            contain tags corresponding to tokens from token_seq.

    """

    token_seq: List[List[str]] = []
    label_seq: List[List[str]] = []

    files = list(set([join(path, Path(f).stem) for f in listdir(path) if isfile(join(path, f))]))

    txt_suffux = ".txt"
    ann_suffix = ".ann"

    for file in files:
        cur_tokens, cur_labels = handle_nerel(file + txt_suffux,
                                              file + ann_suffix)
        token_seq += cur_tokens
        label_seq += cur_labels
    return token_seq, label_seq

In [8]:
train_token_seq, train_label_seq = read_nerel("/kaggle/input/nerel-v1-1/train/")
valid_token_seq, valid_label_seq = read_nerel("/kaggle/input/nerel-v1-1/dev/")
test_token_seq, test_label_seq   = read_nerel("/kaggle/input/nerel-v1-1/test/")

In [9]:
for token, label in zip(train_token_seq[0], train_label_seq[0]):
    print(f"{token}\t{label}")

у	0
бывшего	0
президента	PROFESSION
украины	COUNTRY
виктора	PERSON
януковича	PERSON
на	0
территории	0
россии	COUNTRY
якобы	0
родился	EVENT
сын	EVENT
сообщает	0
страна	ORGANIZATION
ua	ORGANIZATION


In [10]:
for token, label in zip(valid_token_seq[0], valid_label_seq[0]):
    print(f"{token}\t{label}")

первый	ORGANIZATION
канал	ORGANIZATION
аннулировал	EVENT
результаты	0
финального	EVENT
голосования	EVENT
на	0
шоу	0
голос	WORK_OF_ART
дети	WORK_OF_ART
победительницей	0
которого	0
стала	0
десятилетняя	AGE
микелла	PERSON
абрамова	PERSON


In [11]:
for token, label in zip(test_token_seq[0], test_label_seq[0]):
    print(f"{token}\t{label}")

скончался	EVENT
кузя	PERSON
уо	PERSON


In [12]:
token_counter = Counter([token for sentence in train_token_seq for token in sentence])
print(*token_counter.most_common(10), sep='\n')
print(f"Количество уникальных слов в тренировочном датасете: {len(token_counter)}")
print(f"Количество слов встречающихся только один раз в тренировочном датасете: {len([token for token, cnt in token_counter.items() if cnt == 1])}")

('в', 7183)
('и', 3092)
('на', 2307)
('с', 1526)
('по', 1383)
('что', 1348)
('года', 1048)
('не', 991)
('из', 794)
('он', 762)
Количество уникальных слов в тренировочном датасете: 31568
Количество слов встречающихся только один раз в тренировочном датасете: 18097


In [13]:
def get_token2idx(
    token_seq: List[List[str]],
    min_count: int,
) -> Dict[str, int]:
    """
    Get mapping from tokens to indices to use with Embedding layer.

    Args:
        token_seq: The list of lists. Each internal list (sentence)
            consists of tokens.
        min_count:  The minimum number of repetitions of
            a token in the corpus.

    Returns:
        Function returns mapping from token to id.
        token2idx: The mapping from token
            to id without "rare" words.

    """

    token2idx: Dict[str, int] = {}
    token2cnt = Counter([token for sentence in token_seq for token in sentence])

    # token2cnt = Counter({k: c for k, c in token2cnt.items() if c >= min_count})
    token2idx["<PAD>"] = 0
    token2idx["<UNK>"] = 1

    current_idx = 2
    for token, count in token2cnt.items():
        if count >= min_count:
            token2idx[token] = current_idx
            current_idx += 1

    return token2idx

In [14]:
token2idx = get_token2idx(train_token_seq, min_count=2)

In [15]:
def get_label2idx(label_seq: List[List[str]]) -> Dict[str, int]:
    """
    Get mapping from labels to indices.

    Args:
        label_seq: The list of lists. Each internal list (sentence)
            consists of labels.

    Returns:
        Function returns mapping from label to id.
        label2idx: The mapping from label to id.

    """

    label2idx: Dict[str, int] = {}
    label_list = set(label for sentence in label_seq for label in sentence)
    label_list = sorted(label_list, key=lambda x: 'A' if x == 'O' else x)

    label2idx = {k: i for i, k in enumerate(label_list)}

    return label2idx

In [16]:
label2idx = get_label2idx(train_label_seq)

In [18]:
for token, idx in list(token2idx.items())[:20]:
    print(f"{token}\t{idx}")

<PAD>	0
<UNK>	1
у	2
бывшего	3
президента	4
украины	5
виктора	6
януковича	7
на	8
территории	9
россии	10
якобы	11
родился	12
сын	13
сообщает	14
страна	15
ua	16
ребёнок	17
появился	18
свет	19


In [19]:
for label, idx in label2idx.items():
    print(f"{label}\t{idx}")

0	0
AGE	1
AWARD	2
CITY	3
COUNTRY	4
CRIME	5
DATE	6
DISEASE	7
DISTRICT	8
EVENT	9
FACILITY	10
FAMILY	11
IDEOLOGY	12
LANGUAGE	13
LAW	14
LOCATION	15
MONEY	16
NATIONALITY	17
NUMBER	18
ORDINAL	19
ORGANIZATION	20
PENALTY	21
PERCENT	22
PERSON	23
PRODUCT	24
PROFESSION	25
RELIGION	26
STATE_OR_PROVINCE	27
TIME	28
WORK_OF_ART	29


In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def compute_metrics(
    outputs: torch.Tensor,
    labels: torch.LongTensor,
) -> Dict[str, float]:
    """
    Compute NER metrics.

    Args:
        outputs: the model outputs (batch_size, num_classes, sequence_len)
        labels: the correct classes (batch_size, sequence_len)

    Returns:
        metrics: mapping metric names to their corresponding values
    """

    metrics = {}

    outputs = outputs.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()

    outputs = np.transpose(outputs, (0, 2, 1))
    outputs = outputs.reshape(-1, outputs.shape[-1])
    y_pred = np.argmax(outputs, axis=-1)

    labels = labels.flatten()

    mask = labels != -1
    y_true = labels[mask]
    y_pred = y_pred[mask]


    metrics['accuracy'] = accuracy_score(
        y_true=y_true,
        y_pred=y_pred,
    )

    for metric_func in [precision_score, recall_score, f1_score]:
        metric_name = metric_func.__name__.split('_')[0]
        for average_type in ["micro", "macro", "weighted"]:
            metrics[metric_name + '_' + average_type] = metric_func(
                y_true=y_true,
                y_pred=y_pred,
                average=average_type,
                zero_division=0,
            )

    return metrics

In [20]:
def train_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    device: torch.device,
    epoch: int,
    model_type: str,
) -> None:
    """
    One training cycle (loop).

    Args:
        model: BiLSTM model
        dataloader: Dataloader with train data
        optimizer: an algorithm for model optimization
        criterion: the loss function
        device: the device on which the model will work
        epoch: the total number of epochs

    Returns:
        None
    """

    model.train()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)

    for i, (tokens, labels) in tqdm(
        enumerate(dataloader),
        total=len(dataloader),
        desc="loop over train batches",
    ):

        tokens, labels = tokens.to(device), labels.to(device)

        outputs = None
        loss = None

        if model_type == 'BiLSTM':
            optimizer.zero_grad()
            outputs = model(tokens)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        elif model_type == 'Transformer':
            optimizer.zero_grad()
            outputs = model(**tokens)
            loss = criterion(outputs["logits"].transpose(1, 2), labels)
            loss.backward()
            optimizer.step()
        else:
            raise ValueError('Use \'BiLSTM\' or \'Transformer\' model_type.')

        epoch_loss.append(loss.item())

        with torch.no_grad():
            model.eval()
            if model_type == 'BiLSTM':
                outputs_inference = model(tokens)
            elif model_type == 'Transformer':
                outputs_inference = model(**tokens)["logits"].transpose(1, 2)
            else:
                raise ValueError('Use \'BiLSTM\' or \'Transformer\' model_type.')
            model.train()

        batch_metrics = compute_metrics(
            outputs=outputs_inference,
            labels=labels,
        )

        for metric_name, metric_value in batch_metrics.items():
            batch_metrics_list[metric_name].append(metric_value)

    avg_loss = np.mean(epoch_loss)
    print(f"Train loss: {avg_loss}\n")

    for metric_name, metric_value_list in batch_metrics_list.items():
        metric_value = np.mean(metric_value_list)
        print(f"Train {metric_name}: {metric_value}\n")

In [21]:
def evaluate_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    device: torch.device,
    epoch: int,
    model_type: str,
) -> None:
    """
    One evaluation cycle (loop).

    Args:
        model: BiLSTM model
        dataloader: Dataloader with data for evaluation
        criterion: a loss function
        device: the device on which the model will work
        epoch: the total number of epochs

    Returns:
        None
    """

    model.eval()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)

    with torch.no_grad():

        for i, (tokens, labels) in tqdm(
            enumerate(dataloader),
            total=len(dataloader),
            desc="loop over test batches",
        ):

            tokens, labels = tokens.to(device), labels.to(device)

            if model_type == 'BiLSTM':
                outputs = model(tokens)
                loss = criterion(outputs, labels)
            elif model_type == 'Transformer':
                outputs = model(**tokens)
                loss = criterion(outputs["logits"].transpose(1, 2), labels)
                outputs = outputs["logits"].transpose(1, 2)
            else:
                raise ValueError('Use \'BiLSTM\' or \'Transformer\' model_type.')

            epoch_loss.append(loss.item())

            batch_metrics = compute_metrics(
                outputs=outputs,
                labels=labels,
            )

            for metric_name, metric_value in batch_metrics.items():
                batch_metrics_list[metric_name].append(metric_value)

        avg_loss = np.mean(epoch_loss)
        print(f"Test loss:  {avg_loss}\n")

        for metric_name, metric_value_list in batch_metrics_list.items():
            metric_value = np.mean(metric_value_list)
            print(f"Test {metric_name}: {metric_value}\n")

In [87]:
def train(
    n_epochs: int,
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    valid_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    device: torch.device,
    model_type: str,
) -> None:
    """
    Training loop.

    Args:
        n_epochs: the total number of epochs in training
        model: BiLSTM model
        train_dataloader:  Dataloader with train data
        valid_dataloader: Dataloader with data for evaluation
        optimizer: an algorithm for model optimization
        criterion: a loss function
        device: the device on which the model will work

    Returns:
        None
    """

    for epoch in range(n_epochs):

        print(f"Epoch [{epoch+1} / {n_epochs}]\n")

        train_epoch(
            model=model,
            dataloader=train_dataloader,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            epoch=epoch,
            model_type=model_type,
        )
        evaluate_epoch(
            model=model,
            dataloader=valid_dataloader,
            criterion=criterion,
            device=device,
            epoch=epoch,
            model_type=model_type,
        )

In [63]:
from transformers import AutoTokenizer

In [150]:
model_name = "cointegrated/rubert-tiny"

In [151]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/468k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [152]:
class TransformersDataset(torch.utils.data.Dataset):
    """
    Transformers Dataset for NER.
    """

    def __init__(
        self,
        token_seq: List[List[str]],
        label_seq: List[List[str]],
    ):
        """
        Class constructor.

        Args:
            token_seq: the list of lists contains token sequences.
            label_seq: the list of lists consists of label sequences.

        Returns:
            None
        """
        self.token_seq = token_seq
        self.label_seq = [self.process_labels(labels, label2idx) for labels in label_seq]

    def __len__(self):
        """
        Returns length of the dataset.

        Args:
            None

        Returns:
            length of the dataset
        """
        return len(self.token_seq)

    def __getitem__(
        self,
        idx: int,
    ) -> Tuple[List[str], List[int]]:
        """
        Gets one item for tthe dataset

        Args:
            idx: the index of the particular element in the dataset

        Returns:
            (tokens, labels), where `tokens` is sequence of token in the dataset
                by index `idx` and `labels` is corresponding labels list
        """
        tokens = self.token_seq[idx]
        labels = self.label_seq[idx]

        return tokens, labels

    @staticmethod
    def process_labels(
        labels: List[str],
        label2idx: Dict[str, int],
    ) -> List[int]:
        """
        Transform list of labels into list of labels' indices.

        Args:
            labels: the list of strings contains the labels
            label2idx: mapping from a label to an index

        Returns:
            ids: the sequence of indices that correspond to labels
        """

        ids = [label2idx[label] for label in labels]

        return ids

In [153]:
train_dataset = TransformersDataset(
    token_seq=train_token_seq,
    label_seq=train_label_seq,
)
valid_dataset = TransformersDataset(
    token_seq=valid_token_seq,
    label_seq=valid_label_seq,
)
test_dataset = TransformersDataset(
    token_seq=test_token_seq,
    label_seq=test_label_seq,
)

In [154]:
train_dataset[0]

(['российский',
  'магнат',
  'дмитрий',
  'ицков',
  'собирается',
  'поместить',
  'содержимое',
  'своего',
  'мозга',
  'в',
  'искусственное',
  'тело',
  'и',
  'достичь',
  'бессмертия'],
 [17, 25, 23, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [155]:
valid_dataset[0]

(['десятки',
  'мафиозных',
  'боссов',
  'могут',
  'быть',
  'освобождены',
  'из',
  'тюрем',
  'по',
  'всей',
  'италии',
  'из',
  'за',
  'риска',
  'заражения',
  'коронавирусом',
  'судьи',
  'уже',
  'освободили',
  'как',
  'минимум',
  'трех',
  'стареющих',
  'бандитов',
  'поместив',
  'их',
  'под',
  'домашний',
  'арест'],
 [18,
  25,
  25,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  4,
  0,
  0,
  0,
  0,
  7,
  25,
  0,
  9,
  0,
  0,
  18,
  0,
  25,
  0,
  0,
  0,
  21,
  21])

In [156]:
test_dataset[0]

(['путин',
  'подписал',
  'указ',
  'о',
  'подготовке',
  'празднования',
  '75',
  'й',
  'годовщины',
  'победы'],
 [23, 9, 14, 14, 14, 9, 1, 1, 1, 9])

In [157]:
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding


class TransformersCollator:
    """
    Transformers Collator that handles variable-size sentences.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        tokenizer_kwargs: Dict[str, Any],
        label_padding_value: int,
    ):
        """
        TransformersCollator class constructor.

        Args:
            tokenizer: the pretrained tokenizer which converts sentence
                to tokens.
            tokenizer_kwargs: the arguments of the tokenizer
            label_padding_value: the padding value for a label

        Returns:
            None
        """
        self.tokenizer = tokenizer
        self.tokenizer_kwargs = tokenizer_kwargs

        self.label_padding_value = label_padding_value

    def __call__(
        self,
        batch: List[Tuple[List[str], List[int]]],
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
        """
        Calls transformers' collator.

        Args:
            batch: One batch with sentence and labels.

        Returns:
            (tokens, labels), where `tokens` is sequence of token
                and `labels` is corresponding labels list
        """
        tokens, labels = zip(*batch)

        tokens = self.tokenizer(tokens, **self.tokenizer_kwargs)
        labels = self.encode_labels(tokens, labels, self.label_padding_value)

        tokens.pop("offset_mapping")

        return tokens, labels

    @staticmethod
    def encode_labels(
        tokens: BatchEncoding,
        labels: List[List[int]],
        label_padding_value: int,
    ) -> torch.LongTensor:

        encoded_labels = []

        for doc_labels, doc_offset in zip(labels, tokens.offset_mapping):

            doc_enc_labels = np.ones(len(doc_offset), dtype=int) * label_padding_value
            arr_offset = np.array(doc_offset)

            doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
            encoded_labels.append(doc_enc_labels.tolist())

        return torch.LongTensor(encoded_labels)

In [158]:
tokenizer_kwargs = {
    "is_split_into_words":    True,
    "return_offsets_mapping": True,
    "padding":                True,
    "truncation":             True,
    "max_length":             512,
    "return_tensors":         "pt",
}

In [159]:
collator = TransformersCollator(
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    label_padding_value=-1,
)

In [160]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collator,
)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collator,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collator,
)

In [161]:
tokens, labels = next(iter(train_dataloader))

tokens = tokens.to(device)
labels = labels.to(device)

In [162]:
tokens

{'input_ids': tensor([[    2, 17976,  1241, 20839,  2371,   314,  3277, 13325,  2788, 25599,
          7548,  7906,  2480,    25,  3452,   991,   650, 16615, 11785,   679,
          9297,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [    2,  2389,  3687,   769,  5215, 20538,  1129,   887,  1736,  8126,
           322, 23964, 20245,   650,  2225, 18287,   644,   331, 17356,    89,
           795,   541,   689, 25989,  9097,   314, 20222, 10633,  9762,   314,
          3200,   860, 11783,  1129, 11316,  2629,   719,  2262,  8481,   316,
         24183,   334,  3374,  3003,   776,   322, 29350,  9521, 19946,   656,
       

In [77]:
labels

tensor([[-1,  0,  0,  0,  0,  0,  0,  0,  0, -1, 23, -1, -1, -1,  0, 25,  0,  0,
          0,  0, -1,  9,  9, -1,  9, 23, -1, -1, -1,  0,  0, 23, -1, -1, -1,  0,
          0,  0,  0, 23, -1, 23, -1, -1,  0,  6,  6,  6,  6,  0,  0, -1,  0,  0,
          9,  9,  9,  9,  0,  9,  0,  3, -1, -1,  0, 27, -1, -1],
        [-1, 23, -1, 23, -1, -1,  9,  6,  6,  6,  0,  3, -1,  0,  9,  0,  4, -1,
          0,  9, -1,  9,  0,  0, 23, -1, 23, -1, -1, 23, -1, -1,  0, 23, -1, 23,
         -1, -1, -1, 23, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]],
       device='cuda:0')

In [163]:
from transformers import AutoModelForTokenClassification

In [165]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [166]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [167]:
outputs = model(**tokens)

In [168]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2idx),
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
train(2, model, train_dataloader, test_dataloader, optimizer, criterion, device, model_type='Transformer')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1 / 2]



loop over train batches: 100%|██████████| 2532/2532 [01:04<00:00, 39.49it/s]


Train loss: 1.1269770462660333

Train accuracy: 0.730566379258425

Train precision_micro: 0.730566379258425

Train precision_macro: 0.36312456567739076

Train precision_weighted: 0.6405253334151676

Train recall_micro: 0.730566379258425

Train recall_macro: 0.39484436978950604

Train recall_weighted: 0.730566379258425

Train f1_micro: 0.730566379258425

Train f1_macro: 0.36037544326537396

Train f1_weighted: 0.6669212134547301



loop over test batches: 100%|██████████| 656/656 [00:08<00:00, 80.77it/s]


Test loss:  0.8011731366689385

Test accuracy: 0.7847300352982242

Test precision_micro: 0.7847300352982242

Test precision_macro: 0.5266015486516193

Test precision_weighted: 0.7421485402047576

Test recall_micro: 0.7847300352982242

Test recall_macro: 0.5537226437283732

Test recall_weighted: 0.7847300352982242

Test f1_micro: 0.7847300352982242

Test f1_macro: 0.524480126698854

Test f1_weighted: 0.7481568827004733

Epoch [2 / 2]



loop over train batches: 100%|██████████| 2532/2532 [01:03<00:00, 39.67it/s]


Train loss: 0.7220907474957747

Train accuracy: 0.8142334456058686

Train precision_micro: 0.8142334456058686

Train precision_macro: 0.5549421812394669

Train precision_weighted: 0.7832895309669589

Train recall_micro: 0.8142334456058686

Train recall_macro: 0.569998296636999

Train recall_weighted: 0.8142334456058686

Train f1_micro: 0.8142334456058686

Train f1_macro: 0.5417780503711705

Train f1_weighted: 0.7834856083115391



loop over test batches: 100%|██████████| 656/656 [00:08<00:00, 80.73it/s]

Test loss:  0.6496579609030472

Test accuracy: 0.8174364219286242

Test precision_micro: 0.8174364219286242

Test precision_macro: 0.5976435160487428

Test precision_weighted: 0.8002671724032545

Test recall_micro: 0.8174364219286242

Test recall_macro: 0.6124160447810778

Test recall_weighted: 0.8174364219286242

Test f1_micro: 0.8174364219286242

Test f1_macro: 0.5894434933456056

Test f1_weighted: 0.7940216736034397






In [169]:
evaluate_epoch(
    model=model,
    dataloader=test_dataloader,
    criterion=criterion,
    device=device,
    epoch=1,
    model_type='Transformer',
)

loop over test batches: 100%|██████████| 656/656 [00:08<00:00, 81.64it/s]

Test loss:  0.6496579609030472

Test accuracy: 0.8174364219286242

Test precision_micro: 0.8174364219286242

Test precision_macro: 0.5976435160487428

Test precision_weighted: 0.8002671724032545

Test recall_micro: 0.8174364219286242

Test recall_macro: 0.6124160447810778

Test recall_weighted: 0.8174364219286242

Test f1_micro: 0.8174364219286242

Test f1_macro: 0.5894434933456056

Test f1_weighted: 0.7940216736034397






In [97]:
from transformers import BertTokenizer, PreTrainedTokenizer, AutoModelForTokenClassification

In [117]:
loaded_tokenizer = BertTokenizer.from_pretrained(save_dir)
quantized_model = AutoModelForTokenClassification.from_pretrained(save_dir)

In [111]:
evaluate_epoch(
    model=model,
    dataloader=test_dataloader,
    criterion=criterion,
    device=device,
    epoch=1,
    model_type='Transformer',
)

loop over test batches: 100%|██████████| 656/656 [00:08<00:00, 80.63it/s]

Test loss:  0.515794799854076

Test accuracy: 0.8540867246135613

Test precision_micro: 0.8540867246135613

Test precision_macro: 0.6823362475846498

Test precision_weighted: 0.8633824049041402

Test recall_micro: 0.8540867246135613

Test recall_macro: 0.6853750116532525

Test recall_weighted: 0.8540867246135613

Test f1_micro: 0.8540867246135613

Test f1_macro: 0.6696878175256592

Test f1_weighted: 0.8459211537388217






In [28]:
import json

save_dir = "./saved_bert_model"

In [23]:
with open(save_dir + "/label2idx", "w") as writer:
    writer.write(json.dumps(label2idx. indent=2))

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (<ipython-input-23-58bce0f7e495>, line 2)

In [22]:
with open(save_dir + "/token2idx", "w") as writer:
    writer.write(json.dumps(token2idx, indent=2))

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (<ipython-input-22-ffd2bda14595>, line 2)

In [None]:
type(with open(save_dir + "/label2idx", "w") as writer:)

In [178]:
    

token2idx

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./saved_bert_model/tokenizer_config.json',
 './saved_bert_model/special_tokens_map.json',
 './saved_bert_model/vocab.txt',
 './saved_bert_model/added_tokens.json',
 './saved_bert_model/tokenizer.json')

In [179]:
!zip -r pretrained.zip /kaggle/working/

updating: kaggle/working/ (stored 0%)
updating: kaggle/working/model.pth (deflated 8%)
updating: kaggle/working/.virtual_documents/ (stored 0%)
updating: kaggle/working/saved_bert_model/ (stored 0%)
updating: kaggle/working/saved_bert_model/special_tokens_map.json (deflated 80%)
updating: kaggle/working/saved_bert_model/vocab.txt (deflated 52%)
updating: kaggle/working/saved_bert_model/tokenizer.json (deflated 70%)
updating: kaggle/working/saved_bert_model/tokenizer_config.json (deflated 74%)
  adding: kaggle/working/saved_bert_model/config.json (deflated 65%)
  adding: kaggle/working/saved_bert_model/model.safetensors (deflated 8%)


In [145]:
# Check the number of parameters before and after pruning
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Number of parameters after pruning:", count_parameters(pruned_model))

Number of parameters after pruning: 29105502
