## setup

In [1]:
!wget http://www.labinform.ru/pub/named_entities/collection5.zip

--2025-04-20 14:31:58--  http://www.labinform.ru/pub/named_entities/collection5.zip
Resolving www.labinform.ru (www.labinform.ru)... 95.181.230.181
Connecting to www.labinform.ru (www.labinform.ru)|95.181.230.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1899530 (1.8M) [application/zip]
Saving to: ‘collection5.zip’


2025-04-20 14:32:01 (1.08 MB/s) - ‘collection5.zip’ saved [1899530/1899530]



In [2]:
!unzip -q /kaggle/working/collection5.zip

In [3]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [34]:
import numpy as np
import pandas as pd
import os
import json
import random
import spacy
from spacy.cli import download
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler, Trainer, TrainingArguments, AutoModelForMaskedLM, pipeline 
from datasets import load_dataset
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

swag_seed = 59
download("ru_core_news_lg")

Collecting ru-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl (513.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 513.4/513.4 MB 3.0 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## data

In [5]:
ds_path = '/kaggle/working/Collection5'
print(len(list(os.listdir(ds_path))))

2000


In [6]:
def parse_ann_file(ann_path, txt_path):
    entities = []
    with open(ann_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                id_, tag_info, text = parts
                tag_parts = tag_info.split()
                entity_type = tag_parts[0]
                start = int(tag_parts[1])
                end = int(tag_parts[2])
                entities.append((start, end, entity_type))
                
    with open(txt_path, 'r', encoding='utf-8') as f:
        text = f.read().replace('\n', '\n ')
        
    text_entities = (text, {'entities': entities})
    
    return text_entities


def process_dataset(folder_path):
    data = []
    files = [f.split('.')[0] for f in os.listdir(folder_path) if f.endswith('.txt')]
    
    for file_base in files:
        txt_path = os.path.join(folder_path, f"{file_base}.txt")
        ann_path = os.path.join(folder_path, f"{file_base}.ann")
        
        if os.path.exists(ann_path):
            try:
                data.append(parse_ann_file(ann_path, txt_path))
            except Exception as e:
                print(f"Error processing {file_base}: {e}")
    
    return data

In [7]:
def split_dataset(data, test_size=0.2, random_state=swag_seed):
    train_data, test_data = train_test_split(
        data, 
        test_size=test_size, 
        random_state=random_state
    )
    return train_data, test_data


def prepare_dataset(input_folder):
    
    all_data = process_dataset(input_folder)
    train_data, test_data = split_dataset(all_data)

    print(f"Total samples: {len(all_data)}")
    print(f"Train samples: {len(train_data)}")
    print(f"Test samples: {len(test_data)}")
    
    label_counts = defaultdict(int)
    for _, annotations in all_data:
        for start, end, label in annotations['entities']:
            label_counts[label] += 1

    print("\nLabel distribution:")
    for label, count in label_counts.items():
        print(f"{label}: {count}")

    return train_data, test_data

In [8]:
train_data, test_data = prepare_dataset(ds_path)

Total samples: 1000
Train samples: 800
Test samples: 200

Label distribution:
GEOPOLIT: 4104
ORG: 7033
MEDIA: 1509
PER: 10623
LOC: 3140


In [9]:
all_labels = set()
for _, annotations in train_data:
    for _, _, label in annotations['entities']:
        all_labels.add(label)
for _, annotations in test_data:
    for _, _, label in annotations['entities']:
        all_labels.add(label)

label_list = ['O'] + sorted(list(all_labels))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

label2id

{'O': 0, 'GEOPOLIT': 1, 'LOC': 2, 'MEDIA': 3, 'ORG': 4, 'PER': 5}

In [45]:
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, label2id, id2label, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.id2label = id2label
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        text, annotations = item
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_offsets_mapping=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        labels = torch.zeros(self.max_length, dtype=torch.long)
        offset_mapping = encoding['offset_mapping'][0]
        
        char_labels = ['O'] * len(text)
        for start, end, label in annotations['entities']:
            for i in range(start, end):
                if i < len(char_labels):
                    char_labels[i] = label
        
        for token_idx, (token_start, token_end) in enumerate(offset_mapping):
            if token_start == token_end == 0:
                continue
            
            token_char_labels = char_labels[token_start:token_end]
            
            unique_labels = set(token_char_labels) - {'O'}
            if len(unique_labels) == 1:
                labels[token_idx] = self.label2id[unique_labels.pop()]
        
        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'labels': labels,
        }

    # sinful
    def print_alignment(self, idx):
        sample = self[idx]
        text = self.data[idx][0]
        tokens = self.tokenizer.convert_ids_to_tokens(sample['input_ids'])
        labels = [self.id2label[l.item()] for l in sample['labels']]
        
        print("\nText:", text)
        print("\nToken alignment:")
        for token, label in zip(tokens, labels):
            print(f"{token:20} {label}")
        
        print("\nOriginal entities:")
        for start, end, label in self.data[idx][1]['entities']:
            print(f"- {text[start:end]} ({start}-{end}): {label}")

In [13]:
# model_name = 'cointegrated/rubert-tiny2'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# train_dataset = NERDataset(train_data, tokenizer, label2id, id2label, max_len)
# train_dataset.print_alignment(9)

## train

In [14]:
model_name = 'cointegrated/rubert-tiny2'
batch_size = 64
lr = 5e-5
epochs = 20
max_len = 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [46]:
def train_model(train_data, test_data, device, label2id, id2label):
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    ).to(device)
    
    train_dataset = NERDataset(train_data, tokenizer, label2id, id2label, max_len)
    test_dataset = NERDataset(test_data, tokenizer, label2id, id2label, max_len)
    
    train_dataloader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True
    )
    test_dataloader = DataLoader(
        test_dataset, 
        batch_size=batch_size
    )
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=10,
        num_training_steps=num_training_steps
    )

    # eval before and after training
    evaluate_model(model, test_dataloader, device, label2id, id2label)
    
    progress_bar = tqdm(range(num_training_steps))
    model.train()  
    for epoch in range(epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            progress_bar.set_description(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    evaluate_model(model, test_dataloader, device, label2id, id2label)
    return model, tokenizer
    
    
def evaluate_model(model, test_dataloader, device, label2id, id2label):
    model.eval()
    all_predictions = []
    all_labels = []
    
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        
        for i in range(predictions.shape[0]):
            active_tokens = batch['attention_mask'][i] == 1
            all_predictions.extend(predictions[i][active_tokens].cpu().numpy())
            all_labels.extend(batch['labels'][i][active_tokens].cpu().numpy())
    
    text_labels = [id2label[l] for l in all_labels]
    text_predictions = [id2label[p] for p in all_predictions]
    
    target_names = [label for label in id2label.values() if label != 'O']
    
    print("\nClassification Report (excluding 'O' class):")
    print(classification_report(
        text_labels,
        text_predictions,
        labels=target_names,
        zero_division=0
    ))
    

In [16]:
model, tokenizer = train_model(train_data, test_data, device, label2id, id2label)

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Classification Report (excluding 'O' class):
              precision    recall  f1-score   support

    GEOPOLIT       0.02      0.16      0.03       721
         LOC       0.04      0.40      0.07      1106
       MEDIA       0.01      0.07      0.01       501
         ORG       0.02      0.03      0.03      2857
         PER       0.07      0.07      0.07      5491

   micro avg       0.03      0.10      0.05     10676
   macro avg       0.03      0.14      0.04     10676
weighted avg       0.05      0.10      0.05     10676



  0%|          | 0/260 [00:00<?, ?it/s]


Classification Report (excluding 'O' class):
              precision    recall  f1-score   support

    GEOPOLIT       0.94      0.67      0.79       721
         LOC       0.86      0.65      0.74      1106
       MEDIA       0.99      0.27      0.43       501
         ORG       0.73      0.82      0.77      2857
         PER       0.91      0.99      0.95      5491

   micro avg       0.85      0.85      0.85     10676
   macro avg       0.89      0.68      0.73     10676
weighted avg       0.86      0.85      0.84     10676



Пробовал разное кол-во эпох, остановился на 20, т.к. дальше уже начинаются скачки лосса 1.7-1.2 туда-сюда

Хуже всего модель научилась предсказывать класс MEDIA, что логично, т.к. это самый редкий класс, и, помимо этого, видов MEDIA, т.е. конкретных журналов, радио, газет и т.д. очень много, и наш датасет их явно не покрывает.
Это также заметно по тому, что точность предсказания MEDIA очень хорошая, в то время как реколл очень маленький.

In [17]:
outd = '/kaggle/working/clf_v1'  
model.save_pretrained(outd)
tokenizer.save_pretrained(outd)

('/kaggle/working/clf_v1/tokenizer_config.json',
 '/kaggle/working/clf_v1/special_tokens_map.json',
 '/kaggle/working/clf_v1/vocab.txt',
 '/kaggle/working/clf_v1/added_tokens.json',
 '/kaggle/working/clf_v1/tokenizer.json')

## mlm

In [19]:
class MLMDataset(Dataset):
        def __init__(self, texts, tokenizer, max_length):
            self.texts = texts
            self.tokenizer = tokenizer
            self.max_length = max_length
        
        def __len__(self):
            return len(self.texts)
        
        def __getitem__(self, idx):
            text = self.texts[idx]
            encoding = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding='max_length',
                return_tensors='pt'
            )
            
            input_ids = encoding['input_ids'][0].clone()
            labels = input_ids.clone()
            
            mask_prob = 0.15
            mask_indices = torch.rand(len(input_ids)) < mask_prob
            
            special_tokens = [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]
            mask_indices &= ~torch.isin(input_ids, torch.tensor(special_tokens))
            labels[~mask_indices] = -100
            
            mask_token_id = tokenizer.mask_token_id
            for i, masked in enumerate(mask_indices):
                if masked:
                    if torch.rand(1) < 0.8:
                        input_ids[i] = mask_token_id
                    elif torch.rand(1) < 0.4:
                        input_ids[i] = torch.randint(0, len(tokenizer), (1,))
            
            return {
                'input_ids': input_ids,
                'attention_mask': encoding['attention_mask'][0],
                'labels': labels
            }

В этом датасете задал максимальную длину 128, т.к. если ставить прошлые 256, то кажется, что можно намаскировать \[PAD] токенов, что не очень желательно 

In [13]:
model_name = 'cointegrated/rubert-tiny2'
batch_size = 64
lr = 5e-5
epochs = 25
max_len = 128
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [35]:
def pretrain_mlm(train_data, model_name, device):
    
    texts = [text for text, _ in train_data]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    mlm_model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
    
    mlm_dataset = MLMDataset(texts, tokenizer, max_len)
    dataloader = DataLoader(mlm_dataset, batch_size=batch_size, shuffle=True)

    optimizer = torch.optim.AdamW(mlm_model.parameters(), lr=lr)
    num_training_steps = epochs * len(dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=10,
        num_training_steps=num_training_steps
    )
    
    progress_bar = tqdm(range(num_training_steps))
    mlm_model.train()
    
    for epoch in range(epochs):
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = mlm_model(**batch)
            loss = outputs.loss
            loss.backward()
            
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            progress_bar.set_description(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    return model, tokenizer

In [36]:
mlm_model, mlm_tokenizer = pretrain_mlm(train_data, model_name, device)

  0%|          | 0/325 [00:00<?, ?it/s]

In [37]:
outd = '/kaggle/working/mlm_v1'  
mlm_model.save_pretrained(outd)
mlm_tokenizer.save_pretrained(outd)

('/kaggle/working/mlm_v1/tokenizer_config.json',
 '/kaggle/working/mlm_v1/special_tokens_map.json',
 '/kaggle/working/mlm_v1/vocab.txt',
 '/kaggle/working/mlm_v1/added_tokens.json',
 '/kaggle/working/mlm_v1/tokenizer.json')

In [39]:
def train_model_after_mlm(model, tokenizer, train_data, test_data, device, label2id, id2label):
    
    train_dataset = NERDataset(train_data, tokenizer, label2id, id2label, max_len)
    test_dataset = NERDataset(test_data, tokenizer, label2id, id2label, max_len)
    
    train_dataloader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True
    )
    test_dataloader = DataLoader(
        test_dataset, 
        batch_size=batch_size
    )
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=10,
        num_training_steps=num_training_steps
    )
    
    progress_bar = tqdm(range(num_training_steps))
    model.train()  
    for epoch in range(epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            progress_bar.set_description(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    evaluate_model(model, test_dataloader, device, label2id, id2label)
    return model, tokenizer

In [40]:
mlm_ner_model, mlm_ner_tokenizer = train_model_after_mlm(mlm_model, mlm_tokenizer, train_data, test_data, device, label2id, id2label)

  0%|          | 0/325 [00:00<?, ?it/s]


Classification Report (excluding 'O' class):
              precision    recall  f1-score   support

    GEOPOLIT       0.92      0.87      0.89       470
         LOC       0.89      0.88      0.88       719
       MEDIA       0.91      0.82      0.86       379
         ORG       0.85      0.85      0.85      1686
         PER       0.96      0.99      0.97      3399

   micro avg       0.92      0.92      0.92      6653
   macro avg       0.91      0.88      0.89      6653
weighted avg       0.92      0.92      0.92      6653



По сравнению с моделью без mlm претрейна, у которой обучение ner-задачи начиналось с лосса ~3.0 и закончилось на 0.15, у модели с mlm претрейном в начале обучения лосс был 0.1, что даже ниже чем у предыдущей уже обученной модели. Mlm версия после обученмя достигла кросс-энтропии 0.04, что видно и по F1: macro f1 0.73 -> macro f1 0.89, прирост значительный.

# synth gen

In [16]:
lenta_dataset = load_dataset("IlyaGusev/ru_news")
texts = []
for i, item in tqdm(enumerate(lenta_dataset["train"])):
    if i > 12000: break
    texts.append(item["text"])
print(len(texts))

Loading dataset shards:   0%|          | 0/26 [00:00<?, ?it/s]

0it [00:00, ?it/s]

12001


In [35]:
nlp = spacy.load("ru_core_news_lg")

In [36]:
spacy_to_ner = {
    "PER": "PER",
    "ORG": "ORG",
    "LOC": "LOC",
    "MISC": None
}

MEDIA_KEYWORDS = {"телеканал", "газета", "издание", "журнал", "СМИ", "канал", "радио"}
GEOPOLIT_KEYWORDS = {"Россия", "США", "Китай", "Франция", "Германия", "Великобритания"}

def is_media(text):
    return any(keyword in text.lower() for keyword in MEDIA_KEYWORDS)

def is_geopolit(text):
    return any(keyword.lower() in text.lower() for keyword in GEOPOLIT_KEYWORDS)

In [37]:
def generate_ner_annotations(text):
    doc = nlp(text)
    entities = []
    
    for ent in doc.ents:
        custom_label = spacy_to_ner.get(ent.label_)
        
        if custom_label:
            if ent.label_ == "LOC":
                label = "GEOPOLIT" if is_geopolit(ent.text) else "LOC"
            else:
                label = custom_label
            
            entities.append((ent.start_char, ent.end_char, label))
    
    for match in nlp.tokenizer(text):
        if is_media(match.text):
            start = match.idx
            end = start + len(match.text)
            entities.append((start, end, "MEDIA"))
    
    filtered = []
    for ent in sorted(entities, key=lambda x: x[0]):
        if not filtered or ent[0] >= filtered[-1][1]:
            filtered.append(ent)
    
    return {"entities": filtered}

def generate_synthetic_dataset(input_texts):
    results = []
    
    for text in tqdm(input_texts):
        try:
            annotations = generate_ner_annotations(text)
            results.append((text, annotations))
        except Exception as e:
            print(f"Ошибка при обработке текста: {str(e)}")
            continue
            
    return results

In [38]:
def check_class_distribution(data):
    
    counts = defaultdict(int)
    
    for _, annotations in data:
        for _, _, label in annotations["entities"]:
            counts[label] += 1
    
    print("\nClasses distrib:")
    for label, count in sorted(counts.items()):
        print(f"{label}: {count}")
    
    return counts

In [39]:
data = generate_synthetic_dataset(texts)
check_class_distribution(data)

  0%|          | 0/12001 [00:00<?, ?it/s]


Classes distrib:
GEOPOLIT: 5478
LOC: 80314
MEDIA: 4056
ORG: 73982
PER: 54888


defaultdict(int,
            {'ORG': 73982,
             'LOC': 80314,
             'PER': 54888,
             'MEDIA': 4056,
             'GEOPOLIT': 5478})

In [41]:
combined_data = train_data + data

In [43]:
model_name = 'cointegrated/rubert-tiny2'
batch_size = 64
lr = 5e-5
epochs = 20
max_len = 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [47]:
gen_plus_model, gen_plus_tokenizer = train_model(combined_data, test_data, device, label2id, id2label)

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Classification Report (excluding 'O' class):
              precision    recall  f1-score   support

    GEOPOLIT       0.02      0.06      0.03       710
         LOC       0.01      0.03      0.01       970
       MEDIA       0.02      0.28      0.03       476
         ORG       0.05      0.31      0.09      2671
         PER       0.10      0.08      0.09      5277

   micro avg       0.04      0.14      0.06     10104
   macro avg       0.04      0.15      0.05     10104
weighted avg       0.07      0.14      0.07     10104



  0%|          | 0/4020 [00:00<?, ?it/s]


Classification Report (excluding 'O' class):
              precision    recall  f1-score   support

    GEOPOLIT       0.95      0.52      0.67       710
         LOC       0.71      0.92      0.80       970
       MEDIA       0.70      0.36      0.47       476
         ORG       0.83      0.94      0.88      2671
         PER       0.98      0.99      0.98      5277

   micro avg       0.89      0.91      0.90     10104
   macro avg       0.83      0.74      0.76     10104
weighted avg       0.90      0.91      0.89     10104



Модель получилась хуже, чем версия с mlm претрейном, при том, что значение лосса на трейне сильно меньше, чем у модели с mlm. Т.е. можно с большой уверенностью сказать, что модель переобучилась за счет огромного кол-ва новых данных, полученных с помощью spacy. И для лучшего результата, возможно, стоит сгенерировать меньше данных, чтобы их количество было соизмеримо с нашим датасетом, ну или по-крайней мере делать раннюю остановку исходя из метрик на тестовом сплите.

Однако стоит заметить, что переобученная модель лучше всех справилась с классом PERson.

Для наилучшего результата можно было бы ещё попробовать совместить два подхода сразу - mlm + synth gen.

Обычное обучение, mlm, synth gen f1 macro:
0.73 , 0.89 , 0.76