# Fine-tuning ruBert


In [1]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer

In [2]:
from huggingface_hub import notebook_login

# для сохранения моделей на HF
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Предобработка данных

In [3]:
path = 'C:\\Users\\User\\Desktop\\punct_project\\new_books_prepared.csv'

df = pd.read_csv(path, index_col=0)[['tokens', 'labels']]
df['tokens'] = df.tokens.apply(ast.literal_eval)
df['labels'] = df.labels.apply(ast.literal_eval)

df

Unnamed: 0,tokens,labels
0,"[итак, вы, понимаете, читатель, каким, образом...","[,, o, ,, ,, o, o, o, o, o, o, .]"
1,"[я, ходил, много, и, долго, так, что, уже, сов...","[o, o, o, o, ,, o, o, o, o, ,, o, o, ,, ,, o, ..."
2,"[а, всетаки, моя, ночь, была, лучше, дняот, ка...","[o, o, o, o, o, o, o, o, o, .]"
3,"[я, пришел, назад, в, город, очень, поздно, и,...","[o, o, o, o, o, o, ,, o, o, o, o, ,, o, o, o, ..."
4,"[по, той, стороне, тротуара, недалеко, от, мое...","[o, o, o, ,, o, o, o, ,, o, o, o, o, ,, o, ,, ..."
...,...,...
26469,"[и, разве, астрономы, могли, бы, понять, и, вы...","[o, o, o, o, o, o, o, o, ,, o, o, o, o, o, o, ..."
26470,"[но, она, всетаки, не, рассмотрела, бы, его, л...","[o, o, o, o, o, o, o, ,, o, o, o, ,, o, ,, o, ..."
26471,"[она, понимает, думал, он, она, знает, о, чем,...","[o, ,, o, ,, o, ,, o, o, o, ., o, o, o, ,, o, ..."
26472,"[нет, не, надо, говорить, подумал, он, когда, ...","[,, o, o, ,, o, ,, o, o, o, o, ., o, ,, o, o, ..."


In [4]:
# проверка, что длины токенов и меток совпадают 
df[df.tokens.apply(len) != df.labels.apply(len)]

Unnamed: 0,tokens,labels


In [5]:
# оставляем только не самые длинные тексты
df = df[df.tokens.apply(len) < 200]
df

Unnamed: 0,tokens,labels
0,"[итак, вы, понимаете, читатель, каким, образом...","[,, o, ,, ,, o, o, o, o, o, o, .]"
1,"[я, ходил, много, и, долго, так, что, уже, сов...","[o, o, o, o, ,, o, o, o, o, ,, o, o, ,, ,, o, ..."
2,"[а, всетаки, моя, ночь, была, лучше, дняот, ка...","[o, o, o, o, o, o, o, o, o, .]"
3,"[я, пришел, назад, в, город, очень, поздно, и,...","[o, o, o, o, o, o, ,, o, o, o, o, ,, o, o, o, ..."
4,"[по, той, стороне, тротуара, недалеко, от, мое...","[o, o, o, ,, o, o, o, ,, o, o, o, o, ,, o, ,, ..."
...,...,...
26469,"[и, разве, астрономы, могли, бы, понять, и, вы...","[o, o, o, o, o, o, o, o, ,, o, o, o, o, o, o, ..."
26470,"[но, она, всетаки, не, рассмотрела, бы, его, л...","[o, o, o, o, o, o, o, ,, o, o, o, ,, o, ,, o, ..."
26471,"[она, понимает, думал, он, она, знает, о, чем,...","[o, ,, o, ,, o, ,, o, o, o, ., o, o, o, ,, o, ..."
26472,"[нет, не, надо, говорить, подумал, он, когда, ...","[,, o, o, ,, o, ,, o, o, o, o, ., o, ,, o, o, ..."


In [6]:
def prepare_labels_list(labels_list):
    new_labels = []
    
    for i in labels_list:
        if i == 'o':
            new_labels.append('O')
            
        else:
            new_labels.append('B-' + i)
            
    return new_labels
            

In [7]:
df['labels'] = df.labels.apply(prepare_labels_list)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['labels'] = df.labels.apply(prepare_labels_list)


Unnamed: 0,tokens,labels
0,"[итак, вы, понимаете, читатель, каким, образом...","[B-,, O, B-,, B-,, O, O, O, O, O, O, B-.]"
1,"[я, ходил, много, и, долго, так, что, уже, сов...","[O, O, O, O, B-,, O, O, O, O, B-,, O, O, B-,, ..."
2,"[а, всетаки, моя, ночь, была, лучше, дняот, ка...","[O, O, O, O, O, O, O, O, O, B-.]"
3,"[я, пришел, назад, в, город, очень, поздно, и,...","[O, O, O, O, O, O, B-,, O, O, O, O, B-,, O, O,..."
4,"[по, той, стороне, тротуара, недалеко, от, мое...","[O, O, O, B-,, O, O, O, B-,, O, O, O, O, B-,, ..."
...,...,...
26469,"[и, разве, астрономы, могли, бы, понять, и, вы...","[O, O, O, O, O, O, O, O, B-,, O, O, O, O, O, O..."
26470,"[но, она, всетаки, не, рассмотрела, бы, его, л...","[O, O, O, O, O, O, O, B-,, O, O, O, B-,, O, B-..."
26471,"[она, понимает, думал, он, она, знает, о, чем,...","[O, B-,, O, B-,, O, B-,, O, O, O, B-., O, O, O..."
26472,"[нет, не, надо, говорить, подумал, он, когда, ...","[B-,, O, O, B-,, O, B-,, O, O, O, O, B-., O, B..."


In [8]:
df_train, df_val_test = train_test_split(df, test_size=0.2, random_state=999)
df_val, df_test  = train_test_split(df_val_test, test_size=0.5, random_state=999)

data = DatasetDict({
    'train': Dataset.from_pandas(df_train, preserve_index=False),
    'test': Dataset.from_pandas(df_val, preserve_index=False)
})

data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 20687
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 2586
    })
})

## Токенизация

In [9]:
model_name = 'ai-forever/ruBert-base'

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
label_list = set()
for labels in df_train['labels']:
    label_list |= set(labels)

label_list = sorted(list(label_list))
label_list

['B-!', 'B-,', 'B-.', 'B-...', 'B-:', 'B-?', 'O']

In [12]:
def tokenize_and_align_labels(pair, label_all_tokens=False):
    tokenized_inputs = tokenizer(pair["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(pair['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [13]:
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)
tokenized_datasets['train'][0]['tokens']

Map:   0%|          | 0/20687 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2586 [00:00<?, ? examples/s]

['и', 'в', 'плену', 'у', 'страха', 'мы']

## Модель

In [14]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                        num_labels=len(label_list),
                                                        ignore_mismatched_sizes=True)
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
for param in model.bert.embeddings.parameters():
    param.requires_grad = False

In [16]:
batch_size = 16

args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    include_inputs_for_metrics=True,
    hub_model_id="rubert-base-punctuation",
    save_strategy="epoch",
    push_to_hub=True
)

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [18]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [19]:
import numpy as np

def compute_metrics(p):
    predictions, labels, inputs = p.predictions, p.label_ids, p.inputs
    predictions = np.argmax(p.predictions, axis=2)

    # send only the first token of each word to the evaluation
    true_predictions = []
    true_labels = []
    for prediction, label, tokens in zip(predictions, labels, inputs):
        true_predictions.append([])
        true_labels.append([])
        for (p, l, t) in zip(prediction, label, tokens):
            if l != -100 and not tokenizer.convert_ids_to_tokens(int(t)).startswith('##'):
                true_predictions[-1].append(label_list[p])
                true_labels[-1].append(label_list[l])

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=1)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [20]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
C:\Users\User\Desktop\punct_project\ner is already a clone of https://huggingface.co/markusiko/rubert-base-punctuation. Make sure you pull the latest changes with `repo.git_pull()`.


In [21]:
import logging
from transformers.trainer import logger as noisy_logger
noisy_logger.setLevel(logging.WARNING)

In [22]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1901,0.159186,0.823761,0.823122,0.823441,0.946397
2,0.1438,0.151183,0.843553,0.823691,0.833504,0.950099


TrainOutput(global_step=2586, training_loss=0.18713807875300598, metrics={'train_runtime': 862.1676, 'train_samples_per_second': 47.988, 'train_steps_per_second': 2.999, 'total_flos': 3437658653845614.0, 'train_loss': 0.18713807875300598, 'epoch': 2.0})

In [32]:
# результиаты уже сохранены в hf hub
# model.save_pretrained('base_rubert.bin')
# tokenizer.save_pretrained('base_rubert.bin')

## Инференс

In [34]:
from transformers import pipeline

punct_corrector = pipeline("token-classification", model="markusiko/rubert-base-punctuation")

In [103]:
punct_corrector('Варенька бросилась навстречу своей старой приятельнице, княгиня поцеловала ее', aggregation_strategy='simple')

[{'entity_group': ',',
  'score': 0.6137649,
  'word': 'приятель',
  'start': 42,
  'end': 50},
 {'entity_group': ',',
  'score': 0.63064533,
  'word': '##нице',
  'start': 50,
  'end': 54},
 {'entity_group': '.',
  'score': 0.9959116,
  'word': 'ее',
  'start': 75,
  'end': 77}]

In [82]:
def get_corrected_sentence(text):
    new_text = ''
    tokens_predicted = punct_corrector(text, aggregation_strategy='simple')
    current_idx = 0
    for i in range(len(tokens_predicted)):
        if (i != len(tokens_predicted) - 1):
            if(tokens_predicted[i+1]['word'].startswith('##')):
                continue
        new_text += text[current_idx:tokens_predicted[i]['end']] + tokens_predicted[i]['entity_group']
        current_idx = tokens_predicted[i]['end']
        
    return new_text

In [83]:
get_corrected_sentence('что я сделал чтобы быть с ней')

'что я сделал, чтобы быть с ней?'

In [84]:
get_corrected_sentence('красивый сильный жеребец ушел от нас')

'красивый, сильный жеребец ушел от нас.'

In [87]:
get_corrected_sentence('Варенька бросилась навстречу своей старой приятельнице а княгиня поцеловала ее')

'Варенька бросилась навстречу своей старой приятельнице, а княгиня поцеловала ее.'

## Метрика на тесте

In [98]:
df_test['raw_sentence'] = df_test.tokens.apply(lambda x: ' '.join(x))

In [137]:
def prepare_pred(text):
    prediction = get_corrected_sentence(text)
    tokens = [token for token in prediction.split(' ') if token != '']
    labels = []
    
    for token in tokens:
        if (len(token) > 3) & (token[-3:] == '...'):
            labels.append('...')
        elif token[-1] in punctuation_signs:
            labels.append(token[-1])
        else:
            labels.append('o')
    
    return prepare_labels_list(labels)

prepare_pred('привет как у тебя дела')

['B-,', 'O', 'O', 'O', 'B-?']

In [100]:
from tqdm import tqdm 
tqdm.pandas()

In [139]:
import re

df_test['raw_sentence'] = df_test.tokens.apply(lambda x: re.sub(' +', ' ' , ' '.join(x)))
df_test['preds'] = df_test.raw_sentence.progress_apply(prepare_pred)

df_test

100%|██████████| 2586/2586 [05:24<00:00,  7.98it/s]


Unnamed: 0,tokens,labels,raw_sentence,preds
1341,"[все, это, смог, я, различить, лишь, смутно, и...","[O, O, O, O, O, O, O, O, O, B-., O, O, O, O, O...",все это смог я различить лишь смутно и с трудо...,"[O, O, O, O, O, O, O, O, O, B-,, O, O, O, O, O..."
25027,"[она, поехала, в, игрушечную, лавку, накупила,...","[O, O, O, O, B-,, O, O, O, O, O, B-., O, O, O,...",она поехала в игрушечную лавку накупила игруше...,"[O, O, O, O, B-,, O, O, O, O, O, B-., O, O, O,..."
2585,"[наконец, настало, утро, четырнадцатого, числа...","[O, O, O, O, B-., O, O, O, O, O, O, O, B-,, O,...",наконец настало утро четырнадцатого числа пого...,"[O, O, O, O, B-., O, O, O, O, O, O, O, B-., O,..."
16829,"[хорошо, а, почему, прежде, бывало, с, восьми,...","[B-., O, O, B-,, B-,, O, O, O, O, O, O, O, B-,...",хорошо а почему прежде бывало с восьми часов в...,"[B-,, O, O, O, O, O, O, O, O, O, O, O, B-,, O,..."
7937,"[говоря, это, графиня, оглянулась, на, дочь, н...","[O, B-,, O, O, O, B-., O, B-,, O, O, O, O, O, ...",говоря это графиня оглянулась на дочь наташа л...,"[O, B-,, O, O, O, B-., O, O, O, O, B-,, O, O, ..."
...,...,...,...,...
13908,"[разве, на, одну, секунду, я, пришел, за, сове...","[O, O, O, B-..., O, O, O, B-., B-,, B-,, O, O,...",разве на одну секунду я пришел за советом я ко...,"[O, O, O, O, O, O, O, B-?, B-,, B-,, O, O, O, ..."
21490,"[план, был, очень, хорош, но, дело, заключалос...","[O, O, O, B-,, O, O, O, O, B-,, O, O, O, O, O,...",план был очень хорош но дело заключалось в том...,"[O, O, O, B-,, O, O, O, O, B-,, O, O, O, O, O,..."
2567,"[сохраняя, поелику, возможно, равновесие, чтоб...","[B-,, O, B-,, B-,, O, O, O, B-,, O, O, B-,, O,...",сохраняя поелику возможно равновесие чтобы хор...,"[O, O, O, B-,, O, O, O, B-,, O, O, B-,, O, O, ..."
25405,"[было, ли, в, лице, левина, чтонибудь, особенн...","[O, O, O, O, O, O, B-,, O, O, O, B-,, O, O, O,...",было ли в лице левина чтонибудь особенное или ...,"[O, O, O, O, O, O, B-,, O, O, O, B-,, O, O, O,..."


In [145]:
all_labels = []

for i in df_test.values:
    for j in range(len(i[0])):
        if i[0][j] != '':
            all_labels.append(i[1][j])

len(all_labels)

97234

In [146]:
all_preds = []

for row in df_test.values:
    all_preds += row[3]

In [147]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_pred = le.fit_transform(all_preds)
y_true = le.transform(all_labels)

y_true

array([6, 6, 6, ..., 6, 6, 2])

In [154]:
from collections import Counter
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score,\
                            f1_score, precision_score, recall_score, average_precision_score

def calc_metrics_no_proba(y_true, y_pred):
    print('Доля пробелов:', np.mean(np.array(y_true) == 6))
    print('Accuracy:', accuracy_score(y_true, y_pred))
#     print('Top-2 Accuracy:', top_k_accuracy_score(y_true, y_pred_proba, k=2))
#     rint('ROC-AUC (OVR):',roc_auc_score(y_true, y_pred_proba, multi_class='ovr'))
#     print('AUC-PR:',average_precision_score(y_true, y_pred_proba, average='weighted'))
    
    metrics = []
    metrics.append(list(dict(sorted(Counter(y_true).items())).values()))
    metrics.append(f1_score(y_true, y_pred, average=None))
    metrics.append(precision_score(y_true, y_pred, average=None, zero_division=0))
    metrics.append(recall_score(y_true, y_pred, average=None, zero_division=0))
#     metrics.append(roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average=None))
#     metrics.append(average_precision_score(y_true, y_pred_proba, average=None))
    metrics_index = ['Count', 'F1-Score', 'Precision', 'Recall']
#                      'ROC-AUC', 'AUC-PR']
    df_metrics = pd.DataFrame(metrics, columns=le.classes_, index=metrics_index)
    
    return df_metrics


In [155]:
calc_metrics_no_proba(y_true, y_pred)

Доля пробелов: 0.7972005677026555
Accuracy: 0.9494209844293149


Unnamed: 0,B-!,"B-,",B-.,B-...,B-:,B-?,O
Count,213.0,13004.0,5814.0,164.0,214.0,310.0,77515.0
F1-Score,0.308219,0.85095,0.830135,0.0,0.009302,0.714286,0.979373
Precision,0.56962,0.85662,0.809446,0.0,1.0,0.736301,0.976896
Recall,0.211268,0.845355,0.851909,0.0,0.004673,0.693548,0.981862
