In [None]:
!pip install transformers seqeval[gpu]



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

Проверим доступность GPU:

In [None]:
# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
# Show which device was picked.
print(device)

cuda


### Загрузка и предобработка данных

Мы будем использовать NER датасет с [Kaggle](https://www.kaggle.com/namanj27/ner-dataset). Данные размечены в IOB-формате.

In [None]:
# Read the tab-separated training file into a DataFrame.
path = "train_data_sent.csv"
data = pd.read_csv(path, delimiter='\t')
# Preview the first 30 rows.
data.head(30)

Unnamed: 0,ID,token,tag
0,0,Book,O
1,1,I,O
2,2,I.,O
3,3,On,O
4,4,a,O
5,5,January,O
6,6,evening,O
7,7,of,O
8,8,the,O
9,9,early,O


In [None]:
# Give every token row a sentence id. A sentence ends at a '.' token, so the id of
# a row is the number of '.' tokens seen strictly before it (the closing '.' still
# belongs to the sentence it terminates).
# The whole column is computed in one vectorized pass and assigned at once; the
# previous per-row chained assignment `data['Sentence'][i] = j` raised
# SettingWithCopyWarning and does not modify the frame under pandas copy-on-write.
data['Sentence'] = (data['token'] == '.').cumsum().shift(fill_value=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sentence'][i] = j


In [None]:
data.head()

Unnamed: 0,ID,token,tag,Sentence
0,0,Book,O,0
1,1,I,O,0
2,2,I.,O,0
3,3,On,O,0
4,4,a,O,0


In [None]:
# Distribution of NER tags
# Number of distinct tags present in the `tag` column.
print("Number of tags: {}".format(len(data.tag.unique())))
# Row count per tag.
frequencies = data.tag.value_counts()
frequencies

Number of tags: 7


O        158321
I-PER      9924
B-PER      6875
I-LOC      2019
B-LOC      1613
I-ORG       213
B-ORG        98
Name: tag, dtype: int64

Обучающий пример в задаче NER - это не токен, а целое предложение, так как для решения задачи нам необходим контекст.

Далее соберем данные в предложения.

In [None]:
# Sentence id of every token row; used as the grouping key for both new columns.
sentence_key = data['Sentence']
# "sentence": each row carries the space-joined tokens of the sentence it belongs to.
data['sentence'] = data['token'].groupby(sentence_key).transform(' '.join)
# "word_labels": each row carries the comma-joined tags of the sentence it belongs to.
data['word_labels'] = data['tag'].groupby(sentence_key).transform(','.join)
data.head()

Unnamed: 0,ID,token,tag,Sentence,sentence,word_labels
0,0,Book,O,0,Book I I. On a January evening of the early se...,"O,O,O,O,O,O,O,O,O,O,O,O,B-PER,I-PER,O,O,O,O,O,..."
1,1,I,O,0,Book I I. On a January evening of the early se...,"O,O,O,O,O,O,O,O,O,O,O,O,B-PER,I-PER,O,O,O,O,O,..."
2,2,I.,O,0,Book I I. On a January evening of the early se...,"O,O,O,O,O,O,O,O,O,O,O,O,B-PER,I-PER,O,O,O,O,O,..."
3,3,On,O,0,Book I I. On a January evening of the early se...,"O,O,O,O,O,O,O,O,O,O,O,O,B-PER,I-PER,O,O,O,O,O,..."
4,4,a,O,0,Book I I. On a January evening of the early se...,"O,O,O,O,O,O,O,O,O,O,O,O,B-PER,I-PER,O,O,O,O,O,..."


Создадим словари для перевода тегов в id и обратно.

In [None]:
# Distinct tags as returned by `unique()`; a tag's position in this array is its id.
unique_tags = data.tag.unique()
# Forward (tag -> id) and reverse (id -> tag) lookups built from the same ordering.
label2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2label = dict(enumerate(unique_tags))
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-LOC': 3,
 'I-LOC': 4,
 'B-ORG': 5,
 'I-ORG': 6}

Оставим в данных только колонки `sentence` и `word_labels`, а также уберем дубликаты:

In [None]:
# Keep only the two sentence-level columns, drop repeated rows and renumber the index from 0.
# Every token row of a sentence carries the same (sentence, word_labels) pair, so this
# leaves one row per distinct pair. NOTE: sentences that occur more than once with
# identical text and tags are collapsed into a single row as well.
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Book I I. On a January evening of the early se...,"O,O,O,O,O,O,O,O,O,O,O,O,B-PER,I-PER,O,O,O,O,O,..."
1,Though there was already talk of the erection ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,Conservatives cherished it for being small and...,"B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-LO..."
3,It was Madame Nilsson 's first appearance that...,"O,O,B-PER,O,O,O,O,O,O,O,O,O,B-PER,I-PER,I-PER,..."
4,""" To come to the Opera in a Brown coupe was al...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


### Подготовка датасета и даталоадеров

Зададим гиперпараметры модели:

In [None]:
# Hyperparameters and tokenizer shared by the dataset, loader and training cells.
MAX_LEN = 190 # maximum sentence length in the test data
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 7
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
# WordPiece tokenizer for the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Важный момент: BERT обучается на кусочках слов, используя **wordpiece tokenization**. Поэтому нам также нужно разбить слова на кусочки, используя эту токенизацию, и продублировать теги:

Пример: "Washington" -> "b-gpe"

Токены:  "Wash", "##ing", "##ton" -> "b-gpe", "b-gpe", "b-gpe".






In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Split a sentence into word pieces while keeping one label per piece.

    The sentence is tokenized one word at a time so that the word-level
    label can simply be repeated for every sub-token the word is broken
    into. This is slower than tokenizing the whole sentence at once, but it
    keeps tokens and labels aligned.

    Args:
        sentence: words of one sentence separated by whitespace.
        text_labels: comma-separated labels, one per word.
        tokenizer: object exposing ``tokenize(word)`` returning a list of
            sub-tokens.

    Returns:
        ``(tokenized_sentence, labels)`` - two lists of equal length.
        Words and labels are paired with ``zip``, so surplus items on the
        longer side are ignored.
    """
    words = sentence.strip().split()
    word_labels = text_labels.split(",")

    tokenized_sentence = []
    labels = []
    for word, label in zip(words, word_labels):
        pieces = tokenizer.tokenize(word)
        tokenized_sentence += pieces
        # one copy of the word-level label for every word piece
        labels += [label for _ in pieces]

    return tokenized_sentence, labels

Такой способ расстановки тегов - это один из возможных вариантов. Также можно присваивать истинную метку только первому токену в сущности и обучать на такой разметке BERT. А можно проставлять все метки, кроме первой, равными `X` - так тоже работает!

In [None]:
class dataset(Dataset):
    """Labelled token-classification dataset producing fixed-length inputs.

    Each item is one sentence: it is tokenized word by word (labels are
    repeated for every word piece), wrapped in [CLS]/[SEP], padded with
    [PAD] up to ``max_len`` and converted to tensors.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe: object with ``sentence`` and ``word_labels`` columns,
                indexed so that ``column[index]`` returns the row's value.
            tokenizer: tokenizer exposing ``tokenize`` and
                ``convert_tokens_to_ids``.
            max_len: length every returned sequence is truncated/padded to.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return a dict with ``ids``, ``mask`` and ``targets`` long tensors of length ``max_len``."""
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: truncate the sentence body first, leaving room for the two
        # special tokens, so that an over-long sentence still ends with [SEP]
        maxlen = self.max_len
        room = max(maxlen - 2, 0)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence[:room] + ["[SEP]"]
        # "O" for [CLS] goes in front and "O" for [SEP] is appended at the very end.
        # (`insert(-1, ...)` would place it *before* the last label and shift the
        # last word's label onto [SEP].)
        labels = ["O"] + labels[:room] + ["O"]
        # only relevant for max_len < 2: never return more than max_len positions
        tokenized_sentence = tokenized_sentence[:maxlen]
        labels = labels[:maxlen]

        # step 3: pad up to max_len; padded positions get the "O" label
        n_pad = maxlen - len(tokenized_sentence)
        tokenized_sentence = tokenized_sentence + ['[PAD]'] * n_pad
        labels = labels + ["O"] * n_pad

        # step 4: obtain the attention mask (0 on padding, 1 elsewhere)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens and labels to ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Number of sentences in the underlying frame."""
        return self.len

Создадим тренировочный и тестовый датасеты в пропорции 80:20:

In [None]:
# Fraction of sentences sampled into the training split.
train_size = 0.8
# Alternative setting: use every sentence for training (the test split is then empty).
#train_size = 1
# Sample the training rows with a fixed seed; the rows that were not sampled form the test split.
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
# Renumber from 0 so the dataset class can address rows by position-like labels.
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits in the torch Dataset defined above.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (6419, 2)
TRAIN Dataset: (6419, 2)


Посмотрим на данные

In [None]:
training_set[0]

{'ids': tensor([  101,  1000,  1000,  1045,  1005,  1049,  2025,   999,  1998,  2065,
          3810,  2039,  2026,  2606,  3084,  2033,  2028,  1010,  1045,  1005,
          2222,  4929,  2009,  1999,  2048, 17448,  6229,  1045,  1005,  1049,
          3174,  1010,  1000,  6639,  8183,  1010,  4815,  2125,  2014,  5658,
          1010,  1998,  5513,  2091,  1037, 15655, 23055,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

Создадим даталоадеры

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation gains nothing from shuffling: a fixed order keeps the validation
# output reproducible between runs and avoids building a random sampler over a
# test split that may be empty (e.g. when train_size is set to 1).
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### Объявление модели

Загрузим предобученную модель **BertForTokenClassification**.

In [None]:
# Load the pretrained 'bert-base-uncased' checkpoint with a token-classification head
# sized to our tag set; the id <-> label mappings are passed along to the model config.
model = BertForTokenClassification.from_pretrained('bert-base-uncased',
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
# Move the model to the selected device.
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

### Обучение (дообучение) модели

Перед началом обучения проведем sanity check. Loss до начала обучения должен быть равен примерно -ln(1/number of classes) = -ln(1/7) ≈ 1.95.

Так как в начале обучения веса случайные (из равномерного распределения), это значит, что вероятность угадывания равна 1/7. Тогда лосс будет равен -ln(1/7).

Проверим это:


In [None]:
# Sanity check: one forward pass on the first training example before any training.
# unsqueeze(0) adds a leading batch dimension of size 1.
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
# Inputs must live on the same device as the model.
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
# Passing `labels` makes the model return a loss along with the logits.
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
# First element of the output is the loss; the logits are read from outputs[1] in the next cell.
initial_loss = outputs[0]
initial_loss

tensor(1.8973, device='cuda:0', grad_fn=<NllLossBackward0>)

Все хорошо.

Проверим размер тензора:

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 190, 7])

Зададим оптимизатор

In [None]:
# Adam over all model parameters (encoder and classification head) with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

Создадим функцию для обучения модели

In [None]:
# Training function: one pass over `training_loader`, fine-tuning the BERT model
def train(epoch):
    """Run one training epoch and print the running and final loss/accuracy.

    Uses the notebook-level ``model``, ``training_loader``, ``optimizer``,
    ``device`` and ``MAX_GRAD_NORM``.

    Args:
        epoch: index of the current epoch (not used inside the function;
            kept so existing calls keep working).
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy (bookkeeping only, so no gradients are needed)
        with torch.no_grad():
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)
            # use the attention mask to compare only non-padding positions
            # (this still includes the [CLS] and [SEP] positions)
            active_positions = mask.view(-1) == 1
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping must run after backward() and before step(): clipping
        # earlier only touches the previous step's gradients, which zero_grad()
        # discards anyway
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        optimizer.step()

    if nb_tr_steps == 0:
        # empty loader: nothing was trained and the averages are undefined
        print("No training batches were produced; nothing to train on.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

Запустим обучение

In [None]:
# Fine-tune for EPOCHS passes over the training data; epochs are reported 1-based.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 1.8927136659622192
Training loss per 100 training steps: 0.28372410830516037
Training loss per 100 training steps: 0.18971036988037143
Training loss per 100 training steps: 0.15077868881527073
Training loss per 100 training steps: 0.12898033153860275
Training loss per 100 training steps: 0.1130970145771819
Training loss per 100 training steps: 0.10204122713674015
Training loss per 100 training steps: 0.09307292378364734
Training loss per 100 training steps: 0.08624473379020313
Training loss per 100 training steps: 0.0799377691894811
Training loss per 100 training steps: 0.07542007301455377
Training loss per 100 training steps: 0.07114435698074474
Training loss per 100 training steps: 0.06760117552218164
Training loss per 100 training steps: 0.06447759863173698
Training loss per 100 training steps: 0.061955262253851666
Training loss per 100 training steps: 0.05981693092503583
Training loss per 100 training steps: 0.0578496737537924

### Применение и оценка качества модели

Будем оценивать качество модели на отложенных данных

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and print loss and accuracy.

    Args:
        model: token-classification model returning ``loss`` and ``logits``.
        testing_loader: iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        ``(labels, predictions)``: two flat lists of tag strings covering
        every non-padding position of every batch, in loader order. Both
        lists are empty when the loader yields no batches.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)
            # use the attention mask to compare only non-padding positions
            # (this still includes the [CLS] and [SEP] positions)
            active_positions = mask.view(-1) == 1
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

            # one bulk transfer per batch instead of one `.item()` call per token
            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

    if nb_eval_steps == 0:
        # an empty test split (e.g. train_size = 1) would otherwise divide by zero below
        print("Validation set is empty; nothing to evaluate.")
        return [], []

    labels = [id2label[label_id] for label_id in eval_labels]
    predictions = [id2label[pred_id] for pred_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

### Протестируем модель

In [None]:
labels, predictions = valid(model, testing_loader)

In [None]:
from seqeval.metrics import classification_report

# seqeval takes a list of tag sequences; all evaluated positions are passed here as one
# single sequence (the flat lists returned by `valid`, each wrapped in an outer list).
print(classification_report([labels], [predictions]))

### Функция получения предсказаний

In [None]:
def predict(model, test_loader):
    """Predict a tag for every non-padding token produced by ``test_loader``.

    Args:
        model: token-classification model returning ``logits``.
        test_loader: iterable of batches with ``ids`` and ``mask`` tensors.

    Returns:
        Flat list of tag strings, one per position whose attention mask is 1
        (this includes the [CLS] and [SEP] positions), in loader order.
        The list is empty when the loader yields no batches.
    """
    model.eval()

    eval_preds = []

    with torch.no_grad():
        for batch in test_loader:

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask)

            active_logits = outputs.logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)

            # keep only positions that are not padding
            active_positions = mask.view(-1) == 1
            batch_predictions = torch.masked_select(flattened_predictions, active_positions)

            eval_preds.extend(batch_predictions.tolist())

    # convert ids to tag strings once, after all batches were collected
    # (doing it inside the loop re-converts every earlier prediction on each batch)
    return [id2label[pred_id] for pred_id in eval_preds]


### Обработка данных для предсказания

Класс для создания датасета из тестовых данных

In [None]:
class dataset_(Dataset):
    """Unlabelled counterpart of ``dataset`` used for inference.

    Each item is one sentence, tokenized as a whole, wrapped in [CLS]/[SEP],
    padded with [PAD] up to ``max_len`` and converted to tensors.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe: object with a ``sentence`` column, indexed so that
                ``sentence[index]`` returns the row's text.
            tokenizer: tokenizer exposing ``tokenize`` and
                ``convert_tokens_to_ids``.
            max_len: length every returned sequence is truncated/padded to.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return a dict with ``ids`` and ``mask`` long tensors of length ``max_len``."""
        # step 1: tokenize with the tokenizer given to this instance
        # (not whatever global `tokenizer` happens to exist in the notebook)
        sentence = self.data.sentence[index]
        tokenized_sentence = self.tokenizer.tokenize(sentence)

        # step 2: truncate the sentence body first, leaving room for the two
        # special tokens, so that an over-long sentence still ends with [SEP]
        maxlen = self.max_len
        room = max(maxlen - 2, 0)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence[:room] + ["[SEP]"]
        # only relevant for max_len < 2: never return more than max_len positions
        tokenized_sentence = tokenized_sentence[:maxlen]

        # step 3: pad up to max_len
        n_pad = maxlen - len(tokenized_sentence)
        tokenized_sentence = tokenized_sentence + ['[PAD]'] * n_pad

        # step 4: obtain the attention mask (0 on padding, 1 elsewhere)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long)
        }

    def __len__(self):
        """Number of sentences in the underlying frame."""
        return self.len

In [None]:
# Read the tab-separated, unlabelled test file.
data_test = pd.read_csv('test_data_no_labels_sent.csv', delimiter='\t')
# Independent copy of the raw token table.
data_help = data_test.copy()

In [None]:
# Give every token row a sentence id: the number of '.' tokens seen strictly before
# the row, so the closing '.' still belongs to the sentence it terminates.
# Computed in one vectorized pass and assigned as a whole column; the previous
# per-row chained assignment `data_test['Sentence'][i] = j` raised
# SettingWithCopyWarning and does not modify the frame under pandas copy-on-write.
data_test['Sentence'] = (data_test['token'] == '.').cumsum().shift(fill_value=0)

# let's create a new column called "sentence" which groups the words by sentence
data_test['sentence'] = data_test[['Sentence','token']].groupby(['Sentence'])['token'].transform(lambda x: ' '.join(x))

# One row per distinct sentence text.
# NOTE(review): drop_duplicates also collapses sentences that legitimately occur more
# than once in the text, so fewer sentences reach the model than the token table
# contains. This is presumably the cause of the prediction gap that is patched by
# hand further down - confirm before changing either place.
data_test = data_test[["sentence"]].drop_duplicates().reset_index(drop=True)
data_test.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['Sentence'][i] = j


Unnamed: 0,sentence
1087,"It was a happy circumstance , and animated Mr...."
1088,"Mr. Knightley had a cheerful manner , which al..."
1089,"When this was over , Mr. Woodhouse gratefully ..."
1090,I am afraid you must have had a shocking walk .
1091,” “


In [None]:
# Wrap the unlabelled test sentences in the inference dataset.
final_test_set = dataset_(data_test, tokenizer, MAX_LEN)

# No shuffling: predictions are later consumed in order when they are mapped back to words.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

final_test_loader = DataLoader(final_test_set, **test_params)

In [None]:
preds = predict(model, final_test_loader)

Вспомогательные данные для того, чтобы сопоставить предсказанные теги токенов обратно со словами

In [None]:
# Reload the raw token table (data_test was replaced by the sentence-level frame above).
data_test = pd.read_csv('test_data_no_labels_sent.csv', delimiter='\t')
data_help = data_test.copy()
# Previous token of every row; the first row is treated as if it followed a '.'.
data_help['lag'] = data_help['token'].shift(fill_value = '.')

# Number of predictions that belong to each word:
#   * the word pieces the tokenizer splits the word into,
#   * +1 for [CLS] when the word opens a sentence (previous token is '.'),
#   * +1 for [SEP] when the word closes a sentence (the token itself is '.').
# Built as whole columns instead of per-row chained assignment
# (`data_help['count_tokens'][i] = ...`), which raises SettingWithCopyWarning and
# does not modify the frame under pandas copy-on-write.
subword_counts = data_help['token'].map(lambda token: len(tokenizer.tokenize(token)))
opens_sentence = data_help['lag'] == '.'
closes_sentence = data_help['token'] == '.'
data_help['count_tokens'] = subword_counts + opens_sentence.astype(int) + closes_sentence.astype(int)

# The final sentence still receives a [SEP] even when the text does not end with '.'.
# This replaces the hard-coded row label 31468, which per the sentence preview above
# appears to be the last row of this file - confirm.
if len(data_help) > 0 and not closes_sentence.iloc[-1]:
    data_help.loc[data_help.index[-1], 'count_tokens'] += 1
data_help

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_help['count_tokens'][i] = len(tokenizer.tokenize(data_help['token'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_help['count_tokens'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_help['count_tokens'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_help['count_token

Unnamed: 0,ID,token,lag,count_tokens
0,0,So,.,2
1,1,had,So,1
2,2,his,had,1
3,3,way,his,1
4,4,of,way,1
...,...,...,...,...
31464,31464,shocking,a,1
31465,31465,walk,shocking,1
31466,31466,.,walk,2
31467,31467,”,.,2


Костыль. По какой-то причине модель не выдала предсказания для нескольких слов посередине всех данных. Нашел несостыковку вручную, нет времени разбираться

In [None]:
# Manual patch kept from the original analysis: five 'O' predictions are inserted at
# position 19141, where predictions were found to be missing.
preds_ = preds[:19141] + ['O', 'O', 'O', 'O', 'O'] + preds[19141:]

# Hand every word its run of consecutive predictions. A moving cursor is used instead
# of re-slicing the remaining list on every row (quadratic), and the column is
# assigned once instead of through chained indexing, which raises
# SettingWithCopyWarning and does not modify the frame under pandas copy-on-write.
tags_per_word = []
cursor = 0
for n_tokens in data_help['count_tokens']:
    tags_per_word.append(preds_[cursor:cursor + n_tokens])
    cursor += n_tokens
data_help['tag'] = pd.Series(tags_per_word, index=data_help.index, dtype=object)
data_help

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_help['tag'][i] = preds_[:data_help['count_tokens'][i]]


Unnamed: 0,ID,token,lag,count_tokens,tag
0,0,So,.,2,"[O, O]"
1,1,had,So,1,[O]
2,2,his,had,1,[O]
3,3,way,his,1,[O]
4,4,of,way,1,[O]
...,...,...,...,...,...
31464,31464,shocking,a,1,[O]
31465,31465,walk,shocking,1,[O]
31466,31466,.,walk,2,"[O, O]"
31467,31467,”,.,2,"[O, O]"


Сопоставляем тег из массива тегов для слова - этому слову

In [None]:
# Reduce every word's list of predictions to a single tag.
# A word that opens a sentence has the [CLS] prediction in front of its own, so its
# tag is the second entry; every other word takes the first entry.
# "Opens a sentence" means the previous token is '.', and the very first row counts
# as well (previously row 0 received the [CLS] prediction). Deriving the flag from
# the row itself also removes the `i + 1` lookup, which fails when the last token
# is '.'.
opens_sentence = (data_help['token'].shift(fill_value='.') == '.').tolist()
data_help['tag'] = [
    word_tags[1] if is_first else word_tags[0]
    for word_tags, is_first in zip(data_help['tag'], opens_sentence)
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_help['tag'][i] = data_help['tag'][i][0]


In [None]:
# Save the final predictions to a file
# Name the index 'ID' so it is written as the ID column of the CSV.
data_help.index.rename('ID', inplace=True)
# Only the index and the `tag` column are written.
data_help['tag'].to_csv('submit_good_preds10.csv')