# Fine-tuning disitilbert


## install dependencies and imports

In [None]:
!pip install -q datasets evaluate transformers[sentencepiece]
!pip install -q accelerate
!apt install -q git-lfs

!gdown 1tMflGHMM5AcKnpcpt_6JdTGQcAaOzMYF
!unzip /content/nlp_test_task_2023.zip

In [35]:
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering


import evaluate
import collections
from datasets import load_dataset


import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator
from transformers import get_scheduler
from accelerate import Accelerator
from torch.optim import AdamW


from huggingface_hub import notebook_login
from huggingface_hub import Repository, get_full_repo_name


import json
from tqdm.auto import tqdm


notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
SEED = 0
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

## prepare dataset

In [None]:
PATH = '/content/nlp_test_task_2023/dataset'
dataset = load_dataset('json', data_files=PATH + '/train.json')['train']
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=SEED)

In [12]:
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["label"]]
    inputs = tokenizer(
        questions,
        examples["text"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = [{'text' : i['text'],
                'answer_start' : i['answer_start']} for i in examples['extracted_part']]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [13]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["label"]]
    inputs = tokenizer(
        questions,
        examples["text"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [14]:
def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[str(feature["example_id"])].append(idx)

    predicted_answers = []
    for example in examples:
        example_id = str(example["id"])
        context = example["text"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": str(ex["id"]), 
                            "answers": {'text' : ex['extracted_part']['text'], 
                                        'answer_start' : ex['extracted_part']['answer_start']}} for ex in examples]

    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
n_best = 100
# Максимальная длинна ответа чуть больше 100
max_answer_length = 120 
metric = evaluate.load("squad")

In [None]:
model_checkpoint = "AndrewChar/model-QA-5-epoch-RU"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Пробовал разные варианты max_len и stride
# эти значения самые оптимальные
max_length = 512
stride = 128


train_dataset = dataset['train'].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset['train'].column_names,
)


validation_dataset =  dataset['test'].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns= dataset['test'].column_names,
)


train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

In [16]:
print('MAX_LEN =', max_length)
print('STRIDE =', stride)
print('')
print('Размер train:', len(dataset['train']))
print('Размер train после предобработки:', len(train_dataset))
print('')
print('Размер test:', len(dataset['test']))
print('Размер test после предобработки:', len(validation_dataset))

MAX_LEN = 512
STRIDE = 128

Размер train: 1439
Размер train после предобработки: 2940

Размер test: 360
Размер test после предобработки: 744


In [17]:
batch_size = 16

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
)


eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=batch_size
)

## train_model

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint, from_tf=True)
# Пробовал разные значение lr. Текущий самый оптимальный. 
optimizer = AdamW(model.parameters(), lr=1e-5)

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
print('Кол-во эпох:', num_train_epochs)
print('Кол-во шагов на эпоху:', num_update_steps_per_epoch)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

Кол-во эпох: 10
Кол-во шагов на эпоху: 184


In [None]:
model_name = "distilbert_finetuned"
repo_name = get_full_repo_name(model_name)

output_dir = "distilbert_finetuned"
repo = Repository(output_dir, clone_from=repo_name)

Обучал модель вплоть до 40 эпох. Получить прироста метрики не удавалось.

In [None]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


    # Evaluation
    model.eval()
    start_logits = []
    end_logits = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]

    metrics = compute_metrics(
        start_logits, end_logits, validation_dataset, dataset["test"]
    )
    print(f"epoch {epoch}, metrics on eval: exact_match = {round(metrics['exact_match'], 2)} f1 = {round(metrics['f1'], 2)}")
    
    #Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}, exact_match={round(metrics['exact_match'], 2)}", blocking=False
        )

  0%|          | 0/1840 [00:00<?, ?it/s]

epoch 0, metrics on eval: exact_match = 62.22 f1 = 77.65
epoch 1, metrics on eval: exact_match = 66.94 f1 = 79.34
epoch 2, metrics on eval: exact_match = 70.83 f1 = 81.32
epoch 3, metrics on eval: exact_match = 71.11 f1 = 81.07
epoch 4, metrics on eval: exact_match = 72.22 f1 = 81.53
epoch 5, metrics on eval: exact_match = 72.5 f1 = 81.39
epoch 6, metrics on eval: exact_match = 73.06 f1 = 81.78
epoch 7, metrics on eval: exact_match = 73.33 f1 = 81.67
epoch 8, metrics on eval: exact_match = 73.06 f1 = 81.54
epoch 9, metrics on eval: exact_match = 72.78 f1 = 81.55


In [None]:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

# Eval


Посмотрим на ошибочные предсказания модели

In [None]:
tokenizer = AutoTokenizer.from_pretrained("GeorgeKhlestov/distilbert_finetuned")
model = AutoModelForQuestionAnswering.from_pretrained("GeorgeKhlestov/distilbert_finetuned")

In [18]:
args = TrainingArguments(
    "GeorgeKhlestov/distilbert_finetuned",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
)

In [19]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 100, max_answer_length = 120):
    """ Функция, которая преобразует предсказания модели в текст
    Вход:
        examples - датасет в исходном формате
        featurs - предобработанный датасет
        raw_predictions - выходы модели
        n_best - кол-во кандидатов
        max_answer_length - максимальная длинна ответа
    Выход:
        predictions : dict"""

    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    
    features_per_example = collections.defaultdict(list)
    
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["text"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
        predictions[example["id"]] = answer

    return predictions

In [20]:
raw_predictions = trainer.predict(validation_dataset)
processed_predictions = postprocess_qa_predictions(dataset['test'], 
                                                   validation_dataset,
                                                   raw_predictions.predictions,
                                                   n_best_size=100,
                                                   max_answer_length=120)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Post-processing 360 example predictions split into 744 features.


  0%|          | 0/360 [00:00<?, ?it/s]

In [21]:
def find_wrong(ground_truth, predicted):
    wrong_answers = []

    for real in ground_truth:
        id = real['id']
        real_extracted_text = real['extracted_part']['text'][0]
        predicted_text = predicted[id]
        if predicted_text != real_extracted_text:
            wrong_answers.append({'real' : real_extracted_text,
                                  'predicted' : predicted_text})
    return wrong_answers

In [22]:
wrong_answers = find_wrong(dataset['test'], processed_predictions)
print('Общее кол-во неправильных ответов:', len(wrong_answers))
print('Фактическое accuracy:', 1 - len(wrong_answers)/len(dataset['test']))

Общее кол-во неправильных ответов: 99
Фактическое accuracy: 0.725


In [23]:
num2show = (15, 25, 1) # start, stop, step

for i in range(*num2show):
    print('real:', wrong_answers[i]['real'])
    print('predicted:', wrong_answers[i]['predicted'], '\n')

real: если Заказчиком установлено требование обеспечения исполнения договора и/или обеспечения исполнения гарантийных обязательств.
predicted: Обеспечение исполнения договора предусмотрено в размере 5% от начальной (максимальной) цены договора 

real: 
predicted: В 

real: Размер обеспечения исполнения договора не может превышать 5 (пять) процентов от цены договора
predicted:  

real: 
predicted: Из 

real: 
predicted: П 

real: Размер обеспечения гарантийных обязательств составляет 5% от цены Договора. Срок предоставления обеспечения гарантийных обязательств: до момента направления
predicted: Размер обеспечения гарантийных обязательств составляет 5% от цены Договора. 

real: Размер обеспечения гарантийных обязательств составляет 0,01 % от начальной (максимальной) цены контракта указанной в извещении об осуществлении закупки
predicted: Размер обеспечения гарантийных обязательств составляет 0,01 % от начальной (максимальной) цены контракта 

real: 
predicted: В 

real: 
predicted: Д 

r

Модель часто вместо пустого ответа предсказывает 1-5 бессмысленых символа. Будем считать, что в тексте нет нужной информации, если модель предсказывает меньше 10-ти символов(из EDA мы знаем, что непустой ответ минимальной длинны содержит примерно 5 токенов, а это точно больше чем 10 символов)



In [24]:
zero_answer_trashold = 10

for id in processed_predictions:
    if len(processed_predictions[id]) < zero_answer_trashold:
        processed_predictions[id] = ''

In [25]:
wrong_answers = find_wrong(dataset['test'], processed_predictions)
print('Общее кол-во неправильных ответов:', len(wrong_answers))
print('Фактическое accuracy:', 1 - len(wrong_answers)/len(dataset['test']))

Общее кол-во неправильных ответов: 61
Фактическое accuracy: 0.8305555555555555


In [27]:
num2show = (30, 40, 1) # start, stop, step

for i in range(*num2show):
    print('real:', wrong_answers[i]['real'])
    print('predicted:', wrong_answers[i]['predicted'], '\n')

real: такое лицо предоставляет обеспечение исполнения договора в размере 7,5 % от НМЦ.
predicted: если лицом, с которым заключается договор, предложена цена договора (с учетом всех переторжек), которая на 25% и более, ниже НМЦ, такое лицо предоставляет обеспечение исполнения договора в размере 7,5 % от НМЦ. 

real: Размер обеспечения исполнения договора: 5% от начальной (максимальной) цены договора
predicted: Размер обеспечения исполнения договора: 5% от начальной (максимальной) цены договора, НДС не облагается. 

real: Размер обеспечения гарантийных обязательств, порядок предоставления такого обеспечения, требования к такому обеспечению 0,9 % от начальной (максимальной) цены контракта, что составляет 9 487,50 руб.
predicted:  

real: Размер обеспечения гарантийных обязательств: установлен. Размер обеспечения гарантийных обязательств составляет 1% от цены Договора.
predicted: Размер обеспечения гарантийных обязательств составляет 1% от цены Договора. 

real: Устанавливается требование 

Условно ошибки модель можно разделить на три вида:  
- предсказание пустого ответа  
- предсказание неправильного фрагмента  
- предсказания частично правильного ответа  
  
С последними двумя ошибками лучше всего поможет справиться более качественная разметка.

# Submit

In [None]:
# Загружаем тестовый датасет
test = load_dataset('json', data_files=PATH + '/test.json')['train']

test_dataset =  test.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns= test.column_names,
)

In [None]:
# Делаем предсказания
raw_predictions = trainer.predict(test_dataset)
processed_predictions = postprocess_qa_predictions(test, 
                                                   test_dataset,
                                                   raw_predictions.predictions,
                                                   n_best_size=100,
                                                   max_answer_length=120)

In [33]:
# Выбрасываем короткие предсказания
for id in processed_predictions:
    if len(processed_predictions[id]) < zero_answer_trashold:
        processed_predictions[id] = ''

In [48]:
# Загружаем данные в более удобном формате
with open(PATH + '/test.json', 'r') as f:
    test = json.load(f)


# Записываем предсказания
for sample in range(len(test)):
    id = test[sample]['id']
    predicted_text = processed_predictions[id]
    if predicted_text in test[sample]['text'] and predicted_text != '':
        test[sample]['extracted_part'] = {'text' : [processed_predictions[id]],
                                          'answer_start' : [test[sample]['text'].find(predicted_text)],
                                          'answer_end' : [test[sample]['text'].find(predicted_text) + len(predicted_text)]}
    else:
        test[sample]['extracted_part'] = {'text' : [''],
                                          'answer_start' : [0],
                                          'answer_end' : [0]}

In [45]:
# Сохраняем предсказания
final = json.dumps(test, indent=2)

with open("predictions.json", "w") as final:
    json.dump(test, final)