<a href="https://colab.research.google.com/github/Kkordik/NovelQSI/blob/main/train_Longformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# In this notebook I fine-tuned longformer-qa for the Quote Speaker Identification task and published the new model to the Hugging Face Hub


In [None]:
!pip install evaluate transformers[torch]

### Now you will log in to your Hugging Face account with a token

**Use a token with "write" permission, as you will need it to push the fine-tuned model to the Hub**

In [None]:
# Helper from huggingface_hub used to authenticate this notebook session.
from huggingface_hub import notebook_login

# Log in with a Hugging Face token; a token with "write" permission is needed
# because the Trainer further down is configured with push_to_hub=True.
notebook_login()

In [None]:
# @title Load dataset
from datasets import load_dataset

# Fixed seed so the train/test partition is identical on every run; without it the
# evaluation set changes each time and metrics from different runs are not comparable.
split_seed = 42

# Only the "train" split is requested from the Hub, so 30% of it is held out for evaluation.
dataset = load_dataset("Kkordik/NovelQSI", split="train").train_test_split(
    test_size=0.3,
    seed=split_seed,
)
print(dataset)

In [None]:
# @title Prepare dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint that fine-tuning starts from (editable through the Colab form).
pretrained_model = "mrm8488/longformer-base-4096-finetuned-squadv2" # @param {type:"string"}
# Passed as max_length to the tokenizer inside preprocess_function.
max_token_length = 4096 # @param

# Tokenizer and question-answering model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model)


def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Args:
        examples: Batch dict as supplied by ``Dataset.map(batched=True)``. The
            keys read here are ``"question"`` and ``"context"`` (lists of
            strings) and ``"answers"`` (list of dicts holding parallel
            ``"text"`` and ``"answer_start"`` lists). Only the first answer of
            each example is used.

    Returns:
        The tokenizer output, padded and truncated to ``max_token_length``
        tokens, extended with ``"start_positions"`` and ``"end_positions"``:
        the token indices of the answer span. An example is labelled
        ``(0, 0)`` when it has no answer, has no context tokens, or its answer
        is not fully contained in the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_token_length,
        # max_length alone does not shorten anything: without truncation an
        # over-long context produces more tokens than the model can accept.
        # "only_second" trims the context and never the question.
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Unanswerable example (empty answer lists): point at the first token.
        if not answer["text"] or not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Context tokens are the ones whose sequence id is 1; they form one
        # contiguous run, so its first and last index delimit the context.
        context_indices = [k for k, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_indices:
            # Empty context: there is nothing the answer could point into.
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_indices[0]
        context_end = context_indices[-1]

        # If the answer is not fully inside the context, label it (0, 0).
        # Checking both ends (rather than mere overlap) matters once truncation
        # can cut an answer in half: a partial span would otherwise get an end
        # label that lands on a special token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions.
            # Walk forward to the last token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward to the first token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


# Tokenize both splits batch by batch. The original columns are removed so that
# only the tokenizer output and the start/end labels are left for the Trainer.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

### Now you will have to configure the Trainer

*model_hub_id* is the identifier of the model repository that the fine-tuned model will be pushed to. **Make sure that you have created it, and that you logged in at the start of the notebook with a token that has "write" permission.**

Don't change the other parameters unless you know what you are doing.

In [None]:
# @title Configure Trainer
import inspect

from transformers import TrainingArguments, Trainer, DefaultDataCollator

model_hub_id = "Kkordik/test_longformer_4096_qsi" # @param {type: "string"}
num_train_epochs = 3 # @param {type:"integer"}
learning_rate = 2e-5 # @param

data_collator = DefaultDataCollator()

# transformers renamed TrainingArguments(evaluation_strategy=...) to eval_strategy and
# Trainer(tokenizer=...) to processing_class in newer releases. The install cell does not
# pin a version, so pick whichever keyword the installed release actually accepts.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

training_args = TrainingArguments(
    # Local checkpoint directory = repository name without the namespace.
    # [-1] (instead of [1]) also works when the id has no "namespace/" prefix.
    output_dir=model_hub_id.split("/")[-1],
    hub_model_id=model_hub_id,
    learning_rate=learning_rate,
    # Batch size 1 with 8 accumulation steps: gradients from 8 examples are
    # accumulated per device before each optimizer step.
    gradient_accumulation_steps=8,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    push_to_hub=True,
    # Evaluate on the held-out split at the end of every epoch.
    **{eval_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    # Passing the tokenizer lets the Trainer save it next to the model.
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# @title Start training
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()
# Upload the result to the Hub repository set through hub_model_id.
trainer.push_to_hub()