# How to use the DistilBERT.py (edit by xhu85@jhu.edu)

It is introduction code of how to use the distilBERT.py for my first edition, you can change it after understanding how the code working

## Import Part

In [1]:
from transformers import (AutoTokenizer, DefaultDataCollator, AutoModelForQuestionAnswering,
                          TrainingArguments, Trainer, pipeline)
from datasets import load_dataset

In [8]:
def initialize_tokenizer_model_collator():
    """
    Initialize new AutoTokenizer AutoModel Data collator
    :return:
    data_collator
    tokenizerï¼š from AutoTokenizer
    model: from AutoModelForQuestionAnswering
    """

    data_collator = DefaultDataCollator()
    tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

    return data_collator, tokenizer, model


def training(output_dir: str, model: AutoModelForQuestionAnswering, train_dataset, test_dataset,
             tokenizer: AutoTokenizer, data_collator: DefaultDataCollator, save_path):
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_gpu_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,  # no connection to Hugging HUb
        report_to=['none']  # it require the set up of the wandb, will do it probably
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)


def prepared_squad(tokenizer):
    """
    It downloads and prepare the SQuAD(Stanford Question Answering Dataset) for training
    :return:
    tokenized_squad: tokenized SQuAD
    """
    squad = load_dataset("squad", split="train[:5000]")
    squad = squad.train_test_split(test_size=0.2)

    def preprocess_function_squad(examples):
        """
        It is a preprocessing example from the hugging hub
        :param examples:
        :param tokenizer:
        :return:
        """
        questions = [q.strip() for q in examples["question"]]  # Strip the question
        inputs = tokenizer(  # tokenize input
            questions,
            examples["context"],
            max_length=384,
            truncation="only_second",  # if len(questions+context) > max_input, only context will be truncated to fit
            return_offsets_mapping=True,  # offset mapping in the tokenizers output, map token position to the character
            # position in the original text
            padding="max_length",  # ensure all tokenized input are padded to the same length (max_length)
        )

        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0)
            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    tokenized_squad = squad.map(preprocess_function_squad, batched=True, remove_columns=squad["train"].column_names)
    return tokenized_squad, tokenizer


def initialize_model_with_squad(save_path, model_name):
    """
    Initialize the model
    Train it with the SQuAD datasets and save it to setting directory
    :param save_path: the directory we want our tokenizer and model saved to
    :return: nothing return, model is saved under the certain directory

    """
    data_collator, tokenizer, model = initialize_tokenizer_model_collator()
    tokenized_squad, tokenizer = prepared_squad(tokenizer)
    training(output_dir=model_name, model=model, train_dataset=tokenized_squad['train'],
             test_dataset=tokenized_squad['test'], tokenizer=tokenizer, data_collator=data_collator,
             save_path=save_path)


def question_answer(model_path, question, context):
    """
    Question Answer function
    :param model_path: the path to the Directory of the tokenizer and the model
    :param question: the string of the Question to the context
    :param context: The text that you want to
    :return: The Answer to the Question Related to the Context
    """
    question_answerer = pipeline("question-answering", model=model_path, tokenizer=model_path)
    return question_answerer(question=question, context=context)


## Instruction part

In the py, you can write those under

<code>Python
    if __name__ == "__main__":
</code>

Or when you import it for other code part

<code>Python
    import distilBERT
    distilBERT.question_answer('./model/sample_model', 'How old is Tom?', 'Tom is 2 year old')
</code>

Do whatever you like!

### Initialization

you need to initialize the model before there is no any other prior code

The auto, tokenizer is imported from the <a herf='https://huggingface.co/distilbert/distilbert-base-uncased'>Huggingface</a>, there still some downstream task need to be done for pre-train the model

<a herf='https://huggingface.co/datasets/rajpurkar/squad'>SQuAD</a> (Standford Question Answering Dataset) is used for the pre-train stage

Default training parameter is under the

<code>Python
def training(output_dir: str, model: AutoModelForQuestionAnswering, train_dataset, test_dataset,
             tokenizer: AutoTokenizer, data_collator: DefaultDataCollator, save_path):
        output_dir=output_dir,
        evaluation_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_gpu_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,  # no connection to Hugging HUb
        report_to=['none']  # it require the set up of the wandb, will do it probably
    )
    ........
</code>
                                                                                                                                                                              
It using the              
<ul>
<li> Epoch, 3 epochs actually</li>
<li>Learning Rate of 2e-5</li>
<li>Push to hub is not allow here for privacy concern</li>
<li>The wandb is banned for easy deploy</li>
</ul>

However if you want to push the model on the HuggingFace you need to login actually
You can find instruction <a herf='https://huggingface.co/docs/huggingface_hub/quick-start'>Here</a>

Wandb is very useful machine for observing the training loss and other metrics, however to use this you need the authentication and it has the risk of privacy, to deploy it you need this <a herf='https://docs.wandb.ai/guides/hosting/self-managed/basic-setup'>instruction</a>


Let start the procedure to initialize the model

#### Indicate the path we store our model to 

A good practice is to always know where your model stored, since we need to import back our model for retraining and QA tas.

Me, the developer tend to store it under the root directory, to model directory

then Create a directory for this model specficlly

In [9]:
path = './model/model_sample2'

#### Identify the model name

not that necessary, just make sure not duplicate with other model under the directory

In [11]:
model_name = 'qa_model2'

In [12]:
initialize_model_with_squad(path, model_name)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,2.838853
2,No log,2.237673
3,No log,2.166206


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_de

#### How to use the model?

import the model from the path we store to , and then use the question, context and expect for an answer

In [14]:
question = 'How many programming languages does BLOOM support?'
context = ("BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 "
               "programming languages.")
print(question_answer(model_path=path, question=question,context=context))

{'score': 0.14730927348136902, 'start': 10, 'end': 21, 'answer': '176 billion'}


In [15]:
question = ('How much higher are the post-test odds of a high RDI compared to the pre-test odds following '
                'a positive test?')
context = ('Based on a moderate classification threshold from the boosting algorithm, the estimated post-test odds '
               'of a high RDI were 2.20 times higher than the pre-test odds given a positive test, while the '
               'corresponding post-test odds were decreased by 52% given a negative test (sensitivity and specificity '
               'of 0.66 and 0.70, respectively).')
print(question_answer(model_path=path, question=question, context=context))


{'score': 0.14245392382144928, 'start': 122, 'end': 132, 'answer': '2.20 times'}
