In [9]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd
df = pd.read_csv('../question_answer_pair.csv')
df

Unnamed: 0,index,highlighted_sent,answer,question
0,146,Antioxidants are compounds that can delay or i...,oxidative stress,What is a major role of antioxidants in cancer...
1,153,All kidney transplant recipients at our center...,28 recipients,How many kidney transplant recipients have und...
2,61,"Thrombocytosis, an uncommon side effect of all...",of all trans,What is one side effect of all-trans retinoic ...
3,55,Disruption risks in supply chains are low-freq...,The existing,What type of interconnectivity could be a key ...
4,155,The first Covid-19 listed studies with pediatr...,develop neurological symptoms,What is uncertain about the long-term impact o...
...,...,...,...,...
2164,3,"In early 2020, the emerging respiratory virus ...",high,What type of efficacy does PPE have?
2165,132,"Diedel, a protein secreted from peripheral tis...",peroxide can,What is the name of the enzyme that quenches h...
2166,35,Animal models are crucial for the study of sev...,The World Health,Who defines zoonotic disease as?
2167,134,Uganda hosts the largest number of refugees in...,access services,What has the COVID-19 prevention and control s...


In [3]:
# Remove all occurrences of "[HL]" from the 'highlighted_sent' column
df['context'] = df['highlighted_sent'].str.replace(r"\[HL\]", "", regex=True)
df = df.rename(columns={"answer": "answers"})

# Convert the updated DataFrame to a list of dictionaries for question-answering
qa_data_cleaned = df[['question', 'answers', 'context']]


In [4]:
import pandas as pd

# Assuming df is your pandas DataFrame

# Define a function to find the start position of the answer in the context
def find_answer_start(context, answer):
    return context.find(answer)

# Adjust the 'answers' column to match the expected structure
qa_data_cleaned['answers'] = qa_data_cleaned.apply(lambda row: {'text': [row['answers']], 'answer_start': [find_answer_start(row['context'], row['answers'])]}, axis=1)

# Show the updated DataFrame
qa_data_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_data_cleaned['answers'] = qa_data_cleaned.apply(lambda row: {'text': [row['answers']], 'answer_start': [find_answer_start(row['context'], row['answers'])]}, axis=1)


Unnamed: 0,question,answers,context
0,What is a major role of antioxidants in cancer...,"{'text': ['oxidative stress'], 'answer_start':...",Antioxidants are compounds that can delay or i...
1,How many kidney transplant recipients have und...,"{'text': ['28 recipients'], 'answer_start': [1...",All kidney transplant recipients at our center...
2,What is one side effect of all-trans retinoic ...,"{'text': ['of all trans'], 'answer_start': [-1]}","Thrombocytosis, an uncommon side effect of all..."
3,What type of interconnectivity could be a key ...,"{'text': ['The existing'], 'answer_start': [308]}",Disruption risks in supply chains are low-freq...
4,What is uncertain about the long-term impact o...,"{'text': ['develop neurological symptoms'], 'a...",The first Covid-19 listed studies with pediatr...
...,...,...,...
2164,What type of efficacy does PPE have?,"{'text': ['high'], 'answer_start': [324]}","In early 2020, the emerging respiratory virus ..."
2165,What is the name of the enzyme that quenches h...,"{'text': ['peroxide can'], 'answer_start': [-1]}","Diedel, a protein secreted from peripheral tis..."
2166,Who defines zoonotic disease as?,"{'text': ['The World Health'], 'answer_start':...",Animal models are crucial for the study of sev...
2167,What has the COVID-19 prevention and control s...,"{'text': ['access services'], 'answer_start': ...",Uganda hosts the largest number of refugees in...


In [5]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
from datasets import Dataset
data_dicts = qa_data_cleaned.to_dict('records')
dataset = Dataset.from_pandas(pd.DataFrame(data_dicts))
dataset = dataset.train_test_split(test_size=0.2)

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['question', 'answers', 'context'])

Map: 100%|██████████| 1735/1735 [00:00<00:00, 3768.96 examples/s]
Map: 100%|██████████| 434/434 [00:00<00:00, 3685.22 examples/s]


In [7]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Initialize the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Assuming `tokenized_dataset` is your DatasetDict after applying preprocess_function

# Convert the 'train' and 'test' splits to PyTorch DataLoader
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

validation_dataloader = DataLoader(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [10]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

                                                 
 33%|███▎      | 109/327 [11:46<19:08,  5.27s/it]

{'eval_loss': 3.738853931427002, 'eval_runtime': 60.4406, 'eval_samples_per_second': 7.181, 'eval_steps_per_second': 0.463, 'epoch': 1.0}


                                                   
 67%|██████▋   | 218/327 [23:25<08:52,  4.89s/it]

{'eval_loss': 3.189769744873047, 'eval_runtime': 59.1537, 'eval_samples_per_second': 7.337, 'eval_steps_per_second': 0.473, 'epoch': 2.0}


                                                 
100%|██████████| 327/327 [35:00<00:00,  6.42s/it]

{'eval_loss': 3.124058246612549, 'eval_runtime': 59.207, 'eval_samples_per_second': 7.33, 'eval_steps_per_second': 0.473, 'epoch': 3.0}
{'train_runtime': 2100.055, 'train_samples_per_second': 2.479, 'train_steps_per_second': 0.156, 'train_loss': 3.680132851323586, 'epoch': 3.0}





TrainOutput(global_step=327, training_loss=3.680132851323586, metrics={'train_runtime': 2100.055, 'train_samples_per_second': 2.479, 'train_steps_per_second': 0.156, 'train_loss': 3.680132851323586, 'epoch': 3.0})

In [31]:
question = "What does COVID-19 aim to protect?"
context = "More than 35 million people worldwide are currently addicted to drugs. COVID-19 has created new routes for drug trafficking that increase the risk of drug addiction, making it vital to address this problem. The aim is to effectively protect the [HL]physical health[HL] of PWUD and prevent the combination of CO VID-19 and the physiological and psychological effects of drugs from affecting relapse behaviour."

In [23]:
import torch
torch.save(model.state_dict(), '../model/model_weights.pth')

In [24]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
model.load_state_dict(torch.load('../model/model_weights.pth'))
model.eval()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [32]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
question_answerer(question=question, context=context)

{'score': 0.017759479582309723,
 'start': 249,
 'end': 264,
 'answer': 'physical health'}