In [4]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, Trainer, TrainingArguments, BertTokenizer
from datasets import Dataset, load_dataset, DatasetDict
import json
import pandas as pd


In [5]:
# Load the fine-tuned model from a .pt file
# NOTE(review): absolute, machine-specific path - adjust it when running elsewhere.
model_add = "C:\\Users\\CShuwen\\Desktop\\nlp_Project\\CS4248_G10\\bert_huggingface_model\\bert_baseline.pt"
# SECURITY: torch.load unpickles arbitrary Python objects, so only load
# checkpoints that come from a trusted source.
# The file stores a whole pickled model object (it is used directly as a module
# below), which requires weights_only=False; PyTorch 2.6 switched the default of
# that argument to True, so it is passed explicitly here.
model = torch.load(model_add, map_location=torch.device('cpu'), weights_only=False)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure the model is in evaluation mode (not training mode)
model.eval()


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [6]:
# Working directory at the moment this cell runs (not referenced by the visible cells)
CWD = os.getcwd()


class preprocess_huggingface():
    """Load the SQuAD v1.1 train/dev files and tokenize them for extractive QA.

    The constructor reads both JSON files with ``open_file``, flattens them with
    ``dataset_parse`` and stores the two splits in ``self.data`` under the keys
    ``"train"`` and ``"validation"``.
    """

    # Default dataset locations (the values that used to be hard-coded in __init__).
    DEFAULT_TRAIN_PATH = "C:\\Users\\CShuwen\\Desktop\\nlp_Project\\CS4248_G10\\dataset\\train-v1.1.json"
    DEFAULT_DEV_PATH = "C:\\Users\\CShuwen\\Desktop\\nlp_Project\\CS4248_G10\\dataset\\dev-v1.1.json"

    # Length every tokenized question/context pair is padded or truncated to.
    MAX_LENGTH = 384

    def __init__(self, tokenizer, train_path=None, dev_path=None):
        """Read and parse both splits.

        Args:
            tokenizer: Callable tokenizer used by the preprocessing methods.
            train_path: Optional path of the training JSON file; defaults to
                ``DEFAULT_TRAIN_PATH``.
            dev_path: Optional path of the dev JSON file; defaults to
                ``DEFAULT_DEV_PATH``.
        """
        self.tokenizer = tokenizer

        if train_path is None:
            train_path = self.DEFAULT_TRAIN_PATH
        if dev_path is None:
            dev_path = self.DEFAULT_DEV_PATH

        self.train_squad = open_file(train_path)
        self.train_data = dataset_parse(self.train_squad)

        self.dev_squad = open_file(dev_path)
        self.dev_data = dataset_parse(self.dev_squad)

        self.data = DatasetDict({"train": self.train_data, "validation": self.dev_data})

    @staticmethod
    def _first_answer_text(answer):
        """Return the first answer string whether "text" is a str or a list of str."""
        text = answer["text"]
        # Indexing a plain string with [0] would yield a single character and make
        # the answer span one character long, so strings are used as they are.
        return text if isinstance(text, str) else text[0]

    def _tokenize_and_label(self, examples):
        """Tokenize a batch and attach token-level answer start/end positions.

        Args:
            examples: Batch with the keys "question", "context" and "answers";
                each answer holds "answer_start" (list of int) and "text".

        Returns:
            The tokenizer output without "offset_mapping" and with the added
            keys "start_positions" and "end_positions" (one int per example).
        """
        questions = [q.strip() for q in examples["question"]]
        # Use the tokenizer handed to the constructor, not a notebook global.
        inputs = self.tokenizer(
            questions,
            examples["context"],
            max_length=self.MAX_LENGTH,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            end_char = start_char + len(self._first_answer_text(answer))
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context (tokens with sequence id 1)
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while idx < len(sequence_ids) and sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0).
            # The span must start at/after the first context character and end
            # at/before the last one; an answer cut off by truncation would
            # otherwise get its end label on the token after the context.
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    def preprocess_function(self, examples):
        """Tokenize a batch and label answer spans (mapped by get_train_set)."""
        return self._tokenize_and_label(examples)

    def preprocess_validation_function(self, examples):
        """Tokenize a batch and label answer spans (mapped by get_test_set).

        Performs the same processing as ``preprocess_function``.
        """
        return self._tokenize_and_label(examples)

    def get_train_set(self):
        """Map ``preprocess_function`` over every split of ``self.data``.

        Returns:
            The mapped DatasetDict with the original columns removed; callers
            index the split they need.
        """
        ds = self.data
        tokenized = ds.map(self.preprocess_function, batched=True, remove_columns=ds["train"].column_names)

        return tokenized

    def get_test_set(self):
        """Map ``preprocess_validation_function`` over every split of ``self.data``.

        Returns:
            The mapped DatasetDict with the original columns removed; callers
            index the split they need.
        """
        ds = self.data
        validation_features = ds.map(
            self.preprocess_validation_function,
            batched=True,
            remove_columns=ds["validation"].column_names
        )

        return validation_features

    def get_data(self):
        """Return the untokenized DatasetDict holding both splits."""
        return self.data

def open_file(file_path):
    """Read a JSON file and return the value stored under its top-level "data" key.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The content of the "data" entry, or None when the file does not exist or
        cannot be read/parsed (a message is printed in either case).
    """
    try:
        # The context manager closes the handle even when parsing fails. The file
        # is decoded as UTF-8 explicitly: with the platform default encoding,
        # non-ASCII text can be mis-decoded, which changes character counts and
        # therefore any character offsets stored alongside the text.
        with open(file_path, encoding="utf-8") as f:
            return json.load(f, strict=False)["data"]
    except FileNotFoundError:
        print("File not found.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return None

def dataset_parse(dataset):
    """Flatten SQuAD-style articles into one row per (question, answer) pair.

    Args:
        dataset: Iterable of articles; each has "paragraphs", each paragraph has
            "context" and "qas", and each qa has "question", "id" and "answers"
            (entries with "answer_start" and "text").

    Returns:
        A ``Dataset`` with the columns "context", "question", "answers" and "id".
        Each "answers" value is ``{"answer_start": [int], "text": [str]}``.

    Side effects:
        Writes the flattened rows to "processed.json" in the working directory,
        overwriting any existing file of that name.
    """
    pre_dataframe = []
    for data in dataset:
        for p in data["paragraphs"]:
            for pqas in p["qas"]:
                for ans in pqas["answers"]:
                    pre_dataframe.append([
                        p["context"],
                        pqas["question"].strip(),
                        {
                            "answer_start": [int(ans["answer_start"])],
                            # Stored as a one-element list, like answer_start, so
                            # that ["text"][0] is the whole answer string rather
                            # than its first character.
                            "text": [ans["text"]],
                        },
                        pqas["id"],
                    ])

    with open("processed.json", "w") as file:
        json.dump(pre_dataframe, file)

    df = pd.DataFrame(pre_dataframe, columns=["context", "question", "answers", "id"])
    ds = Dataset.from_pandas(df)

    return ds

In [7]:
# Collator handed to the Trainer below
data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    output_dir="../tmp/bert-squad-baseline-model",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    # Evaluation, logging and checkpointing all follow an "epoch" schedule
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
)

# NOTE(review): the visible cells only call trainer.predict, so the training
# hyper-parameters above appear unused in this notebook - confirm.
trainer = Trainer(
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args,
    model=model,
)


In [None]:
# Build the preprocessing helper; its constructor reads and parses both SQuAD files
bert_squad = preprocess_huggingface(tokenizer)
# extract the test data set
# NOTE(review): get_test_set() and get_train_set() each map over BOTH splits of the
# DatasetDict, so every example is tokenized twice before one split is picked here.
validation_set = bert_squad.get_test_set()['validation']
# Tokenized training split (not used by the visible cells that follow)
train_set = bert_squad.get_train_set()['train']

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [None]:
# Re-apply the dataset's current format type while exposing every feature column
# (set_format modifies validation_set in place)
validation_set.set_format(type=validation_set.format["type"], columns=list(validation_set.features.keys()))
# Run inference over the tokenized validation split
predictions = trainer.predict(validation_set)
# NOTE(review): presumably index 1 holds the end logits of the QA head - confirm
print(predictions.predictions[1])
# Full prediction output object
print(predictions)

In [None]:
# Output printed by the SLURM job run:
"""

"""