In [None]:
!pip install datasets

In [2]:
from datasets import load_dataset
raw = load_dataset("squad_v2")

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoTokenizer

model_checkpt = "PinkiKumari22/finetuned_qa"
tokenizer = AutoTokenizer.from_pretrained(model_checkpt)

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [4]:
context = raw["validation"][0]["context"]
question = raw["validation"][0]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

'[CLS] in what country is normandy located? [SEP] the normans ( norman : nourmands ; french : normands ; latin : normanni ) were the people who in the 10th and 11th centuries gave their name to normandy, a region in france. they were descended from norse ( " norman " comes from " norseman " ) raiders and pirates from denmark, iceland and norway who, under their leader rollo, agreed to swear fealty to king charles iii of west francia. through generations of assimilation and mixing with the native frankish and roman - gaulish populations, their descendants would gradually merge with the carolingian - based cultures of west francia. the distinct cultural and ethnic identity of the normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. [SEP]'

In [5]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [6]:
max_length = 384
stride = 128
validation_dataset = raw["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw["validation"].column_names,
)
len(raw["validation"]), len(validation_dataset)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

(11873, 12134)

In [None]:
!pip install evaluate
import evaluate
!pip install rouge_score
import numpy as np
import collections

In [8]:
metric1 = evaluate.load("squad_v2")
meteor = evaluate.load("meteor")
bleu = evaluate.load("bleu")
rouge = evaluate.load('rouge')

from tqdm.auto import tqdm

import numpy as np

n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    predictions = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append({
                "id": example_id,
                "prediction_text": best_answer["text"],
                "no_answer_probability": 0.0  # Set a default value
            })
            predictions.append(best_answer["text"])
        else:
            predicted_answers.append({
                "id": example_id,
                "prediction_text": "",
                "no_answer_probability": 0.0  # Set a default value
            })
            predictions.append("")



    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    theoretical = [{"answers": ex["answers"]} for ex in examples]
    d1 = metric1.compute(predictions=predicted_answers, references=theoretical_answers)
    d2 = meteor.compute(predictions=predictions, references=theoretical)
    d3 = bleu.compute(predictions=predictions, references=theoretical)
    d4 = rouge.compute(predictions=predictions, references=theoretical)

    return d1,d2,d3,d4

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
from transformers import TFAutoModelForQuestionAnswering
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpt)

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/436M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at PinkiKumari22/finetuned_qa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [10]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator(return_tensors="tf")

tf_eval_dataset = model.prepare_tf_dataset(
    validation_dataset,
    collate_fn = data_collator,
    shuffle=False,
    batch_size=16,
)

In [11]:
predictions = model.predict(tf_eval_dataset)



In [12]:
compute_metrics(
    predictions["start_logits"],
    predictions["end_logits"],
    validation_dataset,
    raw["validation"],
)

  0%|          | 0/11873 [00:00<?, ?it/s]

({'exact': 7.479154383896235,
  'f1': 12.104968804598856,
  'total': 11873,
  'HasAns_exact': 14.979757085020243,
  'HasAns_f1': 24.244651588563126,
  'HasAns_total': 5928,
  'NoAns_exact': 0.0,
  'NoAns_f1': 0.0,
  'NoAns_total': 5945,
  'best_exact': 50.08843594710688,
  'best_exact_thresh': 0.0,
  'best_f1': 50.09257061476382,
  'best_f1_thresh': 0.0},
 {'meteor': 0.0009801603959021738},
 {'bleu': 0.0,
  'precisions': [1.5039629423531004e-05, 0.0, 0.0, 0.0],
  'brevity_penalty': 1.0,
  'length_ratio': 5.6001852943653665,
  'translation_length': 66491,
  'reference_length': 11873},
 {'rouge1': 2.105617788259075e-05,
  'rouge2': 0.0,
  'rougeL': 2.105617788259075e-05,
  'rougeLsum': 2.105617788259075e-05})