In [1]:
import os

In [2]:
%pwd

'F:\\Question-Answering-Project\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'F:\\Question-Answering-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path: Path
    metric: str
    metric_file_name: Path
    

In [6]:
from Question_Answering.constants import *
from Question_Answering.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation
        params = self.params.EvaluationArguments
        
        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir = config.root_dir,
            data_path = config.data_path,
            model_path = config.model_path,
            tokenizer_path = config.tokenizer_path,
            metric = params.metric,
            metric_file_name = config.metric_file_name
        )
        return model_evaluation_config

In [8]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm.auto import tqdm
import evaluate
from Question_Answering.entity import ModelEvaluationConfig
from collections import defaultdict
import numpy as np
from accelerate import Accelerator
from transformers import default_data_collator
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'evaluate'

In [39]:
class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config
        self.metric = evaluate.load(config.metric)

    def compute_metrics(self, start_logits, end_logits, features, examples):
        example_to_features = defaultdict(list)
        for idx, feature in enumerate(features):
            example_to_features[feature["example_id"]].append(idx)

        n_best = 20
        max_answer_length = 30
        predicted_answers = []
        predicted_answers = []

        for example in tqdm(examples):
            example_id = example["id"]
            context = example["context"]
            answers = []

            # Loop through all features associated with that example
            for feature_index in example_to_features[example_id]:
                start_logit = start_logits[feature_index]
                end_logit = end_logits[feature_index]
                offsets = features[feature_index]["offset_mapping"]

                start_indexes = np.argsort(start_logit)[-1: -n_best - 1: -1].tolist()
                end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Skip answers that are not fully in the context
                        if offsets[start_index] is None or offsets[end_index] is None:
                            continue
                        # Skip answers with a length that is either < 0 or > max_answer_length
                        if (
                                end_index < start_index
                                or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        answer = {
                            "text": context[offsets[start_index][0]: offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                        answers.append(answer)

            # Select the answer with the best score
            if len(answers) > 0:
                best_answer = max(answers, key=lambda x: x["logit_score"])
                predicted_answers.append(
                    {"id": example_id, "prediction_text": best_answer["text"]}
                )
            else:
                predicted_answers.append({"id": example_id, "prediction_text": ""})

        theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
        return self.metric.compute(predictions=predicted_answers, references=theoretical_answers)

    def evaluate(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model = AutoModelForQuestionAnswering.from_pretrained(self.config.model_path).to(device)

        raw_datasets_validation_split = load_from_disk("artifacts/ingestion/validation")
        train_dataset = load_from_disk(self.config.train_data_path)
        train_dataset.set_format("torch")
        valid_dataset = load_from_disk(self.config.valid_data_path)
        valid_dataset.set_format("torch")
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=True,
            collate_fn=default_data_collator,
            batch_size=8,
        )
        eval_dataloader = DataLoader(
            valid_dataset,
            shuffle=True,
            collate_fn=default_data_collator,
            batch_size=8,
        )

        accelerator = Accelerator()
        model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader
        )
        model.eval()
        start_logits = []
        end_logits = []

        for batch in tqdm(eval_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            start_logits.append(outputs.start_logits)
            end_logits.append(outputs.end_logits)
        start_logits = np.concatenate(start_logits)
        end_logits = np.concatenate(end_logits)
        start_logits = start_logits[: len(valid_dataset)]
        end_logits = end_logits[: len(valid_dataset)]

        metrics = self.compute_metrics(
            start_logits, end_logits, valid_dataset, raw_datasets_validation_split
        )
        print(metrics)


In [26]:
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
    model_evaluation_config.evaluate()
except Exception as e:
    raise e

[2023-11-03 13:58:03,840: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-11-03 13:58:03,842: INFO: common: yaml file: params.yaml loaded successfully]
[2023-11-03 13:58:03,843: INFO: common: created directory at: artifacts]
[2023-11-03 13:58:03,844: INFO: common: created directory at: artifacts/model_evaluation]


NameError: name 'model_validation_config' is not defined

In [9]:
from safetensors.torch import load_model, save_model

In [15]:
finetuned_model = load_model(model, "artifacts/model_trainer/model.safetensors")

In [14]:
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-cased")

Downloading model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 436M/436M [02:45<00:00, 2.63MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from transformers import BertConfig, BertModel
config = BertConfig.from_json_file("artifacts/model_trainer/config.json")
model = BertModel.from_pretrained("artifacts/model_trainer/model.safetensors", config = config )

Some weights of BertModel were not initialized from the model checkpoint at artifacts/model_trainer/model.safetensors and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:

# Python program to read
# json file
 
import json
 
# Opening JSON file
f = open('artifacts/model_trainer/tokenizer_config.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)
 
# Iterating through the json
# list
for i in data.keys():
    print(i)
 
# Closing file
f.close()

added_tokens_decoder
clean_up_tokenization_spaces
cls_token
do_lower_case
mask_token
model_max_length
pad_token
sep_token
strip_accents
tokenize_chinese_chars
tokenizer_class
unk_token


In [47]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm.auto import tqdm
import evaluate
from Question_Answering.entity import ModelEvaluationConfig
from collections import defaultdict
import numpy as np
from accelerate import Accelerator
from transformers import default_data_collator
from torch.utils.data import DataLoader
from transformers import BertConfig, BertModel
from torch.optim import AdamW

In [70]:
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
finetuned_config = BertConfig.from_json_file("artifacts/model_trainer/config.json")
model = BertModel.from_pretrained("artifacts/model_trainer/model.safetensors", config=finetuned_config).to(device)

raw_datasets_validation_split = load_from_disk("artifacts/data_ingestion/validation")

valid_dataset = load_from_disk("artifacts/data_transformation/validation_dataset")
valid_dataset = valid_dataset.remove_columns(["example_id","offset_mapping"])
valid_dataset.set_format("torch")

eval_dataloader = DataLoader(
    valid_dataset,
    shuffle=True,
    batch_size=8,
)


Some weights of BertModel were not initialized from the model checkpoint at artifacts/model_trainer/model.safetensors and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
eval_dataloader

<torch.utils.data.dataloader.DataLoader at 0x201078bbdf0>

In [55]:
valid_dataset.column_names

['input_ids',
 'token_type_ids',
 'attention_mask',
 'offset_mapping',
 'example_id']

In [56]:
raw_datasets_validation_split = load_from_disk("artifacts/data_ingestion/validation")

In [57]:
raw_datasets_validation_split.column_names

['id', 'title', 'context', 'question', 'answers']

In [59]:
from tqdm.auto import tqdm
import numpy as np

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [60]:
metric = evaluate.load("squad")

Downloading builder script: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 4.53k/4.53k [00:00<00:00, 129kB/s]
Downloading extra modules: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 3.32k/3.32k [00:00<?, ?B/s]


In [None]:
"""model.eval()
start_logits = []
end_logits = []

for batch in tqdm(eval_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    """

In [67]:
from accelerate import Accelerator
accelerator = Accelerator()