In [1]:
import os
import sys

In [2]:
# Mount Google Drive into the Colab filesystem so the project files under
# /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Root of the project on the mounted Drive; later cells resolve relative
# paths (requirements.txt, src/) against it.
# NOTE(review): the %pwd output below reports 'MyDrive' (no space), so
# 'My Drive' is presumably an alias for the same folder — confirm.
PROJECT_PATH = "/content/drive/My Drive/Text Summarizer"
# Make the project root the working directory for the rest of the notebook.
os.chdir(PROJECT_PATH)

In [4]:
# Make the project's `src` directory importable. The membership check keeps
# the cell idempotent: re-running it does not append duplicate entries.
_src_dir = os.path.join(PROJECT_PATH, 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [5]:
# Show the current working directory to confirm the chdir above took effect.
%pwd

'/content/drive/MyDrive/Text Summarizer'

In [10]:
pip install -r requirements.txt

Obtaining file:///content/drive/MyDrive/Text%20Summarizer (from -r requirements.txt (line 21))
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets (from -r requirements.txt (line 3))
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu (from -r requirements.txt (line 4))
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score (from -r requirements.txt (line 5))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr (from -r requirements.txt (line 6))
  Downloading py7zr-0.21.1-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m10.3 MB/s[0m eta [3

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable set of paths consumed by the model-evaluation stage.

    Instances are frozen: assigning to a field after construction raises.
    Field order is part of the positional constructor signature.
    """

    # Directory created for this stage's outputs.
    root_dir: Path
    # Location of the dataset that is loaded from disk for scoring.
    data_path: Path
    # Location of the saved seq2seq model.
    model_path: Path
    # Location of the saved tokenizer.
    tokenizer_path: Path
    # Destination file for the computed metrics.
    metrics_file_name: Path

In [7]:
from textsummarizer.utils.common import read_yaml, create_directories
from textsummarizer.constants import *

In [8]:
class Configurationmanager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and ensure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # presumably creates the directory if it is missing — see
        # textsummarizer.utils.common.create_directories
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the ``ModelEvaluationConfig`` from the ``model_evaluation`` section.

        Also passes the stage's ``root_dir`` to ``create_directories``.

        Returns:
            A ``ModelEvaluationConfig`` populated from the loaded configuration.
        """
        # Only the `model_evaluation` section is needed here. The previous
        # unused lookup of `self.params.TrainingArguments` was dropped so the
        # evaluation stage no longer depends on an unrelated params section.
        section = self.config.model_evaluation

        create_directories([section.root_dir])

        return ModelEvaluationConfig(
            root_dir = section.root_dir,
            data_path = section.data_path,
            model_path = section.model_path,
            tokenizer_path = section.tokenizer_path,
            metrics_file_name = section.metrics_file_name,
        )

In [9]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from textsummarizer.loggin import logger
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm import tqdm

In [10]:
class ModelEvaluation:
    """Score a saved seq2seq summarization model with ROUGE.

    The tokenizer, model, dataset and output-file locations are all read from
    the ``ModelEvaluationConfig`` given to the constructor.
    """

    def __init__(self, config: ModelEvaluationConfig):
        """Store the evaluation configuration for later use by ``evaluate``."""
        self.config = config

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Yield consecutive slices of ``list_of_elements`` of length ``batch_size``.

        The last slice is shorter when the length is not a multiple of
        ``batch_size``; an empty input yields nothing.
        """
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, batch_size=16,
                                    device="cuda" if torch.cuda.is_available() else "cpu",
                                    column_text="article", column_summary="highlights"):
        """Generate summaries batch by batch and accumulate them into ``metric``.

        Args:
            dataset: Mapping-like object indexed by column name; the two
                selected columns must be sliceable sequences of equal length.
            metric: Object exposing ``add_batch(predictions=, references=)``
                and ``compute()``.
            model: Object exposing ``generate(input_ids=, attention_mask=, ...)``.
            tokenizer: Callable tokenizer that also exposes ``decode``.
            batch_size: Number of examples processed per generation call.
            device: Device the input tensors are moved to before generation.
            column_text: Name of the column holding the source texts.
            column_summary: Name of the column holding the reference summaries.

        Returns:
            Whatever ``metric.compute()`` returns after all batches were added.
        """
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
                zip(article_batches, target_batches), total=len(article_batches)):

            # truncation=True is required for max_length to actually cap the
            # inputs; without it, longer texts are not cut to 1024 tokens.
            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=0.3, num_beams=8, max_length=70)

            # Decode the generated token ids back into text.
            decoded_summaries = [
                tokenizer.decode(s, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=True)
                for s in summaries
            ]

            # NOTE(review): the previous code replaced the empty string, which
            # inserts a space between every character. "<n>" (presumably the
            # model's newline marker, lost from the source) is replaced instead.
            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

            # Add the decoded texts together with the references to the metric.
            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        # Finally compute and return the ROUGE scores.
        return metric.compute()

    def evaluate(self):
        """Run ROUGE evaluation and write the scores to the configured CSV file.

        Loads the tokenizer and model from the configured paths, scores the
        first 10 examples of the dataset's ``test`` split in batches of 2, and
        writes the mid F-measure of each ROUGE variant as a single CSV row.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

        # Load the dataset that was saved to disk by an earlier stage.
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        rouge_metric = load_metric('rouge')

        # Pass the same device the model was moved to, so inputs and model
        # are guaranteed to live on the same device.
        score = self.calculate_metric_on_test_ds(
            dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer,
            batch_size = 2, device = device,
            column_text = 'dialogue', column_summary = 'summary'
        )

        rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

        df = pd.DataFrame(rouge_dict, index = ['pegasus'])

        # The config field is `metrics_file_name`; the previous attribute name
        # `metric_file_name` does not exist on ModelEvaluationConfig.
        df.to_csv(self.config.metrics_file_name, index = False)

In [12]:
# Run the evaluation stage end to end: build the configuration, then score
# the model and write the metrics file.
try:
    config = Configurationmanager()
    print(config.config)  # Debugging line to print the config
    model_evaluation_config = config.get_model_evaluation_config()
    print(model_evaluation_config)  # Debugging line to print the model evaluation config
    model_evaluation = ModelEvaluation(model_evaluation_config)
    model_evaluation.evaluate()
except Exception as e:
    print(f"Error: {e}")
    # A bare `raise` re-raises the active exception without adding this
    # line to its traceback.
    raise

{'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_url': 'https://github.com/Naominour/Text-Summarizer/raw/master/summarizer-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'STATUS_FILE': 'artifacts/data_validation/status.txt', 'ALL_REQUIRED_FILES': ['train', 'test', 'validation']}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/samsum_dataset', 'tokenizer_name': 'google/pegasus-cnn_dailymail'}, 'model_trainer': {'root_dir': 'artifacts/model_trainer', 'data_path': 'artifacts/data_transformation/samsum_dataset', 'model_ckpt': 'google/pegasus-cnn_dailymail'}, 'model_evaluation': {'root_dir': 'artifacts/model_evaluation', 'data_path': 'artifacts/data_transformation/samsum_dataset', 'model_path': 'artifacts/model_trainer/pegasus-samsum-model', 'tokenizer_path': '

BoxKeyError: "'ConfigBox' object has no attribute 'model_path'"