In [1]:
import os

In [2]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Text-Summarizer\\research'

In [3]:
# Move from the notebook folder ("research") up to the project root so that
# relative paths used later (config file, artifacts directory) resolve.
# The guard keeps the cell idempotent: re-running it must not climb one
# directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Text-Summarizer'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable bundle of settings consumed by the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    from the ``model_evaluation`` section of the config file plus the
    ``TrainingArguments`` section of the params file. Values are stored exactly
    as read; no conversion to ``Path`` is performed by the builder.
    """

    # Working directory of the evaluation stage (created by the configuration manager).
    root_dir: Path
    # Location of the dataset that is loaded with ``load_from_disk``.
    data_path: Path
    # Location of the model passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
    model_path: Path
    # Location of the tokenizer passed to ``AutoTokenizer.from_pretrained``.
    tokenizer_path: Path
    # Destination of the CSV file holding the computed ROUGE scores.
    metric_file_name: Path
    # URI handed to ``mlflow.set_registry_uri``.
    mlflow_uri: str
    # Training parameters logged to MLflow via ``mlflow.log_params``.
    all_params: dict



In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project config/params YAML files and build stage configurations."""

    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Assemble the settings of the model-evaluation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``ModelEvaluationConfig`` filled from the ``model_evaluation``
            config section, with ``TrainingArguments`` as ``all_params``.
        """
        eval_section = self.config.model_evaluation
        training_args = self.params.TrainingArguments

        create_directories([eval_section.root_dir])

        return ModelEvaluationConfig(
            root_dir=eval_section.root_dir,
            data_path=eval_section.data_path,
            model_path=eval_section.model_path,
            tokenizer_path=eval_section.tokenizer_path,
            metric_file_name=eval_section.metric_file_name,
            mlflow_uri=eval_section.mlflow_uri,
            all_params=training_args,
        )


In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm import tqdm
import transformers
import mlflow
from urllib.parse import urlparse

[2024-06-24 00:59:31,897: INFO: config: PyTorch version 2.3.0 available.]
[2024-06-24 00:59:31,898: INFO: config: TensorFlow version 2.16.1 available.]


In [9]:
class ModelEvaluation:
    """Score a fine-tuned seq2seq summarization model with ROUGE and log the run to MLflow.

    The tokenizer and model are loaded once, at construction time, from the
    paths given in the configuration object.
    """

    def __init__(self, config: "ModelEvaluationConfig"):
        """Load tokenizer and model and move the model to the available device.

        Args:
            config: Evaluation settings (paths, MLflow URI, training params).
                The annotation is quoted so the class does not need the
                dataclass to be defined at class-creation time.
        """
        self.config = config

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        self.model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

    @staticmethod
    def _strip_newline_token(text):
        """Replace the literal ``<n>`` token in a decoded summary with a space."""
        return text.replace("<n>", " ")

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Yield successive ``batch_size``-sized slices of ``list_of_elements``.

        The last slice is shorter when the length is not a multiple of
        ``batch_size``; an empty input yields nothing.
        """
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i: i + batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                    batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
                                    column_text="article",
                                    column_summary="highlights"):
        """Generate summaries batch by batch and score them against the references.

        Args:
            dataset: Mapping indexable by ``column_text`` and ``column_summary``,
                each giving a sliceable sequence of strings.
            metric: Object exposing ``add_batch(predictions=..., references=...)``
                and ``compute()``.
            model: Model exposing ``generate``.
            tokenizer: Callable tokenizer that also exposes ``decode``.
            batch_size: Number of examples processed per generation call.
            device: Device the input tensors are moved to before generation.
            column_text: Key of the input texts in ``dataset``.
            column_summary: Key of the reference summaries in ``dataset``.

        Returns:
            Whatever ``metric.compute()`` returns.
        """
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
                zip(article_batches, target_batches), total=len(article_batches)):
            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            # Per the original author's note, the length penalty is there to keep
            # the model from generating sequences that are too long.
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=0.8, num_beams=8, max_length=128)

            # Decode the generated ids, then replace the "<n>" token with a space.
            # (The previous code replaced the empty string, which inserted a space
            # between every character of every summary.)
            decoded_summaries = [
                self._strip_newline_token(
                    tokenizer.decode(s, skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True))
                for s in summaries
            ]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        # Finally compute and return the ROUGE scores.
        return metric.compute()

    def evaluate(self):
        """Compute ROUGE on the first 10 test examples and save the scores as CSV.

        Returns:
            dict mapping each of ``rouge1``, ``rouge2``, ``rougeL`` and
            ``rougeLsum`` to its mid F-measure.
        """
        # Loading data
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        rouge_metric = load_metric('rouge')

        # Only a 10-example slice of the test split is scored.
        score = self.calculate_metric_on_test_ds(
            dataset_samsum_pt['test'][0:10], rouge_metric, self.model_pegasus, self.tokenizer, batch_size=2,
            column_text='dialogue', column_summary='summary'
        )

        rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

        df = pd.DataFrame(rouge_dict, index=['pegasus'])
        # Saving metrics locally (the 'pegasus' index label is not written).
        df.to_csv(self.config.metric_file_name, index=False)
        return rouge_dict

    def log_into_mlflow(self, metrics: dict):
        """Log metrics, training params and the summarization pipeline to MLflow.

        Only the registry URI is set here; the tracking URI is read as currently
        configured, so it has to be set up before this method is called.

        Args:
            metrics: Mapping of metric name to numeric value.
        """
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        summarization_pipeline = transformers.pipeline("summarization", model=self.model_pegasus,
                                                       tokenizer=self.tokenizer,
                                                       device=0 if torch.cuda.is_available() else -1)

        # The experiment must be selected before the run is started; selecting it
        # inside an already active run does not move that run.
        mlflow.set_experiment("Fine tuning Pegasus model")

        with mlflow.start_run():
            # Log the metrics
            for metric_name, metric_value in metrics.items():
                mlflow.log_metric(metric_name, metric_value)

            mlflow.log_params(self.config.all_params)

            # Register the model only if the tracking URI is not a file store, and
            # only once: passing registered_model_name to log_model already
            # registers it, so no separate register_model call is made.
            if tracking_url_type_store != "file":
                mlflow.transformers.log_model(summarization_pipeline, "model_pegasus_fine_tuned",
                                              registered_model_name="PegasusSamsumModel")
            else:
                mlflow.transformers.log_model(summarization_pipeline, "model_pegasus_fine_tuned")


In [11]:
import dagshub

# Connect this session to the DagsHub repository with MLflow integration enabled
# (the log output below reports "Initialized MLflow to track repo ...").
# This has to run before ModelEvaluation.log_into_mlflow, which reads the current
# MLflow tracking URI but only sets the registry URI itself.
dagshub.init(repo_owner='iheb.aamrii', repo_name='Mlops-Text-Summarizer', mlflow=True)

[2024-06-24 01:01:09,557: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]
[2024-06-24 01:01:10,263: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/iheb.aamrii/Mlops-Text-Summarizer "HTTP/1.1 200 OK"]


[2024-06-24 01:01:10,270: INFO: helpers: Initialized MLflow to track repo "iheb.aamrii/Mlops-Text-Summarizer"]


[2024-06-24 01:01:10,272: INFO: helpers: Repository iheb.aamrii/Mlops-Text-Summarizer initialized!]


In [12]:
# Run the evaluation stage end to end: build the configuration, compute the
# ROUGE scores, then log metrics, params and the model to MLflow.
# The former try/except only re-raised the exception unchanged, so it is dropped;
# the evaluator also gets its own name instead of overwriting the config object.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
model_evaluation = ModelEvaluation(config=model_evaluation_config)
metrics = model_evaluation.evaluate()
model_evaluation.log_into_mlflow(metrics)

[2024-06-24 01:01:14,379: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-24 01:01:14,381: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-24 01:01:14,382: INFO: common: created directory at: artifacts]
[2024-06-24 01:01:14,382: INFO: common: created directory at: artifacts/model_evaluation]


  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 5/5 [02:07<00:00, 25.58s/it]

[2024-06-24 01:03:30,230: INFO: rouge_scorer: Using default tokenizer.]





RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
cannot import name 'formatargspec' from 'inspect' (C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\inspect.py)