In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/samsum-dataset/samsum-train.csv
/kaggle/input/samsum-dataset/samsum-test.csv
/kaggle/input/samsum-dataset/samsum-validation.csv
/kaggle/input/samsum-dataset/samsum_dataset/dataset_dict.json
/kaggle/input/samsum-dataset/samsum_dataset/validation/state.json
/kaggle/input/samsum-dataset/samsum_dataset/validation/dataset_info.json
/kaggle/input/samsum-dataset/samsum_dataset/validation/data-00000-of-00001.arrow
/kaggle/input/samsum-dataset/samsum_dataset/test/state.json
/kaggle/input/samsum-dataset/samsum_dataset/test/dataset_info.json
/kaggle/input/samsum-dataset/samsum_dataset/test/data-00000-of-00001.arrow
/kaggle/input/samsum-dataset/samsum_dataset/train/state.json
/kaggle/input/samsum-dataset/samsum_dataset/train/dataset_info.json
/kaggle/input/samsum-dataset/samsum_dataset/train/data-00000-of-00001.arrow


## Entity


In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path 
    source_URL:str 
    local_data_file: Path
    unzip_dir: Path

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    ALL_REQUIRED_FILES: list

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int


@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path: Path
    metric_file_name: Path
    

## utils


In [3]:
!pip install ensure box


Collecting ensure
  Downloading ensure-1.0.4-py3-none-any.whl.metadata (10 kB)
[31mERROR: Could not find a version that satisfies the requirement box (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for box[0m[31m
[0m

In [4]:
!pip install bbox

Collecting bbox
  Downloading bbox-0.9.4-py3-none-any.whl.metadata (576 bytes)
Collecting pyquaternion<0.10.0,>=0.9.5 (from bbox)
  Downloading pyquaternion-0.9.9-py3-none-any.whl (14 kB)
Downloading bbox-0.9.4-py3-none-any.whl (14 kB)
Installing collected packages: pyquaternion, bbox
Successfully installed bbox-0.9.4 pyquaternion-0.9.9


In [5]:
!pip install transformers transformers[sentencepiece] datasets sacrebleu rouge_score py7zr pandas nltk PyYAML matplotlib torch notebook boto3 mypy-boto3-s3 python-box ensure fastapi uvicorn Jinja2 

Collecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting py7zr
  Downloading py7zr-0.20.8-py3-none-any.whl.metadata (16 kB)
Collecting mypy-boto3-s3
  Downloading mypy_boto3_s3-1.34.14-py3-none-any.whl.metadata (17 kB)
Collecting python-box
  Downloading python_box-7.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)
Collecting ensure
  Using cached ensure-1.0.4-py3-none-any.whl.metadata (10 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)

In [6]:
import os
import yaml
from ensure import ensure_annotations
from box.exceptions import BoxValueError
from box import ConfigBox
from typing import Any
from pathlib import Path

@ensure_annotations
def read_yaml(path_to_yaml:Path) -> ConfigBox:
    """
    reads yaml file and returns
    Args:
        path_to _yaml (str): path like input
    Raises: 
        valueError: if yaml file is empty
        e: empty file
    
    Returns:
        config_box: configbox type
    """
    try:
        with open(path_to_yaml) as yaml_file:
            content = yaml.safe_load(yaml_file)
            logger.info(f"yaml file: {path_to_yaml} loaded succesfully")
            return ConfigBox(content)
    except BoxValueError:
            raise ValueError("yaml file is empty")
    except Exception as e:
         raise e 
    
@ensure_annotations
def create_directories(path_to_dir: list,verbose = True):
     """
     Args:
        path_to_dir (list): list of path of directories
        ignore_log(bool, optional): ignore if multiple dirs is to be created. Default to False
     """    
     for path in path_to_dir:
          os.makedirs(path,exist_ok=True)
          if verbose:
               logger.info(f"create directory as : {path}")


@ensure_annotations
def get_size(path: Path)-> str:
     """get size of file in KB
     Args:
        path(Path): path of the file
    Returns:
        str:size in KB
     """
     size_in_KB = round(os.path.getsize(path)/1024)
     return f"~{size_in_KB} KB"

## Config yaml

In [7]:
import yaml

config_data = {
    'artifacts_root': 'artifacts',
    'data_ingestion': {
        'root_dir': 'artifacts/data_ingestion',
        'source_URL': 'https://github.com/Luvannie/Text-Summarization-Project/raw/main/summarizer-data.zip',
        'local_data_file': 'artifacts/data_ingestion/summarizer-data.zip',
        'unzip_dir': 'artifacts/data_ingestion',
    },
    'data_validation': {
        'root_dir': 'artifacts/data_validation',
        'STATUS_FILE': 'artifacts/data_validation/status.txt',
        'ALL_REQUIRED_FILES': ["train", "test", "validation", "dataset_dict.json"],
    },
    'data_transformation': {
        'root_dir': 'artifacts/data_transformation',
        'data_path': 'artifacts/data_ingestion/samsum_dataset',
        'tokenizer_name': 'google/pegasus-cnn_dailymail',
    },
    'model_trainer': {
        'root_dir': 'artifacts/model_trainer',
        'data_path': 'artifacts/data_transformation/samsum_dataset',
        'model_ckpt': 'google/pegasus-cnn_dailymail',
    },
    'model_evaluation': {
        'root_dir': 'artifacts/model_evaluation',
        'data_path': 'artifacts/data_transformation/samsum_dataset',
        'model_path': 'artifacts/model_trainer/pegasus-samsum-model',
        'tokenizer_path': 'artifacts/model_trainer/tokenizer',
        'metric_file_name': 'artifacts/model_evaluation/metrics.csv',
    },
}

with open('config.yaml', 'w') as yaml_file:
    yaml.dump(config_data, yaml_file, default_flow_style=False)

print("Config file 'config.yaml' has been created.")


Config file 'config.yaml' has been created.


## Params.yaml


In [8]:
params_data = {
    'TrainingArguments': {
        'num_train_epochs': 1,
        'warmup_steps': 500,
        'per_device_train_batch_size': 1,
        'weight_decay': 0.01,
        'logging_steps': 10,
        'evaluation_strategy': 'steps',
        'eval_steps': 500,
        'save_steps': 1e6,
        'gradient_accumulation_steps': 16,
    }
}

with open('params.yaml', 'w') as yaml_file:
    yaml.dump(params_data, yaml_file, default_flow_style=False)

print("Parameters file 'params.yaml' has been created.")


Parameters file 'params.yaml' has been created.


## Constants


In [9]:
from pathlib import Path

CONFIG_FILE_PATH= Path("/kaggle/working/params.yaml")
PARAMS_FILE_PATH= Path("/kaggle/working/config.yaml")

## Config class

In [10]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir 
        )

        return data_ingestion_config
    
    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
        )

        return data_validation_config
    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name = config.tokenizer_name
        )

        return data_transformation_config
    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt = config.model_ckpt,
            num_train_epochs = params.num_train_epochs,
            warmup_steps = params.warmup_steps,
            per_device_train_batch_size = params.per_device_train_batch_size,
            weight_decay = params.weight_decay,
            logging_steps = params.logging_steps,
            evaluation_strategy = params.evaluation_strategy,
            eval_steps = params.evaluation_strategy,
            save_steps = params.save_steps,
            gradient_accumulation_steps = params.gradient_accumulation_steps
        )

        return model_trainer_config
    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_path = config.model_path,
            tokenizer_path = config.tokenizer_path,
            metric_file_name = config.metric_file_name
           
        )

        return model_evaluation_config

## Components

In [11]:
import zipfile
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config


    
    def download_file(self):
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = self.config.local_data_file
            )
            logger.info(f"{filename} download! with following info: \n{headers}")
        else:
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")  

        
    
    def extract_zip_file(self):
        """
        zip_file_path: str
        Extracts the zip file into the data directory
        Function returns None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [12]:
!pip install --upgrade botocore s3fs


Collecting botocore
  Downloading botocore-1.34.31-py3-none-any.whl.metadata (5.7 kB)
  Downloading botocore-1.34.22-py3-none-any.whl.metadata (5.6 kB)
Downloading botocore-1.34.22-py3-none-any.whl (11.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.29.165
    Uninstalling botocore-1.29.165:
      Successfully uninstalled botocore-1.29.165
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
boto3 1.26.100 requires botocore<1.30.0,>=1.29.100, but you have botocore 1.34.22 which is incompatible.[0m[31m
[0mSuccessfully installed botocore-1.34.22


In [13]:
!pip install --upgrade datasets
!pip install --upgrade aiobotocore
!pip install --upgrade transformers

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.12.2
    Uninstalling fsspec-2023.12.2:
      Successfully uninstalled fs

In [14]:
import datasets

print(datasets.__version__)




2.16.1


In [15]:
from transformers import AutoTokenizer
from datasets import load_from_disk 
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)


    
    def convert_examples_to_features(self,example_batch):
        input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )
        
        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )
            
        return {
            'input_ids' : input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }
    

    def convert(self):
        dataset_samsum = load_from_disk(self.config.data_path)
        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True)
        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,"samsum_dataset"))