In [1]:
import os
# Step up one directory so the relative paths used by later cells resolve from there.
# NOTE(review): this is not idempotent - re-running the cell moves up one more level each time.
os.chdir("../")
# Show the resulting working directory to confirm where the notebook now runs.
%pwd

'c:\\Users\\Surya\\Private\\Work\\Projects\\MLOPS\\text-summary-mlops'

# 1. Data Ingestion

In [1]:
# DataIngestionConfig is a dataclass; ConfigurationManager.get_data_ingestion_config() uses it as its return type
# link for dataclass - https://realpython.com/python-data-classes/

# src/entity/__init__.py
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only settings consumed by the data-ingestion stage.

    Each field corresponds to a key of the ``data_ingestion`` section in
    config/config.yaml.
    """

    # Directory that belongs to the ingestion stage.
    root_dir: Path
    # Address the dataset archive is fetched from.
    source_URL: str
    # Local path the fetched archive is stored under.
    local_data_file: Path
    # Directory the archive is unpacked into.
    unzip_dir: Path

In [8]:
from text_summary.constants import CONFIG_FILE_PATH
import yaml
from box import ConfigBox

# Quick look at the raw configuration: load the YAML file and print its
# `data_ingestion` section.
with open(CONFIG_FILE_PATH,"r") as f:
    data = yaml.safe_load(f) # parses the YAML document into plain Python objects (a dict here)
    print(ConfigBox(data).data_ingestion) # wrapping in ConfigBox allows attribute-style access to the keys

{'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}


In [9]:
# Build a throw-away instance with placeholder values to inspect the dataclass repr.
DataIngestionConfig(
    root_dir=Path("c:/"),
    source_URL="sourcepath",
    local_data_file=Path("c:/"),
    unzip_dir=Path("c:/"),
)

DataIngestionConfig(root_dir=WindowsPath('c:/'), source_URL='sourcepath', local_data_file=WindowsPath('c:/'), unzip_dir=WindowsPath('c:/'))

In [10]:
# src/config/configuration.py

from text_summary.constants import *
from text_summary.utils.common import read_yaml,create_directories
# from text_summary.entity import DataIngestionConfig

# CONFIG MANAGER
class ConfigurationManager:
    """Loads the project's YAML files and builds the per-stage config objects."""

    def __init__(self,config_path=CONFIG_FILE_PATH,params_path=PARAMS_FILE_PATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_path: Location of the main configuration YAML file.
            params_path: Location of the parameters YAML file.
        """
        # reading the yaml files
        self.config = read_yaml(config_path)
        self.param = read_yaml(params_path)

        # create directories
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The stage's root directory is created as a side effect.

        Returns:
            DataIngestionConfig whose path fields are ``pathlib.Path`` objects,
            matching the annotations on the dataclass (the YAML file only
            provides plain strings).
        """
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )

In [11]:
# create a file named data_ingestion.py inside the components
# src/components/data_ingestion.py

import os
from urllib import request
import zipfile
from text_summary.logging import logger
from text_summary.utils.common import get_size
# from text_summary.entity import DataIngestionConfig

class DataIngestion:
    """Downloads the dataset archive and unpacks it, driven by a DataIngestionConfig."""

    def __init__(self,config : DataIngestionConfig):
        """Keep the ingestion settings (download URL, archive path, unzip directory)."""
        self.config = config

    def download_files(self):
        """Download the archive to ``config.local_data_file`` unless it already exists.

        The outcome is logged in both cases; nothing is returned.
        """
        local_file = self.config.local_data_file

        if os.path.exists(local_file):
            # dataset is already downloaded
            # `filename` is only bound by urlretrieve, so this branch must refer
            # to the configured path instead (it used to fail with UnboundLocalError).
            logger.info(f"File {local_file} is already exist of size : {get_size(Path(local_file))}")
            return

        # dataset has not been downloaded yet
        filename , headers = request.urlretrieve(
            url = self.config.source_URL,
            filename = local_file
        )
        # zip file is downloaded
        logger.info(f"File {filename} downloaded with the info : {headers}")

    def extract_data(self):
        """Extract every member of the downloaded zip archive into ``config.unzip_dir``.

        The target directory is created when it does not exist yet.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path,exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file,'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [12]:
# create a file named stage1DataIngestion.py inside the pipeline

# Stage 1 driver: build the configuration, then download and unpack the dataset.
# There is deliberately no try/except here: catching an exception only to
# re-raise it unchanged adds nothing, so errors propagate with their original traceback.
configManager = ConfigurationManager()
data_ingestion_config = configManager.get_data_ingestion_config()
data_ingestion = DataIngestion(config = data_ingestion_config)
data_ingestion.download_files()
data_ingestion.extract_data()

[2023-08-17 21:22:21,286 : INFO : common : YAML file `config\config.yaml` loaded successfully]
[2023-08-17 21:22:21,291 : INFO : common : YAML file `params.yaml` loaded successfully]
[2023-08-17 21:22:21,294 : INFO : common : artifacts Directory is created ]
[2023-08-17 21:22:21,296 : INFO : common : artifacts/data_ingestion Directory is created ]
[2023-08-17 21:22:27,172 : INFO : 943864026 : File artifacts/data_ingestion/data.zip downloaded with the info : Connection: close
Content-Length: 7903594
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 8F92:3B34A1:1B0DB7:26BC7B:64DE4222
Accept-Ranges: bytes
Date: Thu, 17 Aug 2023 15:52:03 GMT
Via: 1.1 varnish
X-Served-By: cache-ccu8300

# 2. Data Validation

In [8]:
# src/entity/__init__.py
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Read-only settings consumed by the data-validation stage."""

    # Directory that belongs to the validation stage.
    root_dir: Path
    # Path of the text file the per-file validation results are appended to.
    STATUS_FILE: str
    # Names the validation step checks directory entries against.
    ALL_REQUIRED_FILES: list

In [23]:
# src/config/configuration.py
from text_summary.constants import *
from text_summary.utils.common import read_yaml,create_directories

# CONFIG MANAGER
class ConfigurationManager:
    """Loads the project's YAML files and builds the per-stage config objects."""

    def __init__(self,config_path=CONFIG_FILE_PATH,params_path=PARAMS_FILE_PATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_path: Location of the main configuration YAML file.
            params_path: Location of the parameters YAML file.
        """
        # reading the yaml files
        self.config = read_yaml(config_path)
        self.param = read_yaml(params_path)

        # create directories
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings from the ``data_validation`` section.

        The stage's root directory is created as a side effect.

        Returns:
            DataValidationConfig whose ``root_dir`` is a ``pathlib.Path``, matching
            the annotation on the dataclass (the YAML file only provides a string).
        """
        config = self.config.data_validation

        create_directories([config.root_dir])

        return DataValidationConfig(
            root_dir=Path(config.root_dir),
            STATUS_FILE=config.STATUS_FILE,
            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
        )


In [28]:
# src/components/data_validation.py

# imports
import os
from text_summary.logging import logger

class DataValidation:
    """Checks that the ingested dataset directory contains the required entries."""

    def __init__(self,config : DataValidationConfig):
        """Keep the validation settings (status file path and required file names)."""
        self.config = config

    def validate_all_file_exist(self, dataset_dir=None) -> bool:
        """Report which dataset entries are present and whether none is missing.

        One line per entry is appended to ``config.STATUS_FILE``:
        entries found in the directory are marked ``True`` when they are listed
        in ``config.ALL_REQUIRED_FILES`` and ``False`` otherwise; required names
        that are absent from the directory are reported with ``False`` as well.

        Args:
            dataset_dir: Directory to inspect. Defaults to
                ``artifacts/data_ingestion/samsum_dataset``.

        Returns:
            True only when every required name exists in the directory.
            (The result no longer depends on which entry happens to be listed
            last, and it is a proper bool for an empty directory.)

        Raises:
            OSError: if the directory cannot be listed or the status file
                cannot be opened.
        """
        if dataset_dir is None:
            dataset_dir = os.path.join("artifacts","data_ingestion","samsum_dataset")

        required_files = list(self.config.ALL_REQUIRED_FILES)
        found_files = os.listdir(dataset_dir)
        missing_files = [name for name in required_files if name not in found_files]

        # Open the status file once instead of once per directory entry.
        with open(self.config.STATUS_FILE,"a") as f:
            for file in found_files:
                f.write(f"File name : `{file}` status : {file in required_files} \n")
            for file in missing_files:
                # required file not found in the dataset directory
                f.write(f"File name : `{file}` status : {False} \n")

        return not missing_files

In [29]:
# src/pipeline/stage_02_data_validation.py
# Stage 2 driver: build the configuration and run the file-existence validation.
# There is deliberately no try/except here: catching an exception only to
# re-raise it unchanged adds nothing, so errors propagate with their original traceback.
configManager = ConfigurationManager()
data_val_config = configManager.get_data_validation_config()
data_validation = DataValidation(data_val_config)
data_validation.validate_all_file_exist()

[2023-08-18 18:47:18,382 : INFO : common : YAML file `config\config.yaml` loaded successfully]
[2023-08-18 18:47:18,386 : INFO : common : YAML file `params.yaml` loaded successfully]
[2023-08-18 18:47:18,388 : INFO : common : `artifacts` Directory is created ]
[2023-08-18 18:47:18,388 : INFO : common : `artifacts/data_validation` Directory is created ]


# 3. Data Transformation

In [2]:
# src/entity/__init__.py
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir : Path
    data_path : Path
    tokenizer_name : Path

In [3]:
# src/config/configuration.py
from text_summary.utils.common import read_yaml,create_directories
from text_summary.constants import *

class ConfigurationManager:
    """Loads the project's YAML files and builds the per-stage config objects."""

    def __init__(self,config_filepath=CONFIG_FILE_PATH,param_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Location of the main configuration YAML file.
            param_filepath: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(param_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the ``data_transformation`` section.

        The stage's root directory is created as a side effect.

        Returns:
            DataTransformationConfig whose ``root_dir`` and ``data_path`` are
            ``pathlib.Path`` objects (the YAML file only provides strings).
            ``tokenizer_name`` is passed through untouched because it is the
            identifier given to the tokenizer loader, not a filesystem path.
        """
        config = self.config.data_transformation
        create_directories([config.root_dir])
        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            tokenizer_name=config.tokenizer_name,
        )

In [4]:
# src/components/data_transformation.py

# imports
from text_summary.logging import logger
from transformers import AutoTokenizer
import os
from datasets import load_from_disk,load_dataset

class DataTransformation:
    """Tokenizes the dialogue/summary dataset and saves the result to disk."""

    def __init__(self,config : DataTransformationConfig):
        """Keep the settings and load the tokenizer named by ``config.tokenizer_name``."""
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=config.tokenizer_name)

    def convert_to_features(self,example_batch,max_input_length=1024,max_target_length=128):
        """Tokenize one batch of examples into model inputs and labels.

        Args:
            example_batch: Mapping with a ``'dialogue'`` and a ``'summary'`` entry,
                each holding the batch's texts.
            max_input_length: Truncation length applied to the dialogues.
            max_target_length: Truncation length applied to the summaries.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            dialogue encodings and ``'labels'`` taken from the summary encodings.
        """
        input_encodings = self.tokenizer(example_batch['dialogue'],max_length=max_input_length,truncation=True)

        # `text_target=` is the supported way to tokenize targets; the
        # `as_target_tokenizer()` context manager is deprecated in transformers.
        target_encodings = self.tokenizer(text_target=example_batch['summary'],max_length=max_target_length,truncation=True)

        return {
            'input_ids' : input_encodings['input_ids'],
            'attention_mask' : input_encodings['attention_mask'],
            'labels' : target_encodings['input_ids']
        }

    def convert(self):
        """Load the dataset from ``config.data_path``, tokenize it in batches and
        save it under ``<config.root_dir>/samsum_dataset``."""
        data_samsum = load_from_disk(self.config.data_path)
        data_samsum_pt = data_samsum.map(self.convert_to_features,batched=True)
        data_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,"samsum_dataset"))

In [6]:
# src/pipeline/stage_03_data_transformation
# Stage 3 driver: build the configuration and tokenize/save the dataset.
# There is deliberately no try/except here: catching an exception only to
# re-raise it unchanged adds nothing, so errors propagate with their original traceback.
configManager = ConfigurationManager()
data_transformation_config = configManager.get_data_transformation_config()
data_transformation = DataTransformation(data_transformation_config)
data_transformation.convert()

[2023-08-19 18:54:30,377 : INFO : common : YAML file `config\config.yaml` loaded successfully]
[2023-08-19 18:54:30,388 : INFO : common : YAML file `params.yaml` loaded successfully]
[2023-08-19 18:54:30,396 : INFO : common : `artifacts` Directory is created ]
[2023-08-19 18:54:30,396 : INFO : common : `artifacts/data_transformation` Directory is created ]


Downloading (…)okenizer_config.json: 100%|██████████| 88.0/88.0 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.12k/1.12k [00:00<?, ?B/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 1.91M/1.91M [00:00<00:00, 1.92MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 65.0/65.0 [00:00<?, ?B/s]
Map: 100%|██████████| 14732/14732 [00:05<00:00, 2559.32 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 2519.25 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 2387.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 257453.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 102017.55 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 51124.90 examples/s]
