In [1]:
import os
# Show the kernel's current working directory.
%pwd

'/Users/likhit/Documents/Coding/MLOps/TextSummarization/research'

In [2]:
os.chdir("../")
%pwd

'/Users/likhit/Documents/Coding/MLOps/TextSummarization'

In [3]:
from dataclasses import dataclass
from pathlib import Path

# `@dataclass` generates `__init__` (plus `__repr__` and `__eq__`) from the annotated
# fields below, so the constructor does not have to be written by hand. Each field
# becomes a constructor argument, which keeps initializing the class from
# configuration values short and explicit.
@dataclass
class DataIngestionConfig:
    """Settings bundle passed to the data-transformation step.

    Fields are declared in the same order as the original constructor arguments
    (root_dir, data_path, tokenizer_name), so positional and keyword construction
    both keep working.

    NOTE(review): despite the "Ingestion" name, this notebook builds the object from
    the `data_transformation` config section; the name is kept for compatibility.
    """

    # Directory the transformation step creates and writes its output under.
    root_dir: Path
    # Location of the dataset that is loaded from disk.
    data_path: Path
    # Identifier handed to `AutoTokenizer.from_pretrained`.
    tokenizer_name: str


In [4]:
from src.text_Summarizer.constants import *
from src.text_Summarizer.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(self, config_path=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and request creation of the artifacts root.

        Args:
            config_path: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataIngestionConfig:
        """Build the settings object for the data-transformation stage.

        Passes the stage's `root_dir` to `create_directories` before returning.

        Returns:
            A `DataIngestionConfig` filled from the `data_transformation` section
            of the loaded configuration.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            tokenizer_name=section.tokenizer_name,
        )


In [6]:
import os
from transformers import AutoTokenizer
import zipfile
from src.text_Summarizer.logging import logger
from datasets import load_from_disk



  from .autonotebook import tqdm as notebook_tqdm


[2024-11-01 16:18:08,398: INFO: config PyTorch version 2.5.1 available.]


In [7]:
class DataTransformation:
    """Tokenizes a dialogue/summary dataset and saves the resulting features to disk."""

    # Token limits applied (with truncation) to the dialogues and to the summaries.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 1024

    def __init__(self, config: "DataIngestionConfig"):
        """Store the config and load the tokenizer named by `config.tokenizer_name`.

        Args:
            config: Settings object providing `tokenizer_name`, `data_path` and
                `root_dir`.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_dataset_to_features_usageable(self, data):
        """Tokenize one batch of examples into model inputs and labels.

        Args:
            data: Batch mapping with a 'dialogue' entry (source texts) and a
                'summary' entry (target texts).

        Returns:
            Dict with 'input_ids' and 'attention_mask' from the dialogue encoding
            and 'labels' holding the summary encoding's input ids.
        """
        input_encoding = self.tokenizer(
            data['dialogue'], max_length=self.MAX_INPUT_LENGTH, truncation=True
        )

        # `text_target=` tokenizes the summaries as targets; it replaces the
        # deprecated `as_target_tokenizer()` context manager.
        target_encoding = self.tokenizer(
            text_target=data['summary'], max_length=self.MAX_TARGET_LENGTH, truncation=True
        )

        return {
            'input_ids': input_encoding['input_ids'],
            'attention_mask': input_encoding['attention_mask'],
            'labels': target_encoding['input_ids'],
        }

    def convert(self):
        """Load the dataset, tokenize it in batches and save it under `root_dir`.

        The dataset is read from `config.data_path` and written to
        `<config.root_dir>/samsum_dataset`.
        """
        dataset_samsum = load_from_disk(self.config.data_path)

        dataset_samsum_pt = dataset_samsum.map(
            self.convert_dataset_to_features_usageable, batched=True
        )

        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir, "samsum_dataset"))









In [8]:
# Load the YAML configuration and build the settings object for this stage.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()

# Tokenize the dataset found at the configured data path and save the result
# under the configured root directory.
data_tranformation = DataTransformation(data_transformation_config)
data_tranformation.convert()







[2024-11-01 16:18:08,624: INFO: common yaml file : config/config.yaml loaded successfully]
[2024-11-01 16:18:08,625: INFO: common yaml file : params.yaml loaded successfully]
[2024-11-01 16:18:08,625: INFO: common Create directory as artifacts]
[2024-11-01 16:18:08,626: INFO: common Create directory as artifacts/data_transformation]


Map: 100%|██████████| 14732/14732 [00:01<00:00, 7569.18 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 7754.43 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 7978.84 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 942603.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 210550.72 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 186160.64 examples/s]
