In [11]:
import os

In [12]:
%pwd
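# Project root: "Text Sumarizer". If the kernel starts in another directory,
# uncomment the next line to switch to it:
# os.chdir('d:\\Text Sumarizer')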

'd:\\Text Sumarizer'

In [31]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration.
    """
    # Directory the stage creates and saves its processed dataset under.
    root_dir: Path
    # Location of the dataset that the stage loads from disk.
    data_path: Path
    # Identifier handed to ``AutoTokenizer.from_pretrained``.
    tokenizer_name: str

In [32]:
from src.text_summarizer.constants import *
from src.text_summarizer.utils.common import read_yaml, create_directories

In [33]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath: str = CONFIG_PATH,
        params_filepath: str = PARAMS_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML file.
            params_filepath: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the loaded configuration.
        """
        section = self.config.data_transformation

        # The stage's output directory has to exist before anything is saved there.
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [34]:
import os
from src.text_summarizer.logging import logging
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

In [39]:
import os
from transformers import AutoTokenizer
from datasets import load_from_disk

class DataTransformation:
    """Tokenizes a dialogue/summary dataset and saves the encoded copy to disk."""

    def __init__(self, config):
        """Store the stage settings and load the tokenizer they name.

        Args:
            config: Object exposing ``tokenizer_name``, ``data_path`` and
                ``root_dir`` attributes.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """Encode one batch of examples into model inputs and labels.

        Args:
            example_batch: Mapping with ``'dialogue'`` and ``'summary'`` entries.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            encoded dialogues and ``'labels'`` taken from the ``input_ids`` of
            the encoded summaries.
        """
        # Source side: request truncation and max_length padding at 1024.
        dialogue_enc = self.tokenizer(
            example_batch['dialogue'],
            max_length=1024,
            truncation=True,
            padding="max_length",
        )

        # Target side: passed through text_target to avoid the deprecation warning.
        summary_enc = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=128,
            truncation=True,
            padding="max_length",
        )

        # NOTE(review): labels are the padded target ids unchanged; confirm the
        # training step masks padding positions before computing the loss.
        features = {}
        features['input_ids'] = dialogue_enc['input_ids']
        features['attention_mask'] = dialogue_enc['attention_mask']
        features['labels'] = summary_enc['input_ids']
        return features

    def convert(self):
        """Load the dataset, encode it and save the result under ``root_dir``.

        Raises:
            FileNotFoundError: If ``config.data_path`` does not exist.
        """
        cfg = self.config

        # Fail early with a readable message instead of deep inside the loader.
        if not os.path.exists(cfg.data_path):
            message = f"Dataset path {self.config.data_path} not found. Ensure the dataset is correctly saved."
            raise FileNotFoundError(message)

        raw_dataset = load_from_disk(cfg.data_path)
        encoded_dataset = raw_dataset.map(self.convert_examples_to_features, batched=True)

        # The output directory must exist before the processed dataset is saved.
        os.makedirs(cfg.root_dir, exist_ok=True)
        output_dir = os.path.join(cfg.root_dir, "samsum_dataset")
        encoded_dataset.save_to_disk(output_dir)



In [40]:
# Build the configuration manager (this reads the YAML files) and pull out the
# settings for the data-transformation stage.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()

# Run the stage with the DataTransformation worker class; DataTransformationConfig
# only carries the settings and has no convert() method.
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()


[2025-03-12 02:15:09,823: INFO: common]: yaml file: config\config.yaml loaded successfully
[2025-03-12 02:15:09,827: INFO: common]: yaml file: params.yaml loaded successfully
[2025-03-12 02:15:09,830: INFO: common]: created directory at: Artifacts/data_ingestion
[2025-03-12 02:15:09,833: INFO: common]: created directory at: Artifacts/data_transformation


Map: 100%|██████████| 14732/14732 [00:18<00:00, 789.59 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 875.56 examples/s]
Map: 100%|██████████| 818/818 [00:01<00:00, 816.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 105699.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 25664.45 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 20221.61 examples/s]
