## Data Transformation

In [1]:
import os
%pwd

'd:\\MLOPs\\End to end NLP Project with HuggingFace and Transformers\\research'

In [2]:
os.chdir("../")

In [3]:
%pwd

'd:\\MLOPs\\End to end NLP Project with HuggingFace and Transformers'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: Path

In [5]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml,create_directories

In [6]:
class ConfigurationManager:
    def __init__(self, config_path=CONFIG_FILE_PATH,
                params_filepath=PARAMS_FILE_PATH):
        self.config=read_yaml(config_path)
        self.params=read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self)->DataTransformationConfig:
        config=self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config=DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name=config.tokenizer_name
        )
        return data_transformation_config

In [7]:
import os
from src.textSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


## Data Transformation Componenet

In [8]:
from pathlib import Path

# Set the root of your project explicitly
project_root = Path(r"D:\MLOPs\End to end NLP Project with HuggingFace and Transformers")

raw_data_dir = project_root / "artifacts/data_ingestion"

train_csv = raw_data_dir / "samsum-train.csv"
print("Looking for train CSV at:", train_csv.resolve())
print("Exists?", train_csv.exists())


Looking for train CSV at: D:\MLOPs\End to end NLP Project with HuggingFace and Transformers\artifacts\data_ingestion\samsum-train.csv
Exists? True


In [9]:
# import os
# from pathlib import Path
# import pandas as pd
# from datasets import Dataset, DatasetDict
# from transformers import AutoTokenizer

# # ------------------------------
# # Configuration class
# # ------------------------------
# class DataTransformationConfig:
#     def __init__(self):
#         # Tokenizer and dataset paths
#         self.tokenizer_name = "google/pegasus-cnn_dailymail"
        
#         # ✅ Raw CSV dataset folder (directly where CSVs are)
#         self.raw_data_dir = Path("artifacts/data_ingestion")
        
#         # Folder to save tokenized dataset
#         self.transformed_data_root = Path("artifacts/data_transformation")
#         self.transformed_data_path = self.transformed_data_root / "samsum_dataset"

#         # Tokenization settings
#         self.max_input_length = 512
#         self.max_target_length = 128

# # ------------------------------
# # Data Transformation class
# # ------------------------------
# class DataTransformation:
#     def __init__(self, config: DataTransformationConfig):
#         self.config = config
#         self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

#     def convert_examples_to_features(self, example_batch):
#         """Tokenize dialogue and summary"""
#         inputs = self.tokenizer(
#             example_batch['dialogue'],
#             max_length=self.config.max_input_length,
#             padding='max_length',
#             truncation=True
#         )
#         targets = self.tokenizer(
#             example_batch['summary'],
#             max_length=self.config.max_target_length,
#             padding='max_length',
#             truncation=True
#         )
#         return {
#             'input_ids': inputs['input_ids'],
#             'attention_mask': inputs['attention_mask'],
#             'labels': targets['input_ids']
#         }

#     def convert(self):
#         # 1️⃣ Load CSVs directly from raw_data_dir
#         train_csv = self.config.raw_data_dir / "samsum-train.csv"
#         val_csv = self.config.raw_data_dir / "samsum-validation.csv"
#         test_csv = self.config.raw_data_dir / "samsum-test.csv"

#         # ✅ Check if files exist
#         for file_path in [train_csv, val_csv, test_csv]:
#             if not file_path.exists():
#                 raise FileNotFoundError(f"CSV file not found: {file_path}")

#         train_df = pd.read_csv(train_csv)
#         val_df = pd.read_csv(val_csv)
#         test_df = pd.read_csv(test_csv)

#         # 2️⃣ Convert pandas DataFrames to Hugging Face Datasets
#         dataset = DatasetDict({
#             'train': Dataset.from_pandas(train_df),
#             'validation': Dataset.from_pandas(val_df),
#             'test': Dataset.from_pandas(test_df)
#         })

#         # 3️⃣ Tokenize datasets
#         tokenized_dataset = dataset.map(self.convert_examples_to_features, batched=True)

#         # 4️⃣ Save tokenized dataset
#         os.makedirs(self.config.transformed_data_root, exist_ok=True)
#         tokenized_dataset.save_to_disk(self.config.transformed_data_path)
#         print(f"Tokenized dataset saved at {self.config.transformed_data_path}")

# # ------------------------------
# # Usage
# # ------------------------------
# if __name__ == "__main__":
#     config = DataTransformationConfig()
#     transformer = DataTransformation(config=config)
#     transformer.convert()


In [10]:
import os
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# ------------------------------
# Configuration class
# ------------------------------
class DataTransformationConfig:
    def __init__(self):
        # Tokenizer and dataset paths
        self.tokenizer_name = "google/pegasus-cnn_dailymail"
        
        # ✅ Raw CSV dataset folder (directly where CSVs are)
        self.raw_data_dir = Path("artifacts/data_ingestion")
        
        # Folder to save tokenized dataset
        self.transformed_data_root = Path("artifacts/data_transformation")
        self.transformed_data_path = self.transformed_data_root / "samsum_dataset"

        # Tokenization settings
        self.max_input_length = 512
        self.max_target_length = 128

# ------------------------------
# Data Transformation class
# ------------------------------
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """Tokenize dialogue and summary safely"""
        # ✅ Ensure all values are strings and handle NaN properly
        dialogues = [str(x) if x is not None else "" for x in example_batch['dialogue']]
        summaries = [str(x) if x is not None else "" for x in example_batch['summary']]

        inputs = self.tokenizer(
            dialogues,
            max_length=self.config.max_input_length,
            padding='max_length',
            truncation=True
        )
        targets = self.tokenizer(
            summaries,
            max_length=self.config.max_target_length,
            padding='max_length',
            truncation=True
        )
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'labels': targets['input_ids']
        }

    def convert(self):
        # 1️⃣ Load CSVs directly from raw_data_dir
        train_csv = self.config.raw_data_dir / "samsum-train.csv"
        val_csv = self.config.raw_data_dir / "samsum-validation.csv"
        test_csv = self.config.raw_data_dir / "samsum-test.csv"

        # ✅ Check if files exist
        for file_path in [train_csv, val_csv, test_csv]:
            if not file_path.exists():
                raise FileNotFoundError(f"CSV file not found: {file_path}")

        # ✅ Read CSVs and drop rows with missing dialogue/summary
        train_df = pd.read_csv(train_csv).dropna(subset=['dialogue', 'summary'])
        val_df = pd.read_csv(val_csv).dropna(subset=['dialogue', 'summary'])
        test_df = pd.read_csv(test_csv).dropna(subset=['dialogue', 'summary'])

        # Debug: Print small sample before tokenization
        print("Sample of train data after cleaning:")
        print(train_df.head())

        # 2️⃣ Convert pandas DataFrames to Hugging Face Datasets
        dataset = DatasetDict({
            'train': Dataset.from_pandas(train_df),
            'validation': Dataset.from_pandas(val_df),
            'test': Dataset.from_pandas(test_df)
        })

        # 3️⃣ Tokenize datasets
        tokenized_dataset = dataset.map(self.convert_examples_to_features, batched=True)

        # 4️⃣ Save tokenized dataset
        os.makedirs(self.config.transformed_data_root, exist_ok=True)
        tokenized_dataset.save_to_disk(self.config.transformed_data_path)
        print(f"✅ Tokenized dataset saved at {self.config.transformed_data_path.resolve()}")

# ------------------------------
# Usage
# ------------------------------
if __name__ == "__main__":
    config = DataTransformationConfig()
    transformer = DataTransformation(config=config)
    transformer.convert()


Sample of train data after cleaning:
         id                                           dialogue  \
0  13818513  Amanda: I baked  cookies. Do you want some?\r\...   
1  13728867  Olivia: Who are you voting for in this electio...   
2  13681000  Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...   
3  13730747  Edward: Rachel, I think I'm in ove with Bella....   
4  13728094  Sam: hey  overheard rick say something\r\nSam:...   

                                             summary  
0  Amanda baked cookies and will bring Jerry some...  
1  Olivia and Olivier are voting for liberals in ...  
2  Kim may try the pomodoro technique recommended...  
3  Edward thinks he is in love with Bella. Rachel...  
4  Sam is confused, because he overheard Rick com...  


Map: 100%|██████████| 14731/14731 [00:09<00:00, 1502.42 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 1959.24 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 1395.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14731/14731 [00:00<00:00, 190682.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 48196.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 53044.90 examples/s]

✅ Tokenized dataset saved at D:\MLOPs\End to end NLP Project with HuggingFace and Transformers\artifacts\data_transformation\samsum_dataset





In [12]:
from datasets import load_from_disk

dataset_path = "D:/MLOPs/End to end NLP Project with HuggingFace and Transformers/artifacts/data_transformation/samsum_dataset"
dataset_samsum_pt = load_from_disk(dataset_path)

print(dataset_samsum_pt)
print(dataset_samsum_pt["train"][0])  # just to verify


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
})
{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.', '__index_level_0__': 0, 'input_ids': [12195, 151, 125, 7091, 3659, 107, 842, 119, 245, 181, 152, 10508, 151, 7435, 147, 12195, 151, 125, 131, 267, 650, 119, 3469, 29344, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 