# Data transformation

In this notebook we focus on transforming our raw data into a format better suited to further analysis or model training.

We first need to make sure that we are working in the correct directory: the working directory should be the project root, `mlopsProject`. Run the directory-change cell below only once on your local machine, or restart the kernel if you want to rerun all cells.

In [1]:
import os

This assumes that `03_data_transformation.ipynb` is located in `mlopsProject/research`.

In [2]:
# Move up to the parent directory exactly once per kernel session.
# `os.chdir('../')` is relative to the current working directory, so without
# this guard every re-run of the cell would climb one more level.
if '_PROJECT_ROOT' not in globals():
    os.chdir('../')
    _PROJECT_ROOT = os.getcwd()

current_path = os.getcwd()
print(current_path) # Should be /mlopsProject

/home/corti/Desktop/mlopsProject


In [3]:
from dataclasses import dataclass
from pathlib import Path

In [4]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data transformation stage.

    Attributes:
        root_dir: Root directory where data transformation artifacts are stored.
        data_path: Path to the dataset that will be transformed.
        tokenizer_name: Name of the tokenizer used to transform the data.
    """

    root_dir: Path
    data_path: Path
    tokenizer_name: str

In [5]:
from ConversationSummarizer.constants import *
from ConversationSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Read the configuration and parameters files and build stage configs.

    Args:
        config_filepath: Path of the configuration YAML file.
        params_filepath: Path of the parameters YAML file.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        # Load both YAML files once; the stage getters read from these objects.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Request the artifacts root directory named in the configuration.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration object for the data transformation stage.

        The stage's root directory is passed to `create_directories` before
        the configuration object is assembled.

        Returns:
            DataTransformationConfig: `root_dir`, `data_path` and
                `tokenizer_name` taken from the `data_transformation`
                section of the loaded configuration.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_path=stage.data_path,
            tokenizer_name=stage.tokenizer_name,
        )

In [7]:
import os
from transformers import AutoTokenizer
from datasets import load_from_disk
from ConversationSummarizer.logging import logger

  from .autonotebook import tqdm as notebook_tqdm


[2024-01-31 23:34:58,590: INFO: config: PyTorch version 2.2.0 available.]


In [8]:
class DataTransformation:
    """Tokenize a dataset stored on disk and save the processed copy.

    Args:
        config: Settings providing `tokenizer_name`, `data_path` and
            `root_dir`.
    """

    def __init__(self, config: "DataTransformationConfig"):
        self.config = config
        # Load the tokenizer from the pretrained model specified in the config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """
        Convert a batch of examples to model features.

        Args:
            example_batch: A batch of examples; must contain the
                'dialogue' and 'summary' keys.

        Returns:
            Dict: A dictionary with keys 'input_ids', 'attention_mask', and 'labels'.
                'input_ids' and 'attention_mask' come from the tokenized
                dialogues; 'labels' holds the 'input_ids' of the tokenized
                summaries.

        Raises:
            ValueError: If 'dialogue' or 'summary' keys are not in example_batch.
        """
        if 'dialogue' not in example_batch or 'summary' not in example_batch:
            raise ValueError("'dialogue' and 'summary' keys must be in example_batch")

        # NOTE(review): MAX_INPUT_LENGTH / MAX_TARGET_LENGTH are not defined in
        # this notebook; presumably they come from the constants star import.
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
        )

        # Tokenize the summaries as targets. The `text_target` argument
        # replaces the deprecated `as_target_tokenizer()` context manager.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }

    def convert(self):
        """Load the dataset, tokenize it in batches, and save the result.

        The dataset is read from `config.data_path` and the processed copy is
        written to `<config.root_dir>/samsum_dataset_processed`.
        """
        samsum_dataset = load_from_disk(self.config.data_path)

        # batched=True hands `convert_examples_to_features` batches of
        # examples rather than one example at a time.
        samsum_dataset_processed = samsum_dataset.map(
            self.convert_examples_to_features,
            batched=True,
        )

        output_dir = os.path.join(self.config.root_dir, 'samsum_dataset_processed')
        samsum_dataset_processed.save_to_disk(output_dir)
                

In [9]:
try:
    # Instantiate ConfigurationManager and get the data transformation configuration
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()

    # Instantiate DataTransformation with the configuration and perform data transformation
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()

except Exception as e:
    # Top-level boundary: log every failure with its traceback, then re-raise
    # it unchanged. Catching only TypeError/AttributeError would skip logging
    # for other errors, e.g. the ValueError raised by
    # convert_examples_to_features.
    logger.exception(f"An error occurred during data transformation: {str(e)}")
    raise

[2024-01-31 23:34:58,764: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-01-31 23:34:58,765: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-31 23:34:58,766: INFO: common: created directory at: artifacts]
[2024-01-31 23:34:58,767: INFO: common: created directory at: artifacts/data_transformation]


Map: 100%|██████████| 14732/14732 [00:02<00:00, 5526.11 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 5241.27 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 5645.90 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 130972.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 120683.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 155260.23 examples/s]
