In [1]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install torch

Collecting torch
  Using cached torch-2.0.1-cp311-cp311-win_amd64.whl (172.3 MB)
Collecting sympy (from torch)
  Using cached sympy-1.12-py3-none-any.whl (5.7 MB)
Collecting networkx (from torch)
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting jinja2 (from torch)
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Obtaining dependency information for MarkupSafe>=2.0 from https://files.pythonhosted.org/packages/be/bb/08b85bc194034efbf572e70c3951549c8eca0ada25363afc154386b5390a/MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl.metadata
  Downloading MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Collecting mpmath>=0.19 (from sympy->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Downloading MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl (17 kB)
Installing collected packages: mpmath, sympy, networkx, MarkupSafe, jinja2, torch
Successfully installed MarkupSafe-2.1.3 jinja2-3.1.2 mpmath-1.3.0 networkx-3.1 s

In [12]:
!pip install protobuf


Collecting protobuf
  Obtaining dependency information for protobuf from https://files.pythonhosted.org/packages/5e/46/5b9674a33cbf690ffdd79ab1863767a66461cd06ea7aeb9f90e4e50be7a5/protobuf-4.24.3-cp310-abi3-win_amd64.whl.metadata
  Downloading protobuf-4.24.3-cp310-abi3-win_amd64.whl.metadata (540 bytes)
Downloading protobuf-4.24.3-cp310-abi3-win_amd64.whl (430 kB)
   ---------------------------------------- 0.0/430.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/430.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/430.5 kB ? eta -:--:--
   -- ------------------------------------- 30.7/430.5 kB ? eta -:--:--
   --- ----------------------------------- 41.0/430.5 kB 653.6 kB/s eta 0:00:01
   ------- ------------------------------- 81.9/430.5 kB 651.6 kB/s eta 0:00:01
   ------------------------ --------------- 266.2/430.5 kB 1.6 MB/s eta 0:00:01
   ---------------------------- ----------- 307.2/430.5 kB 1.9 MB/s eta 0:00:01
   ------------------------

In [2]:
import os

# The cells below import the `src` package and read config files through relative
# paths, so the working directory must be the project root (one level above this
# notebook). The guard keeps the cell idempotent: re-running it, or starting the
# kernel at the project root, no longer climbs one directory too far.
if not os.path.isdir("src"):
    os.chdir("../")

In [36]:

import json
import sys
from dataclasses import dataclass
from pathlib import Path
from src.constants.constants import CONFIG_PATH, PARAMS_PATH
from src.loging import logger
from src.utils.common import get_size, create_directories
from src.utils.common import read_yaml
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict

In [21]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""
    # Output directory of the stage; the tokenised "transformed_data" dataset is saved under it.
    root_dir: Path
    # Directory holding train.json / test.json / val.json; the intermediate
    # "transformers_format_data" dataset is also written here.
    data_path: Path
    # Identifier passed to AutoTokenizer.from_pretrained.
    tokenizer_name: str

In [22]:
class ConfigurationManager:
    """Loads the config and params files and hands out per-stage config objects."""

    def __init__(self, config_path, params_path):
        """Read both YAML files and make sure the stage's output directory exists.

        Args:
            config_path: Location of the main configuration file, given to ``read_yaml``.
            params_path: Location of the parameters file, given to ``read_yaml``.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        stage_root = self.config["data_transformation"]["root_dir"]
        create_directories([stage_root])

    def get_data_transformation_config(self):
        """Build a ``DataTransformationConfig`` from the ``data_transformation`` section.

        Returns:
            DataTransformationConfig with ``root_dir`` and ``data_path`` wrapped in
            ``Path`` and ``tokenizer_name`` passed through unchanged.
        """
        section = self.config["data_transformation"]
        settings = {
            "root_dir": Path(section["root_dir"]),
            "data_path": Path(section["data_path"]),
            "tokenizer_name": section["tokenizer_name"],
        }
        return DataTransformationConfig(**settings)

 

In [41]:
class DataTransformation:
    """Turns the raw JSON dialogue/summary splits into tokenised datasets.

    The stage runs in two steps:

    1. ``transform_data_to_transformers_format`` reads ``train.json``, ``test.json``
       and ``val.json`` from ``config.data_path`` and saves them as a ``DatasetDict``
       under ``<data_path>/transformers_format_data``.
    2. ``convert_to_features`` reloads that ``DatasetDict``, tokenises every split and
       saves the result under ``<root_dir>/transformed_data``.
    """

    # Split name in the DatasetDict -> JSON file name inside config.data_path.
    SPLIT_FILES = {
        "train": "train.json",
        "test": "test.json",
        "validation": "val.json",
    }

    # Truncation limits (in tokens) for the dialogue and the summary; kept as class
    # attributes so they apply even when no override is passed to __init__.
    max_input_length = 1024
    max_target_length = 128

    def __init__(self, config, max_input_length=None, max_target_length=None):
        """Store the config and load the tokenizer named by ``config.tokenizer_name``.

        Args:
            config: Object exposing ``root_dir``, ``data_path`` and ``tokenizer_name``.
            max_input_length: Optional override for the dialogue truncation length.
            max_target_length: Optional override for the summary truncation length.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)
        if max_input_length is not None:
            self.max_input_length = max_input_length
        if max_target_length is not None:
            self.max_target_length = max_target_length

    def convert_examples_to_features(self, example_batch):
        """Tokenise one batch of examples.

        Args:
            example_batch: Mapping with a ``'dialogue'`` and a ``'summary'`` entry,
                each holding the batch's texts.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` from the tokenised
            dialogues and ``labels`` set to the ``input_ids`` of the tokenised summaries.
        """
        input_encodings = self.tokenizer(
            example_batch["dialogue"],
            max_length=self.max_input_length,
            truncation=True,
        )
        # text_target makes the tokenizer encode the summaries as targets.
        target_encodings = self.tokenizer(
            text_target=example_batch["summary"],
            max_length=self.max_target_length,
            truncation=True,
        )
        return {
            "input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"],
        }

    def _load_split_records(self, file_name):
        """Read one JSON split file from ``config.data_path`` and return its records.

        Raises:
            ValueError: If the file does not hold a non-empty JSON list.
        """
        path = os.path.join(self.config.data_path, file_name)
        # Explicit UTF-8: without it open() uses the platform's locale encoding,
        # which can fail on or corrupt non-ASCII text in the dialogues.
        with open(path, "r", encoding="utf-8") as file:
            records = json.load(file)
        if not isinstance(records, list) or not records:
            raise ValueError(f"{path} must contain a non-empty JSON list of records")
        return records

    @staticmethod
    def _records_to_dataset(records):
        """Pivot a list of record dicts into columns and wrap them in a ``Dataset``."""
        # The first record defines the column names; every record must carry them all.
        columns = {key: [record[key] for record in records] for key in records[0]}
        return Dataset.from_dict(columns)

    def transform_data_to_transformers_format(self):
        """Convert the JSON splits to a ``DatasetDict`` and save it to disk.

        The result is written to ``<data_path>/transformers_format_data``.

        Raises:
            ValueError: If any split file is empty or is not a JSON list.
        """
        dataset_dict = DatasetDict({
            split: self._records_to_dataset(self._load_split_records(file_name))
            for split, file_name in self.SPLIT_FILES.items()
        })

        save_path = os.path.join(self.config.data_path, "transformers_format_data")
        dataset_dict.save_to_disk(save_path)

    def convert_to_features(self):
        """Tokenise the saved splits and store them under ``<root_dir>/transformed_data``.

        Expects ``transform_data_to_transformers_format`` to have been run first, since
        it reads ``<data_path>/transformers_format_data``.
        """
        load_path = os.path.join(self.config.data_path, "transformers_format_data")
        loaded_dataset_dict = load_from_disk(load_path)

        processed_dataset_dict = DatasetDict({
            split: loaded_dataset_dict[split].map(
                self.convert_examples_to_features, batched=True
            )
            for split in self.SPLIT_FILES
        })

        save_path = os.path.join(self.config.root_dir, "transformed_data")
        processed_dataset_dict.save_to_disk(save_path)
        
        
        

In [42]:
# Run the data-transformation stage end to end: build the stage config, export the
# JSON splits in datasets format, then tokenise them.
try:
    logger.info(">>>>> stage: data_transformation")
    config_manager = ConfigurationManager(CONFIG_PATH, PARAMS_PATH)
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)
    data_transformation.transform_data_to_transformers_format()
    data_transformation.convert_to_features()
    logger.info("stage completed successfully")
except Exception as e:
    # logger.exception records the traceback along with the message; a bare
    # `raise` re-raises the active exception without adding a frame for this line.
    logger.exception(e)
    raise
    

[2023-09-08 10:44:27,254]: INFO: 49870760: >>>>> stage: data_transformation]
[2023-09-08 10:44:27,260]: INFO: common: Successfully read yaml file from config\config.yaml]
[2023-09-08 10:44:27,263]: INFO: common: Successfully read yaml file from params.yaml]
[2023-09-08 10:44:27,266]: INFO: common: Created directory: data/data_transformation]


Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 728644.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 81921.56 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 60473.09 examples/s]
Map: 100%|██████████| 14732/14732 [00:05<00:00, 2502.19 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 2523.62 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 2531.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 306719.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 74223.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 55774.96 examples/s]

[2023-09-08 10:44:36,184]: INFO: 49870760: stage completed successfully]



