In [1]:
from importlib.metadata import version
# Report the installed versions of the two libraries this notebook depends on
# (torch first, then tiktoken), one line per package.
for package_name in ("torch", "tiktoken"):
    print("version: ", version(package_name))

version:  2.4.1
version:  0.7.0


In [2]:
import os

In [3]:
# Move the kernel's working directory one level up so that the relative paths and
# the `Cognitext` package import used by later cells resolve from there.
# NOTE: the path is relative to the *current* working directory, so this cell is
# not idempotent -- re-running it without restarting the kernel moves up again.
os.chdir("../")

In [4]:
# Show the kernel's current working directory (as left by the os.chdir cell above).
%pwd

'/home/aman/Desktop/Cognitext'

In [6]:
from dataclasses import dataclass
from pathlib import Path

In [22]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by ``DataTransformation``.

    Instances are built by ``ConfigurationManager.get_transformation_config``.
    NOTE(review): that method passes the YAML values through without converting
    them, so the ``Path``-annotated fields may be plain strings at runtime -- confirm.
    """

    # Directory that DataTransformation.__init__ creates via create_directories.
    root_dir: Path
    # Location handed to datasets.load_from_disk; the loaded object must expose a "text" column.
    data_path: Path
    # Encoding name passed to tiktoken.get_encoding.
    tokenizer: str
    # Number of token ids in each input/target window.
    max_length: int
    # Step, in tokens, between the starts of consecutive windows.
    stride: int 
    # Despite the name, DataTransformation.save opens this path as the output *file*.
    save_dir: Path

In [8]:
from Cognitext.constants import *
from Cognitext.utils.common import read_yaml, create_directories

In [23]:
class ConfigurationManager:
    """Reads the project's config and params YAML files and builds stage configs.

    Both files are parsed with ``read_yaml``; the parsed objects are then accessed
    by attribute (``self.config.artifacts_root``, ``self.params.DataLoaderParams``).
    """

    def __init__(self, config=CONFIG_FILE_PATH, params=PARAMS_FILE_PATH):
        """Parse both YAML files and create the artifacts root directory.

        Args:
            config: Path of the main configuration YAML file.
            params: Path of the parameters YAML file.
        """
        self.config = read_yaml(config)
        self.params = read_yaml(params)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_transformation_config(self) -> DataTransformationConfig:
        """Assemble the ``DataTransformationConfig`` for the transformation stage.

        Paths and the tokenizer name are read from the ``data_transformation``
        section of the config file; ``max_length`` and ``stride`` are read from
        the ``DataLoaderParams`` section of the params file.
        """
        stage_section = self.config.data_transformation
        loader_params = self.params.DataLoaderParams

        # Collect the fields first (same lookup order as the dataclass fields are
        # passed), then build the frozen config in one step.
        field_values = {
            "root_dir": stage_section.root_dir,
            "data_path": stage_section.data_path,
            "tokenizer": stage_section.tokenizer,
            "max_length": loader_params.max_length,
            "stride": loader_params.stride,
            "save_dir": stage_section.save_dir,
        }
        return DataTransformationConfig(**field_values)

In [10]:

import tiktoken
from datasets import load_from_disk
import re

[2024-09-19 16:12:12,310: INFO: utils: NumExpr defaulting to 4 threads.]


  from .autonotebook import tqdm as notebook_tqdm


[2024-09-19 16:12:14,743: INFO: config: PyTorch version 2.4.1 available.]
[2024-09-19 16:12:14,749: INFO: config: TensorFlow version 2.17.0 available.]


In [11]:
import torch

In [12]:
from torch.utils.data import Dataset, DataLoader

In [13]:
import json
import csv

In [29]:
class DataTransformation(Dataset):
    """Map-style dataset of sliding-window (input, target) token-id pairs.

    On construction the dataset stored at ``config.data_path`` is loaded, its
    ``"text"`` column is joined with single spaces into one string, that string
    is tokenized with the tiktoken encoding named by ``config.tokenizer``, and
    the token stream is cut into windows of ``config.max_length`` tokens whose
    starts are ``config.stride`` tokens apart. Each target window is the input
    window shifted one token to the right.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Load, tokenize and window the corpus.

        Args:
            config: Settings object providing ``root_dir``, ``data_path``,
                ``tokenizer``, ``max_length``, ``stride`` and ``save_dir``.
                It is required: the previous default was the
                ``DataTransformationConfig`` class itself, which has no field
                values and failed on the first attribute access.
        """
        self.config = config
        create_directories([self.config.root_dir])

        dataset = load_from_disk(self.config.data_path)
        corpus = " ".join(dataset["text"])

        self.input_ids = []
        self.target_ids = []

        self.tokenizer = tiktoken.get_encoding(self.config.tokenizer)
        # The special token is spelled "<|endoftext|>" (with angle brackets).
        # Without them the allow-list never matches, and a corpus containing
        # the marker is rejected by tiktoken's disallowed-special check.
        token_ids = self.tokenizer.encode(corpus, allowed_special={"<|endoftext|>"})

        max_length = self.config.max_length
        stride = self.config.stride
        # Stop at len - max_length so the target window (shifted by one token)
        # never runs past the end of the token stream.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start: start + max_length]
            target_chunk = token_ids[start + 1: start + max_length + 1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows.

        Implemented as the real ``__len__`` protocol method so that ``len(ds)``
        and ``DataLoader`` samplers work.
        """
        return len(self.input_ids)

    def _len_(self):
        """Backward-compatible alias for ``len(self)`` (the original method name)."""
        return len(self)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

    def save(self, file_type="json"):
        """Write every window to ``config.save_dir`` as JSON or CSV.

        Args:
            file_type: ``"json"`` (default) or ``"csv"``.

        Raises:
            ValueError: If ``file_type`` is neither ``"json"`` nor ``"csv"``.

        Note:
            The output path is always ``config.save_dir`` regardless of
            ``file_type``; the file extension is not adjusted.
        """
        # Validate before serialising so an unsupported type fails fast instead
        # of first converting every tensor to a list.
        if file_type not in ("json", "csv"):
            raise ValueError("Unsupported file type. Please choose either 'json' or 'csv'.")

        file_path = self.config.save_dir
        data_to_save = [
            {
                "input_ids": input_tensor.tolist(),
                "target_ids": target_tensor.tolist()
            }
            for input_tensor, target_tensor in zip(self.input_ids, self.target_ids)
        ]

        if file_type == "json":
            with open(file_path, "w") as json_file:
                json.dump(data_to_save, json_file, indent=4)
            print(f"Data successfully saved to {file_path} as JSON.")
        else:
            # Each CSV cell holds the string form of the token-id list.
            with open(file_path, "w", newline="") as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=["input_ids", "target_ids"])
                writer.writeheader()
                writer.writerows(data_to_save)
            print(f"Data successfully saved to {file_path} as CSV.")


In [30]:
# Run the data-transformation stage end to end: read the configuration, build the
# windowed dataset, report how many samples it holds, and persist it to disk.
# There is deliberately no try/except here: the previous handler only did
# `raise e`, which handles nothing and just adds a redundant traceback frame.
config = ConfigurationManager()
get_transformation_config = config.get_transformation_config()
transformation = DataTransformation(config=get_transformation_config)
# The sample count was previously computed and silently discarded; surface it.
print(f"Number of samples: {transformation._len_()}")
transformation.save()

[2024-09-19 17:55:02,030: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-19 17:55:02,049: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-19 17:55:02,057: INFO: common: created directory at: artifacts]
[2024-09-19 17:55:02,064: INFO: common: created directory at: artifacts/data_transformation]


Data successfully saved to artifacts/data_transformation/Data.json as JSON.
