In [11]:
import os

In [12]:
# Move to the project root (MLOPS_Project/) so that relative paths such as
# config/config.yaml and artifacts/ resolve against it.
# Set the MLOPS_PROJECT_ROOT environment variable to point at the checkout on
# another machine; the original location is kept as the default.

os.chdir(os.environ.get("MLOPS_PROJECT_ROOT", "/home/nicola/Projects/MLOPS_Project"))

In [13]:
# Display the current working directory to confirm the chdir above took effect.
os.getcwd()

'/home/nicola/Projects/MLOPS_Project'

In [14]:
from dataclasses import dataclass 
from pathlib import Path

In [15]:
@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings consumed by the data preprocessing stage."""

    # Directory created for this stage; the tokenized dataset is saved under it.
    root_dir: Path
    # Location the raw dataset is read from and saved to.
    data_path: Path
    # Identifier handed to AutoTokenizer.from_pretrained.
    tokenizer_name: str

In [16]:
from mlopsProject.constants import *
from mlopsProject.utils.common import read_yaml, create_directory, read_jsonl_to_dataset

In [17]:
class ConfigurationManager:
    """Loads the project configuration files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read the config and params files and create the artifacts root.

        Args:
            config_filepath: Location of the configuration file passed to read_yaml.
            params_filepath: Location of the parameters file passed to read_yaml.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directory([artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Create the stage directory and return the data preprocessing settings.

        Returns:
            A DataPreprocessingConfig filled from the `data_preprocessing`
            section of the loaded configuration.
        """
        section = self.config.data_preprocessing

        create_directory([section.root_dir])

        # NOTE(review): values are forwarded exactly as read_yaml returned them;
        # they are not converted to Path here even though the dataclass
        # annotates root_dir/data_path as Path -- confirm the intended type.
        return DataPreprocessingConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [18]:
import os
from mlopsProject.logging import logger 
from transformers import AutoTokenizer
from datasets import DatasetDict, load_from_disk, concatenate_datasets

In [19]:
class DataPreprocessing:
    """Builds the raw DatasetDict from JSONL splits and tokenizes it.

    Paragraphs (with an instruction prefix) become the model inputs and the
    `questions_answers` column becomes the labels.
    """

    # Instruction prepended to every paragraph before it is tokenized.
    _INPUT_PREFIX = "Generate question and answer: "
    # Raw columns dropped once a sample has been tokenized.
    _RAW_COLUMNS = ["paragraph", "questions_answers", "answers", "questions"]
    # Split names, in the order they are inserted into the DatasetDict.
    _SPLITS = ("train", "test", "validation")

    def __init__(self, config: "DataPreprocessingConfig"):
        """Store the stage settings and load the tokenizer named in them.

        Args:
            config: Settings providing `root_dir`, `data_path` and `tokenizer_name`.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def create_DatasetDict(self):
        """Read `<split>.jsonl` for every split and save them as one DatasetDict.

        The DatasetDict is written to `config.data_path`, the same directory the
        JSONL files are read from.
        """
        # os.path.join accepts both str and path-like values, so this works
        # whether data_path is a plain string or a pathlib.Path (string
        # concatenation with "+" raises TypeError for a Path).
        raw_dataset = DatasetDict({
            split: read_jsonl_to_dataset(
                os.path.join(self.config.data_path, f"{split}.jsonl")
            )
            for split in self._SPLITS
        })
        raw_dataset.save_to_disk(self.config.data_path)

    def process(self):
        """Tokenize every split and save the result under `root_dir/tokenized_dataset`.

        Inputs and labels are padded to the longest (possibly truncated)
        encoding found across the train, validation and test splits.
        """
        raw_data = load_from_disk(self.config.data_path)

        # One combined view of all splits, used only to measure sequence lengths.
        all_samples = concatenate_datasets(
            [raw_data["train"], raw_data["validation"], raw_data["test"]]
        )

        def longest_encoding(texts_of):
            """Return the largest token count over all samples for the given texts."""
            encoded = all_samples.map(
                lambda batch: self.tokenizer(texts_of(batch), truncation=True),
                batched=True,
                remove_columns=self._RAW_COLUMNS,
            )
            return max(len(ids) for ids in encoded["input_ids"])

        # Measure inputs WITH the instruction prefix: the prefix is part of what
        # gets encoded below, so measuring the bare paragraph would make
        # max_length too small and truncate the longest inputs.
        tok_input_max = longest_encoding(
            lambda batch: [self._INPUT_PREFIX + text for text in batch["paragraph"]]
        )
        tok_target_max = longest_encoding(lambda batch: batch["questions_answers"])

        def preprocess_function(sample, padding="max_length"):
            """Encode one batch into input ids, attention mask and labels."""
            inputs = [self._INPUT_PREFIX + item for item in sample["paragraph"]]

            model_inputs = self.tokenizer(
                inputs, max_length=tok_input_max, padding=padding, truncation=True
            )

            labels = self.tokenizer(
                text_target=sample["questions_answers"],
                max_length=tok_target_max,
                padding=padding,
                truncation=True,
            )

            if padding == "max_length":
                # Replace padding token ids in the labels with -100
                # (presumably so the training loss ignores padded positions --
                # confirm against the training code).
                pad_id = self.tokenizer.pad_token_id
                labels["input_ids"] = [
                    [(token if token != pad_id else -100) for token in label]
                    for label in labels["input_ids"]
                ]

            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_dataset = raw_data.map(
            preprocess_function, batched=True, remove_columns=self._RAW_COLUMNS
        )
        tokenized_dataset.save_to_disk(
            os.path.join(self.config.root_dir, "tokenized_dataset")
        )
    

In [20]:
# Run the stage end to end: load the configuration, build the raw DatasetDict
# from the JSONL splits, then tokenize it and save the result.
try:
    config = ConfigurationManager()
    data_preprocessing_config = config.get_data_preprocessing_config()
    data_preprocessing = DataPreprocessing(config = data_preprocessing_config)
    data_preprocessing.create_DatasetDict()
    data_preprocessing.process()
    logger.info("Data preprocess completed!")
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise the
    # original exception unchanged so the cell still fails visibly.
    # NOTE(review): assumes `logger` is a standard logging.Logger -- confirm.
    logger.exception("Data preprocess failed")
    raise

[2024-01-28 22:28:02,933: INFO: common: file: config/config.yaml loaded correctly]
[2024-01-28 22:28:02,934: INFO: common: file: params.yaml loaded correctly]
[2024-01-28 22:28:02,936: INFO: common: directory artifacts created]
[2024-01-28 22:28:02,937: INFO: common: directory artifacts/data_preprocessing created]


Saving the dataset (1/1 shards): 100%|██████████| 16462/16462 [00:00<00:00, 508230.22 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2429/2429 [00:00<00:00, 320074.28 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2067/2067 [00:00<00:00, 283413.74 examples/s]
Map: 100%|██████████| 20958/20958 [00:02<00:00, 8565.22 examples/s]
Map: 100%|██████████| 20958/20958 [00:01<00:00, 12810.08 examples/s]
Map: 100%|██████████| 16462/16462 [00:12<00:00, 1371.39 examples/s]
Map: 100%|██████████| 2429/2429 [00:01<00:00, 1367.56 examples/s]
Map: 100%|██████████| 2067/2067 [00:01<00:00, 1347.62 examples/s]


Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (1/1 shards): 100%|██████████| 16462/16462 [00:00<00:00, 197654.46 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2429/2429 [00:00<00:00, 110557.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2067/2067 [00:00<00:00, 77749.62 examples/s]

[2024-01-28 22:28:25,001: INFO: 2523898715: Data preprocess completed!]



