In [4]:
import os

In [5]:
%pwd

'e:\\github_clone\\Patients-Condition-Classification-Using-Drug-Reviews\\research'

In [6]:
# Move from the notebook folder ("research") up to the project root so that
# relative paths such as config/config.yaml resolve. The guard keeps the cell
# idempotent: re-running it, or starting the kernel at the project root, must
# not climb above the root.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [7]:
%pwd

'e:\\github_clone\\Patients-Condition-Classification-Using-Drug-Reviews'

In [8]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationconfig:
    """Immutable settings for the data-transformation stage.

    Attributes:
        root_dir: Directory under which transformed artifacts are written.
        data_path: Directory holding one sub-folder per data split.
        tokenizer_name: Identifier passed to the tokenizer's ``from_pretrained``.
    """
    root_dir: Path
    data_path: Path
    tokenizer_name: Path

In [9]:
from PatientConditonClassification.constants import *
from PatientConditonClassification.utils.common import read_yaml, create_directories
import torch
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

In [10]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and request creation of the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationconfig:
        """Assemble the data-transformation settings.

        The stage's root directory is passed to ``create_directories`` before
        the settings object is built.

        Returns:
            A ``DataTransformationconfig`` filled from the
            ``data_transformation`` section of the loaded configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationconfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [11]:
from transformers import DistilBertModel, DistilBertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pickle
import pandas as pd
import numpy as np
import json

In [20]:
class DataTransformation:
    """Tokenizes the drug-review splits and stores them as pickled tensors.

    For every split in ``self.files`` the class reads
    ``<data_path>/<split>/drug_review.csv``, tokenizes the ``review`` column,
    encodes the ``condition`` column with ``self.mapping`` and writes the result
    to ``<root_dir>/drug_review/<split>/processed_data.pkl``. For the ``train``
    split it also writes balanced class weights to
    ``<root_dir>/class_weights.json``.
    """

    def __init__(self, batch, config= DataTransformationconfig):
        """Load the tokenizer and remember the run settings.

        Args:
            batch: Batch size; stored on the instance as ``self.batch``. The
                methods of this class do not read it.
            config: A ``DataTransformationconfig`` instance. It must be passed
                explicitly: the default is the dataclass type itself, which has
                no ``tokenizer_name`` attribute and therefore cannot be used.
        """
        self.config = config
        self.tokenizer = DistilBertTokenizer.from_pretrained(config.tokenizer_name)
        self.batch = batch
        self.files = ['test', 'train', 'val']
        # Condition name -> integer class id used for the label tensor.
        self.mapping = {'Depression':0, 'Pain':1, 'Anxiety':2, 'Acne':3, 'Birth Control':4}

    def convert_sample_to_feature(self, sample_batch, labels, max_length=512):
        """Tokenize a batch of texts and turn texts and labels into tensors.

        Args:
            sample_batch: Sequence of review texts exposing ``.tolist()``
                (a pandas Series in this notebook).
            labels: Sequence of numeric labels exposing ``.tolist()``.
            max_length: Token length every sequence is padded or truncated to.

        Returns:
            Tuple ``(seq, mask, label)`` of tensors holding the token ids, the
            attention mask and the labels.
        """
        encoded = self.tokenizer.batch_encode_plus(
            sample_batch.tolist(),
            max_length=max_length,
            # ``padding='max_length'`` replaces the deprecated
            # ``pad_to_max_length=True`` flag.
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
        )
        seq = torch.tensor(encoded['input_ids'])
        mask = torch.tensor(encoded['attention_mask'])
        label = torch.tensor(labels.tolist())

        return seq, mask, label

    def _encode_labels(self, conditions, split):
        """Map condition names to class ids, rejecting unknown conditions.

        Raises:
            ValueError: If a condition is missing from ``self.mapping``. Without
                this check such rows would become NaN labels and silently turn
                the label tensor into floats.
        """
        labels = conditions.map(self.mapping)
        unmapped = labels.isna()
        if unmapped.any():
            unknown = sorted(set(conditions[unmapped].astype(str)))
            raise ValueError(
                f"Split '{split}' contains conditions missing from the label mapping: {unknown}"
            )
        return labels

    @staticmethod
    def _save_class_weights(conditions, output_path):
        """Compute balanced class weights for ``conditions`` and dump them as JSON."""
        classes = np.unique(conditions)
        weights = compute_class_weight(class_weight="balanced", classes=classes, y=conditions)
        # Plain str/float values keep the JSON dump independent of numpy scalar types.
        class_weights = {str(name): float(weight) for name, weight in zip(classes, weights)}

        with open(output_path, 'w') as json_file:
            json.dump(class_weights, json_file, indent=2)

    def save_transformed_data(self):
        """Transform every split and write the artifacts below ``root_dir``.

        Raises:
            ValueError: If a split contains a condition that is missing from
                ``self.mapping``.
        """
        root_dir = Path(self.config.root_dir)

        for split in self.files:
            df = pd.read_csv(os.path.join(self.config.data_path, split, 'drug_review.csv'))
            labels = self._encode_labels(df['condition'], split)
            seq, mask, label = self.convert_sample_to_feature(sample_batch=df['review'], labels=labels)

            # Create the output folder before writing any file below root_dir.
            split_dir = root_dir / 'drug_review' / split
            create_directories([split_dir])

            if split == "train":
                # Class weights are derived from the training split only.
                self._save_class_weights(df['condition'], root_dir / "class_weights.json")

            with open(split_dir / 'processed_data.pkl', 'wb') as f:
                processed_data = {'seq': seq, 'mask': mask, 'label': label}
                pickle.dump(processed_data, f)

In [21]:
# Run the data-transformation stage end to end. There is deliberately no
# try/except here: a handler that only re-raises adds nothing, and any failure
# should surface with its original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(batch=32, config=data_transformation_config)
data_transformation.save_transformed_data()

[2023-11-15 20:46:31,611: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-11-15 20:46:31,612: INFO: common: yaml file: params.yaml loaded successfully]
[2023-11-15 20:46:31,612: INFO: common: created directory at: artifacts]
[2023-11-15 20:46:31,612: INFO: common: created directory at: artifacts/data_transformation]




[2023-11-15 20:47:07,626: INFO: common: created directory at: artifacts\data_transformation\drug_review\test]




[2023-11-15 20:47:46,069: INFO: common: created directory at: artifacts\data_transformation\drug_review\train]




[2023-11-15 20:48:03,840: INFO: common: created directory at: artifacts\data_transformation\drug_review\val]
