In [1]:
import os

In [2]:
# Display the current working directory. os.getcwd() is plain Python, so the
# cell does not depend on IPython automagic resolving a bare `pwd` call.
os.getcwd()

'f:\\Files\\DSML\\Condition2Cure\\notebook'

In [3]:
# Step up to the project root only when we are not already there, so that
# re-running this cell does not keep climbing the directory tree.
# The marker is the relative config/config.yaml file that the pipeline loads.
if not os.path.isfile(os.path.join("config", "config.yaml")):
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir cell. os.getcwd() is plain
# Python, so the cell does not depend on IPython automagic.
os.getcwd()

'f:\\Files\\DSML\\Condition2Cure'

In [5]:
from pathlib import Path
from dataclasses import dataclass

In [6]:
from Condition2Cure.utils.helpers import *
from Condition2Cure.constants import *
from Condition2Cure.utils.execptions import *

In [7]:
@dataclass
class DataTransformationConfig:
    """Settings for the data-transformation (feature engineering) step.

    A plain value container: the dataclass itself performs no validation or
    type coercion, so every field holds whatever the caller passes in.
    """
    # Output directory of this stage; ConfigurationManager creates it before
    # building this object.
    root_dir: Path
    # CSV file read by DataTransformation.transform via pd.read_csv.
    cleaned_data_path: str
    # Name of the label column in the cleaned CSV.
    target_column: str
    # Passed to TfidfVectorizer(max_features=...).
    max_features: int
    # Passed to TfidfVectorizer(ngram_range=...) after a tuple() conversion,
    # so any two-item sequence loaded from YAML is accepted downstream.
    ngram_range: tuple
    # Passed to TruncatedSVD(n_components=...).
    svd_components: int
    # Destination of the reduced feature matrix (written with np.save).
    features_path: str
    # Destination of the encoded label array (written with np.save).
    labels_path: str
    # Destination of the fitted TfidfVectorizer (written with joblib.dump).
    vectorizer_path: str
    # Destination of the fitted TruncatedSVD (written with joblib.dump).
    svd_path: str
    # Destination of the fitted LabelEncoder (written with joblib.dump).
    label_encoder_path: str

In [8]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files, then make sure the
        artifacts root directory exists.

        Each loaded document is stored on the instance as ``self.config``,
        ``self.params`` and ``self.schema`` respectively.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        # Files are read in exactly this order: config, params, schema.
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Assemble the settings for the data-transformation stage.

        Paths come from the ``feature_engineering`` section of the config
        file, vectorizer/SVD hyper-parameters from the ``vectorizer`` section
        of the params file, and the label column name from ``target_column``
        in the schema file. The stage's root directory is created as a side
        effect.
        """
        stage = self.config.feature_engineering
        vectorizer_params = self.params.vectorizer
        target = self.schema.target_column

        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            cleaned_data_path=stage.cleaned_data_path,
            target_column=target.name,
            max_features=vectorizer_params.max_features,
            ngram_range=vectorizer_params.ngram_range,
            svd_components=vectorizer_params.svd_components,
            features_path=stage.features_path,
            labels_path=stage.labels_path,
            vectorizer_path=stage.vectorizer_path,
            svd_path=stage.svd_path,
            label_encoder_path=stage.label_encoder_path,
        )


In [9]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from Condition2Cure.utils.helpers import create_directories
from Condition2Cure import logger

class DataTransformation:
    """Turns cleaned review text into dense features and integer labels.

    The pipeline is TF-IDF -> TruncatedSVD for the text column and
    LabelEncoder for the target column. The fitted transformers and the
    resulting arrays are written to the paths given in the config.
    """

    # Fixed seed for TruncatedSVD, whose default solver is randomized; without
    # it the saved features differ from one run to the next.
    RANDOM_STATE = 42

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration; no work is done until transform()."""
        self.config = config

    def transform(self):
        """Fit the transformers on the cleaned data and persist all outputs.

        Reads ``config.cleaned_data_path`` as CSV, vectorizes the
        ``clean_review`` column (missing values become empty strings),
        reduces it to ``config.svd_components`` dimensions and encodes
        ``config.target_column``. Writes the vectorizer, SVD and label
        encoder with joblib and the feature/label arrays with ``np.save``.

        Returns:
            None. All results are written to disk.

        Raises:
            ValueError: if ``clean_review`` or the target column is absent
                from the cleaned data; the message names the missing columns.
        """
        cfg = self.config

        logger.info("Loading cleaned data...")
        df = pd.read_csv(cfg.cleaned_data_path)

        required_columns = ('clean_review', cfg.target_column)
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(
                f"Required columns missing in cleaned data: {missing_columns}"
            )

        # Empty reviews are read back from CSV as NaN; the vectorizer needs strings.
        df['clean_review'] = df['clean_review'].fillna("")

        logger.info("Fitting TF-IDF vectorizer...")
        vectorizer = TfidfVectorizer(
            max_features=cfg.max_features,
            # YAML yields a list; scikit-learn expects a (min_n, max_n) tuple.
            ngram_range=tuple(cfg.ngram_range)
        )
        X_tfidf = vectorizer.fit_transform(df['clean_review'])

        logger.info("Reducing dimensionality using TruncatedSVD...")
        svd = TruncatedSVD(
            n_components=cfg.svd_components,
            random_state=self.RANDOM_STATE,
        )
        X_reduced = svd.fit_transform(X_tfidf)

        logger.info("Encoding labels...")
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(df[cfg.target_column])

        logger.info("Saving vectorizer, SVD, label encoder...")
        # Every artifact may live in its own folder, so create the parent of
        # each output path (deduplicated, order kept). Bare filenames have an
        # empty dirname and need no directory.
        output_paths = (
            cfg.vectorizer_path,
            cfg.svd_path,
            cfg.label_encoder_path,
            cfg.features_path,
            cfg.labels_path,
        )
        parent_dirs = (os.path.dirname(path) for path in output_paths)
        output_dirs = list(dict.fromkeys(d for d in parent_dirs if d))
        if output_dirs:
            create_directories(output_dirs)

        joblib.dump(vectorizer, cfg.vectorizer_path)
        joblib.dump(svd, cfg.svd_path)
        joblib.dump(label_encoder, cfg.label_encoder_path)

        logger.info("Saving features and labels...")
        # Note: np.save appends ".npy" when the target name lacks that suffix.
        np.save(cfg.features_path, X_reduced)
        np.save(cfg.labels_path, y)

        logger.info("Data transformation complete.")

In [10]:
# Imported here explicitly: no earlier cell imports `sys` by name, so the
# handler below would otherwise depend on a wildcard import providing it.
import sys

# Run the stage end to end: load configuration, build the stage config,
# then fit and persist the transformers and arrays.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    feature_engineering = DataTransformation(config=data_transformation_config)
    feature_engineering.transform()
except Exception as e:
    # Re-raise as the project's exception type, keeping the original error
    # attached as the explicit cause so the full traceback stays visible.
    raise CustomException(str(e), sys) from e

[2025-06-21 16:17:10,450: INFO: helpers: yaml file: config\config.yaml loaded successfully]
[2025-06-21 16:17:10,450: INFO: helpers: yaml file: config\params.yaml loaded successfully]
[2025-06-21 16:17:10,472: INFO: helpers: yaml file: config\schema.yaml loaded successfully]
[2025-06-21 16:17:10,477: INFO: helpers: created directory at: artifacts]
[2025-06-21 16:17:10,480: INFO: helpers: created directory at: artifacts/feature_engineering]
[2025-06-21 16:17:10,482: INFO: 837293412: Loading cleaned data...]
[2025-06-21 16:17:11,294: INFO: 837293412: Fitting TF-IDF vectorizer...]
[2025-06-21 16:17:19,906: INFO: 837293412: Reducing dimensionality using TruncatedSVD...]
[2025-06-21 16:17:56,042: INFO: 837293412: Encoding labels...]
[2025-06-21 16:17:56,089: INFO: 837293412: Saving vectorizer, SVD, label encoder...]
[2025-06-21 16:17:56,089: INFO: helpers: created directory at: artifacts/feature_engineering]
[2025-06-21 16:17:56,775: INFO: 837293412: Saving features and labels...]
[2025-06-