In [1]:
import os

In [2]:
pwd("../")

'f:\\Files\\DSML\\Condition2Cure\\notebook'

In [3]:
os.chdir("../")

In [4]:
pwd("../")

'f:\\Files\\DSML\\Condition2Cure'

In [5]:
from pathlib import Path
from dataclasses import dataclass

In [6]:
from Condition2Cure.constants import *
from Condition2Cure.utils.helpers import *
from Condition2Cure.utils.execptions import CustomException

In [7]:
@dataclass(frozen=True)
class DataCleaningConfig:
    root_dir: Path
    data_path: Path
    cleaned_data_path: Path

In [8]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_cleaning_config(self) -> DataCleaningConfig:
        config = self.config.data_cleaning
        create_directories([config.root_dir])
        
        data_cleaning_config =  DataCleaningConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            cleaned_data_path=config.cleaned_data_path
        )

        return data_cleaning_config


In [9]:
import os
import pandas as pd
from Condition2Cure.utils.helpers import create_directories
from Condition2Cure.entities.config_entity import DataCleaningConfig
from Condition2Cure.utils.nlp_utils import clean_text
from Condition2Cure import logger


class DataCleaning:
    def __init__(self, config: DataCleaningConfig):
        self.config = config

    def clean(self):
        logger.info("Reading raw data...")
        data = pd.read_csv(self.config.data_path)

        df = data[(data['condition'] == 'Birth Control') | (data['condition'] == 'Depression') | (data['condition'] == 'Pain') | (data['condition'] == 'Anxiety') | (data['condition'] == 'Acne') | (data['condition'] == 'Diabetes, Type 2') | (data['condition'] == 'High Blood Pressure')]

        if 'review' not in df.columns or 'condition' not in df.columns:
            raise ValueError("Input data must contain 'review' and 'condition' columns.")

        logger.info("Cleaning review texts...")
        df['clean_review'] = df['review'].astype(str).apply(clean_text)

        df.dropna(subset=['clean_review', 'condition'], inplace=True)

        create_directories([os.path.dirname(self.config.cleaned_data_path)])
        df.to_csv(self.config.cleaned_data_path, index=False, na_rep="")

        logger.info(f"Cleaned data saved at: {self.config.cleaned_data_path}")

[nltk_data] Downloading package stopwords to C:\Users\Javith
[nltk_data]     Naseem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Javith
[nltk_data]     Naseem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
try:
    config = ConfigurationManager()
    data_cleaning_config = config.get_data_cleaning_config()
    data_cleaning = DataCleaning(config=data_cleaning_config)
    data_cleaning = data_cleaning.clean()

except Exception as e:
    raise CustomException(str(e))

[2025-06-21 16:02:33,279: INFO: helpers: yaml file: config\config.yaml loaded successfully]
[2025-06-21 16:02:33,282: INFO: helpers: yaml file: config\params.yaml loaded successfully]
[2025-06-21 16:02:33,287: INFO: helpers: yaml file: config\schema.yaml loaded successfully]
[2025-06-21 16:02:33,289: INFO: helpers: created directory at: artifacts]
[2025-06-21 16:02:33,290: INFO: helpers: created directory at: artifacts/data_cleaning]
[2025-06-21 16:02:33,292: INFO: 354709159: Reading raw data...]


[2025-06-21 16:02:34,623: INFO: 354709159: Cleaning review texts...]
[2025-06-21 16:02:55,923: INFO: helpers: created directory at: artifacts/data_cleaning]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_review'] = df['review'].astype(str).apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['clean_review', 'condition'], inplace=True)


[2025-06-21 16:02:57,421: INFO: 354709159: Cleaned data saved at: artifacts/data_cleaning/cleaned.csv]
