In [1]:
import os

In [None]:
%pwd

In [3]:
# Move from the notebook's folder up to the project root so that the
# `src.MentalHealthAnalysis...` imports in the cells below resolve.
# A bare relative chdir is not idempotent: re-running the cell (or starting the
# kernel at the project root) would climb one level too far. Guard on the
# presence of `src/`.
# NOTE(review): assumes `src/` exists only at the project root - confirm.
if not os.path.isdir("src"):
    os.chdir("../")

In [None]:
%pwd

In [5]:
from dataclasses import dataclass
from pathlib import Path

# Entity built from the `data_transformation` section of config.yaml
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are produced by
    ``ConfigurationManager.get_data_transformation_config`` and consumed by
    ``DataTransformation``.
    """
    # Directory owned by this stage; the configuration manager creates it.
    root_dir: Path
    # Location of the CSV that DataTransformation.read_data loads.
    data_path: Path

In [6]:
from MentalHealthAnalysis import logger
from src.MentalHealthAnalysis.constants import *
from src.MentalHealthAnalysis.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    The config, params and schema files are read once at construction time,
    and the directory named by ``config.artifacts_root`` is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads ``root_dir`` and ``data_path`` from the ``data_transformation``
        section of the loaded configuration, creates ``root_dir`` and returns
        both values as ``Path`` objects.

        Returns:
            DataTransformationConfig: the stage settings.

        Raises:
            RuntimeError: if the section or one of its keys is missing/empty,
                or if the directory cannot be created. The original exception
                is chained as ``__cause__``.
        """
        try:
            # `or {}` also covers a section that is present but empty (None).
            section = self.config.get('data_transformation', {}) or {}

            # Fail loudly instead of silently falling back to '' - an empty
            # path would otherwise reach create_directories() and the config.
            missing = [key for key in ('root_dir', 'data_path') if not section.get(key)]
            if missing:
                raise KeyError(
                    f"missing key(s) {missing} in the 'data_transformation' section"
                )

            create_directories([section['root_dir']])

            # The dataclass declares Path fields, so convert the YAML strings.
            return DataTransformationConfig(
                root_dir=Path(section['root_dir']),
                data_path=Path(section['data_path']),
            )

        except Exception as e:
            raise RuntimeError(f"Error in data transformation config: {e}") from e


In [None]:
import os
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from scipy.sparse import hstack
from imblearn.over_sampling import RandomOverSampler
import nltk

# Ensure the NLTK tokenizer data is downloaded.
# Older NLTK releases load the 'punkt' resource for word/sentence tokenization,
# while recent releases load 'punkt_tab' instead, so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

In [9]:
class DataTransformation:
    """Text-cleaning and feature-building pipeline for the statements dataset.

    The methods are meant to be called in this order:
    ``read_data`` -> ``preprocessing`` -> ``clean_text`` ->
    ``tokenize_and_stem`` -> ``vectorize_text`` -> ``train_test_split_save``.

    The input CSV must provide a ``statement`` (text) column and a ``status``
    (label) column.
    """

    def __init__(self, config):
        """Store the stage configuration.

        Args:
            config: Object exposing ``data_path`` and ``root_dir`` attributes
                (and optionally ``processed_data_dir``).
        """
        self.config = config

    def read_data(self):
        """Reads the dataset from the specified path."""
        self.data = pd.read_csv(self.config.data_path)

    def preprocessing(self):
        """Performs preprocessing steps such as handling missing values and text cleaning."""
        self.data.dropna(inplace=True)

        # Reset index after dropping rows
        self.data.reset_index(drop=True, inplace=True)

        # Create additional features based on text length and number of sentences.
        # These are computed before lowercasing/cleaning so they describe the raw text.
        self.data['num_of_characters'] = self.data['statement'].str.len()
        self.data['num_of_sentences'] = self.data['statement'].apply(lambda x: len(sent_tokenize(x)))

        # Lowercasing the text
        self.data['statement'] = self.data['statement'].str.lower()

    def remove_patterns(self, text):
        """Removes unwanted patterns such as URLs, markdown links, handles, and punctuation.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string with leading/trailing whitespace stripped.
        """
        # Markdown links must be removed BEFORE bare URLs: the URL pattern would
        # otherwise swallow the closing ')' of "[text](http://...)" and the
        # markdown pattern could never match.
        text = re.sub(r'\[.*?\]\(.*?\)', '', text)
        text = re.sub(r'http[s]?://\S+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text.strip()

    def clean_text(self):
        """Applies the `remove_patterns` function to clean the text."""
        self.data['statement'] = self.data['statement'].apply(self.remove_patterns)

    def tokenize_and_stem(self):
        """Tokenizes and stems the statements.

        Adds two columns holding lists of strings: ``tokens`` and
        ``tokens_stemmed``.
        """
        self.data['tokens'] = self.data['statement'].apply(word_tokenize)

        # Initialize the stemmer
        stemmer = PorterStemmer()

        # Function to stem tokens
        def stem_tokens(tokens):
            return [stemmer.stem(token) for token in tokens]  # Return list of tokens

        self.data['tokens_stemmed'] = self.data['tokens'].apply(stem_tokens)

    def vectorize_text(self):
        """Vectorizes the text using TF-IDF and combines it with numerical features.

        The ``status`` column is encoded to integer class ids, the data is split
        80/20, TF-IDF is fitted on the training split only, the two numeric
        features are appended, and the training split is randomly over-sampled.

        Returns:
            tuple: ``(X_train_resampled, X_test_combined, y_train_resampled, y_test)``
            where the X values are sparse matrices and the y values are
            integer arrays.

        Raises:
            KeyError: if a required column is missing (earlier pipeline steps
                were not run).
        """
        required_columns = ['status', 'tokens_stemmed', 'num_of_characters', 'num_of_sentences']
        missing = [col for col in required_columns if col not in self.data.columns]
        if missing:
            raise KeyError(
                f"Required column(s) {missing} not found in the dataset; "
                "run preprocessing() and tokenize_and_stem() first."
            )

        # Encode the target: one-hot encode 'status' and take the argmax to get
        # a single integer class id per row. The encoder needs 2-D input, hence
        # the double brackets; the default sparse output is densified here so no
        # version-specific `sparse`/`sparse_output` keyword is required.
        onehot_encoder = OneHotEncoder()
        y_onehot = onehot_encoder.fit_transform(self.data[['status']])
        y_single = y_onehot.toarray().argmax(axis=1)

        print(f"Columns in the dataset: {self.data.columns}")

        # Drop the target column from X
        X = self.data.drop(columns=['status'])

        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y_single, test_size=0.2, random_state=101)

        # Initialize TF-IDF Vectorizer
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)

        # TfidfVectorizer expects one string per document, so re-join the
        # stemmed token lists with spaces before vectorizing.
        train_docs = X_train['tokens_stemmed'].apply(' '.join)
        test_docs = X_test['tokens_stemmed'].apply(' '.join)

        # Fit on training data only; the test split is transformed with the
        # vocabulary learned from the training split.
        X_train_tfidf = vectorizer.fit_transform(train_docs)
        X_test_tfidf = vectorizer.transform(test_docs)

        # Extract numerical features
        X_train_num = X_train[['num_of_characters', 'num_of_sentences']].values
        X_test_num = X_test[['num_of_characters', 'num_of_sentences']].values

        # Combine TF-IDF and numerical features
        X_train_combined = hstack([X_train_tfidf, X_train_num], format='csr')
        X_test_combined = hstack([X_test_tfidf, X_test_num], format='csr')

        print(f'Number of feature words: {len(vectorizer.get_feature_names_out())}')

        # Apply Random Over-Sampling on the vectorized data
        ros = RandomOverSampler(random_state=101)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train_combined, y_train)

        print(f"Resampled training data shape: {X_train_resampled.shape}")
        return X_train_resampled, X_test_combined, y_train_resampled, y_test

    @staticmethod
    def _to_frame(features, target):
        """Builds a DataFrame with `feature_<i>` columns plus a `target` column."""
        # Accept both sparse matrices and already-dense arrays.
        dense = features.toarray() if hasattr(features, 'toarray') else features
        frame = pd.DataFrame(dense, columns=[f'feature_{i}' for i in range(features.shape[1])])
        # Targets may be numpy arrays, lists or Series; take positional values
        # so a Series index can never misalign with the feature rows.
        frame['target'] = pd.Series(target).to_numpy()
        return frame

    def train_test_split_save(self, X_train_resampled, X_test_combined, y_train_resampled, y_test):
        """Saves the processed train/test sets to the specified directory.

        Writes ``train.csv`` and ``test.csv`` into ``config.processed_data_dir``
        when the config defines it, otherwise into ``config.root_dir``.

        Args:
            X_train_resampled: Training feature matrix (sparse or dense).
            X_test_combined: Test feature matrix (sparse or dense).
            y_train_resampled: Training labels (array-like or Series).
            y_test: Test labels (array-like or Series).
        """
        print("Saving transformed training and test sets...")

        # NOTE: the matrices are densified before writing; with a large
        # vocabulary this can need a lot of memory.
        train = self._to_frame(X_train_resampled, y_train_resampled)
        test = self._to_frame(X_test_combined, y_test)

        # Fall back to root_dir when the config has no processed_data_dir field.
        output_dir = getattr(self.config, 'processed_data_dir', None) or self.config.root_dir

        # Ensure directory exists
        os.makedirs(output_dir, exist_ok=True)

        train_path = os.path.join(output_dir, "train.csv")
        test_path = os.path.join(output_dir, "test.csv")

        # Save to CSV
        train.to_csv(train_path, index=False)
        test.to_csv(test_path, index=False)

        print(f"Training data saved to {train_path}")
        print(f"Testing data saved to {test_path}")


In [None]:
# Run the data-transformation stage end to end.
# The previous `try: ... except Exception as e: raise e` wrapper only re-raised
# the same exception, so it is dropped; errors propagate unchanged.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

# Load the raw CSV and derive the length/sentence-count features
data_transformation.read_data()
data_transformation.preprocessing()

# Strip URLs, markdown links, handles and punctuation from the statements
data_transformation.clean_text()

# Tokenize and stem the text
data_transformation.tokenize_and_stem()

# Perform vectorization and resampling
X_train_resampled, X_test_combined, y_train_resampled, y_test = data_transformation.vectorize_text()

# Save the train/test data
data_transformation.train_test_split_save(X_train_resampled, X_test_combined, y_train_resampled, y_test)
