In [1]:
import os

In [None]:
# Show the current working directory before it is changed in the next cell.
%pwd

In [3]:
# Move the working directory up one level.
# NOTE(review): the target is relative to the current directory, so every
# re-run of this cell climbs one more level -- run it once per kernel session.
# Presumably this lands on the project root; confirm with the %pwd output.
os.chdir("../")

In [None]:
# Confirm the working directory after the os.chdir call above.
%pwd

In [5]:
from dataclasses import dataclass
from pathlib import Path

# config.yaml
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """
    # Directory that ConfigurationManager.get_data_transformation_config creates
    # before building this object; presumably where the stage's outputs belong.
    root_dir: Path
    # Location of the CSV file that DataTransformation.read_data loads.
    data_path: Path

In [6]:
from MentalHealthAnalysis import logger
from src.MentalHealthAnalysis.constants import *
from src.MentalHealthAnalysis.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the YAML configuration files and builds per-stage config objects.

    On construction the config, params and schema files are parsed with
    ``read_yaml`` and the directory named by ``config.artifacts_root`` is
    created through ``create_directories``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Parse the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        Reads the ``data_transformation`` section, creates its ``root_dir``
        and returns the paths wrapped in a ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig whose ``root_dir`` and ``data_path`` are
            ``pathlib.Path`` objects, matching the dataclass annotations.

        Raises:
            RuntimeError: If the section or one of its keys is missing/empty,
                or if the directory cannot be created. The original exception
                is chained as ``__cause__``.
        """
        try:
            section = self.config.get('data_transformation') or {}

            # An empty path is never usable downstream (directory creation and
            # CSV loading would fail with an unrelated error), so report the
            # misconfiguration here instead of defaulting to ''.
            missing = [key for key in ('root_dir', 'data_path') if not section.get(key)]
            if missing:
                raise ValueError(
                    "missing key(s) in 'data_transformation' section: "
                    + ", ".join(missing)
                )

            # Pass the raw configured value, exactly as it was passed before.
            create_directories([section['root_dir']])

            return DataTransformationConfig(
                root_dir=Path(section['root_dir']),
                data_path=Path(section['data_path']),
            )

        except Exception as e:
            # Keep the RuntimeError contract but preserve the root cause.
            raise RuntimeError(f"Error in data transformation config: {e}") from e


In [None]:
import os
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from scipy.sparse import hstack
from imblearn.over_sampling import RandomOverSampler
import nltk

# Ensure the NLTK tokenizer data is available. Recent NLTK releases (3.9+)
# load the 'punkt_tab' resource for word_tokenize/sent_tokenize, while older
# releases use the pickled 'punkt' models, so fetch both.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [9]:
class DataTransformation:
    """Cleans the raw statements, derives features and writes train/test splits.

    The methods build on each other's columns and are meant to be called in
    this order: ``read_data`` -> ``preprocessing`` -> ``clean_text`` ->
    ``tokenize_and_stem`` -> ``vectorize_text`` / ``train_test_split_save``.
    """

    def __init__(self, config):
        """Store the stage configuration.

        Args:
            config: Object exposing ``data_path`` (CSV to load) and
                ``root_dir`` (directory the split CSVs are written to).
        """
        self.config = config

    def read_data(self):
        """Reads the dataset at ``config.data_path`` into ``self.data``."""
        self.data = pd.read_csv(self.config.data_path)

    def preprocessing(self):
        """Drops incomplete rows, encodes the target and adds length features.

        Adds ``num_of_characters`` and ``num_of_sentences`` (computed on the
        original-case text) and lowercases ``statement``. The fitted encoder is
        kept on ``self.label_encoder`` so encoded labels can be decoded later.
        """
        self.data.dropna(inplace=True)

        # Label encoding target variable 'y'
        lbl_enc = LabelEncoder()
        self.data['status'] = lbl_enc.fit_transform(self.data['status'].values)
        self.label_encoder = lbl_enc

        # Create additional features based on text length and number of sentences
        self.data['num_of_characters'] = self.data['statement'].str.len()
        self.data['num_of_sentences'] = self.data['statement'].apply(lambda x: len(sent_tokenize(x)))

        # Lowercasing the text
        self.data['statement'] = self.data['statement'].str.lower()

    def remove_patterns(self, text):
        """Removes markdown links, URLs, @handles and punctuation from ``text``.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string with leading/trailing whitespace stripped.
            Interior whitespace left behind by removed spans is not collapsed.
        """
        # Markdown links must go first: stripping bare URLs first would also
        # eat the closing ')' and leave '[label](' behind, so the link pattern
        # would never match and the label would leak into the text.
        text = re.sub(r'\[.*?\]\(.*?\)', '', text)
        text = re.sub(r'http[s]?://\S+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text.strip()

    def clean_text(self):
        """Applies the `remove_patterns` function to every statement."""
        self.data['statement'] = self.data['statement'].apply(self.remove_patterns)
        print("Text cleaning completed.")
        print(self.data.head())

    def tokenize_and_stem(self):
        """Tokenizes the statements and stores a stemmed, space-joined version.

        Adds the columns ``tokens`` (list of word tokens) and
        ``tokens_stemmed`` (Porter-stemmed tokens joined by single spaces).
        """
        self.data['tokens'] = self.data['statement'].apply(word_tokenize)
        print("Tokenization completed.")
        print(self.data.head())

        # Initialize the stemmer
        stemmer = PorterStemmer()

        # Function to stem tokens
        def stem_tokens(tokens):
            return ' '.join(stemmer.stem(str(token)) for token in tokens)

        self.data['tokens_stemmed'] = self.data['tokens'].apply(stem_tokens)
        print("Stemming completed.")
        print(self.data.head())

    def vectorize_text(self):
        """Vectorizes the text using TF-IDF and combines it with numerical features.

        The data is split 80/20, the TF-IDF vocabulary is fitted on the
        training part only, the two numeric length features are appended, and
        the training set alone is randomly over-sampled.

        Returns:
            Tuple ``(X_train_resampled, X_test_combined, y_train_resampled, y_test)``.
        """
        # Split data into train and test sets
        X = self.data.drop('status', axis=1)
        y = self.data['status']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

        # Initialize TF-IDF Vectorizer
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)
        print("TF-IDF Vectorizer initialized.")
        print(X.head())

        # Fit and transform TF-IDF on training data
        X_train_tfidf = vectorizer.fit_transform(X_train['tokens_stemmed'])
        X_test_tfidf = vectorizer.transform(X_test['tokens_stemmed'])

        # Extract numerical features
        X_train_num = X_train[['num_of_characters', 'num_of_sentences']].values
        X_test_num = X_test[['num_of_characters', 'num_of_sentences']].values

        # Combine TF-IDF and numerical features
        X_train_combined = hstack([X_train_tfidf, X_train_num])
        X_test_combined = hstack([X_test_tfidf, X_test_num])

        print(f'Number of feature words: {len(vectorizer.get_feature_names_out())}')

        # Apply Random Over-Sampling on the vectorized data
        ros = RandomOverSampler(random_state=101)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train_combined, y_train)

        print(f"Resampled training data shape: {X_train_resampled.shape}")
        return X_train_resampled, X_test_combined, y_train_resampled, y_test

    def train_test_split_save(self):
        """Splits the data 80/20 and writes train.csv / test.csv to ``config.root_dir``."""
        print("Splitting data into training and test sets...")
        train, test = train_test_split(self.data, test_size=0.2, random_state=101)

        # The destination is the configured output directory, not the
        # DataFrame itself (os.path.join on a DataFrame raises TypeError).
        output_dir = self.config.root_dir
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        train.to_csv(os.path.join(output_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(output_dir, "test.csv"), index=False)
        print("Training Saved.")
        print("Testing Saved.") 

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(f"Training data shape: {train.shape}")
        print(f"Testing data shape: {test.shape}")



In [None]:
# Run the data-transformation stage end to end. Exceptions are left to
# propagate as-is so the notebook shows the original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

# Load the CSV, encode the labels and derive the length features
data_transformation.read_data()
data_transformation.preprocessing()

# Strip links, handles and punctuation from the statements
data_transformation.clean_text()

# Tokenize and stem the text
data_transformation.tokenize_and_stem()

# Perform vectorization and resampling
X_train_resampled, X_test_combined, y_train_resampled, y_test = data_transformation.vectorize_text()

# Split and save the train/test data
data_transformation.train_test_split_save()
