In [1]:
import os

In [2]:
%pwd

'/workspaces/Mental-Health-Sentiment-Analysis/research'

In [3]:
# Move from the notebook folder up to the project root so that the `src`
# package imported below and the relative config paths resolve. The guard makes
# the cell idempotent: re-running it no longer climbs one more directory level.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
%pwd

'/workspaces/Mental-Health-Sentiment-Analysis'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    and handed to ``ModelTrainer``. ``frozen=True`` makes attribute assignment
    after construction raise ``dataclasses.FrozenInstanceError``.
    """

    # Directory the stage writes into; the saved model lands in root_dir/model_name.
    root_dir: Path
    # CSV files read with pandas for fitting and for evaluation respectively.
    train_data_path: Path
    test_data_path: Path
    # File name of the serialized model inside root_dir.
    model_name: str
    # Hyperparameters read from the `xgboost_params` section of the params file.
    learning_rate: float
    max_depth: int
    n_estimators: int
    # Name of the label column that is split off from the feature columns.
    target_column: str

In [6]:
from src.MentalHealthAnalysis.constants import *
from src.MentalHealthAnalysis.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    Three files are read on construction (main config, hyperparameters and
    schema) and the top-level artifacts directory is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the hyperparameter YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        # NOTE(review): attribute access implies read_yaml returns an
        # attribute-accessible mapping (e.g. a ConfigBox) - confirm in utils.common.
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the settings for the model-training stage.

        Reads the ``model_trainer`` section of the config, the
        ``xgboost_params`` section of the params file and the ``TARGET_COLUMN``
        section of the schema, creates the stage's root directory, and packs
        everything into a ``ModelTrainerConfig``.

        Returns:
            ModelTrainerConfig: The populated, immutable stage configuration.

        Raises:
            RuntimeError: If anything fails while reading the sections or
                creating the directory. The original exception is attached as
                ``__cause__`` so the root failure stays visible in tracebacks.
        """
        try:
            config = self.config.get('model_trainer', {})
            params = self.params.get('xgboost_params', {})
            schema = self.schema.get('TARGET_COLUMN', {})

            create_directories([config.get('root_dir', '')])

            model_trainer_config = ModelTrainerConfig(
                root_dir=config.get('root_dir', ''),
                train_data_path=config.get('train_data_path', ''),
                test_data_path=config.get('test_data_path', ''),
                model_name=config.get('model_name', ''),
                # Fallback hyperparameters apply only when the params file omits the key.
                learning_rate=params.get('learning_rate', 0.5),
                max_depth=params.get('max_depth', 7),
                n_estimators=params.get('n_estimators', 500),
                target_column=schema.get('name', '')
            )
            return model_trainer_config
        except Exception as e:
            # Chain explicitly so the wrapped error is reported as the direct cause.
            raise RuntimeError(f"Error in model trainer config: {e}") from e

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from xgboost import XGBClassifier
import pandas as pd
import os
import joblib

In [9]:
class ModelTrainer:
    """Trains, evaluates and saves a TF-IDF + XGBoost classification pipeline.

    Args:
        config: Data paths, output location, target column name and XGBoost
            hyperparameters for the run.
    """

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        """Fit the pipeline on the training CSV, print test accuracy, save the model.

        The ``statement`` column is vectorized with TF-IDF, every ``float64`` /
        ``int64`` feature column is standardized, and all remaining columns are
        dropped before the XGBoost classifier. The classifier's
        ``learning_rate``, ``max_depth`` and ``n_estimators`` come from the
        config object instead of being hard-coded. The fitted pipeline is
        written with joblib to ``root_dir/model_name``.

        Returns:
            None. Accuracy and the save location are printed.
        """
        cfg = self.config
        target = cfg.target_column

        # Load train and test data
        train_data = pd.read_csv(cfg.train_data_path)
        test_data = pd.read_csv(cfg.test_data_path)

        # Separate features and target
        train_x = train_data.drop([target], axis=1)
        test_x = test_data.drop([target], axis=1)
        train_y = train_data[target]
        test_y = test_data[target]

        # Missing statements become empty strings so the vectorizer always gets str input
        train_x['statement'] = train_x['statement'].fillna('').astype(str)
        test_x['statement'] = test_x['statement'].fillna('').astype(str)

        # Numeric feature columns are discovered from the training frame
        numeric_columns = train_x.select_dtypes(include=['float64', 'int64']).columns.tolist()

        preprocessor = ColumnTransformer(
            transformers=[
                # Column is named by a plain string (not a list) so the
                # vectorizer receives a 1-D sequence of documents.
                ('text', TfidfVectorizer(), 'statement'),
                ('num', StandardScaler(), numeric_columns)
            ],
            remainder='drop'  # Columns not listed above are discarded
        )

        # Preprocessing and classifier live in one pipeline so the saved
        # artifact can be applied directly to raw feature frames.
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', XGBClassifier(
                learning_rate=cfg.learning_rate,
                max_depth=cfg.max_depth,
                n_estimators=cfg.n_estimators,
                random_state=101
            ))
        ])

        # Train the model using the pipeline
        model_pipeline.fit(train_x, train_y)

        # Predict the labels on the test data
        y_pred = model_pipeline.predict(test_x)

        # Evaluate model performance using accuracy
        accuracy = accuracy_score(test_y, y_pred)
        print(f"Model Accuracy: {accuracy:.4f}")

        # Ensure the root directory exists
        os.makedirs(cfg.root_dir, exist_ok=True)

        # Save the trained model
        model_path = os.path.join(cfg.root_dir, cfg.model_name)
        joblib.dump(model_pipeline, model_path)

        print(f"Model saved at {model_path}")

    # Optional helper; not called by train()
    def _resample(self, X, y):
        """Balance classes by upsampling every non-majority class with replacement.

        Each class other than the most frequent one is resampled (seed 101) up
        to the size of the most frequent class, so no class is dropped when
        there are more than two. For two classes with distinct counts the
        result matches upsampling the single minority class.

        Args:
            X: Feature table; anything ``pandas.DataFrame`` accepts.
            y: Labels. A Series is aligned to ``X`` by index; any other
                sequence is matched to the rows of ``X`` by position.

        Returns:
            tuple: ``(X_upsampled, y_upsampled)`` - a DataFrame of features and
            a Series named ``'target'``. Input with fewer than two classes is
            returned without resampling.
        """
        features = pd.DataFrame(X)
        if isinstance(y, pd.Series):
            labels = y.rename('target')
        else:
            # Attach the feature index so positional labels line up row by row
            labels = pd.Series(list(y), index=features.index, name='target')

        df = pd.concat([features, labels], axis=1)

        # value_counts() is sorted by frequency, most common class first
        class_counts = df['target'].value_counts()
        if len(class_counts) < 2:
            # Nothing to balance against
            return df.drop(['target'], axis=1), df['target']

        majority_class = class_counts.index[0]
        majority_size = int(class_counts.iloc[0])

        balanced_parts = [df[df['target'] == majority_class]]
        for label in class_counts.index[1:]:
            balanced_parts.append(
                resample(df[df['target'] == label],
                         replace=True,
                         n_samples=majority_size,
                         random_state=101)
            )

        # Combine majority class with the upsampled remaining classes
        df_upsampled = pd.concat(balanced_parts)

        # Separate features and target
        X_upsampled = df_upsampled.drop(['target'], axis=1)
        y_upsampled = df_upsampled['target']

        return X_upsampled, y_upsampled


In [10]:
# Run the training stage end to end. Errors propagate unchanged; the previous
# try/except only re-raised the same exception and added a traceback frame.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()  # Get the model trainer configuration
model_trainer = ModelTrainer(config=model_trainer_config)  # Instantiate ModelTrainer
model_trainer.train()  # Call the train method

[2024-09-22 14:41:37,117: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-22 14:41:37,119: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-22 14:41:37,121: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-22 14:41:37,121: INFO: common: created directory at: artifacts]
[2024-09-22 14:41:37,122: INFO: common: created directory at: artifacts/model_trainer]


Model Accuracy: 0.8730
Model saved at artifacts/model_trainer/model.joblib
