<a href="https://colab.research.google.com/github/Hari-Priya-18/B6_PFDS_1372/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# --- Step 1: Data Acquisition and Merging ---
# This framework assumes multiple survey-based datasets are available in CSV format.
# A key challenge is the heterogeneity of data sources.[1, 2]
# This step involves loading and carefully merging them based on common features.

def load_and_merge_datasets(file_paths):
    """Loads multiple datasets and merges them into a single DataFrame."""
    all_data = [] # Corrected indentation and initialization
    for path in file_paths:
        df = pd.read_csv(path)
        # Standardize column names (e.g., 'academic_pressure', 'sleep_quality')
        # to ensure consistency across datasets.
        # This is a crucial step to address data fragmentation.[2]
        all_data.append(df)

    # Concatenate all dataframes.
    merged_df = pd.concat(all_data, ignore_index=True)
    return merged_df

# --- Step 2: Data Preprocessing and Feature Engineering ---
# The abstract mentions survey-based academic, lifestyle, behavioral, and health features.
# Research confirms the importance of a holistic view of student life.[3]
# This step involves cleaning the data and preparing it for the models.

def preprocess_data(df):
    """Handles data preprocessing including cleaning, imputation, and feature creation."""
    # Define features based on research insights.[3, 4]
    # Updated column names based on the available columns in the merged DataFrame
    academic_features = ['Academic Pressure', 'Work Pressure', 'CGPA']
    lifestyle_features = ['Sleep Duration', 'Dietary Habits', 'Social Media Usage (Hours per day)'] # Using Social Media Usage as a proxy for social time
    health_features = ['anxiety_level', 'self_esteem', 'mental_health_history']

    # Target variables for joint prediction.
    # Updated column names based on the available columns in the merged DataFrame
    target_variables = ['stress_level', 'Depression']

    # Separate features and targets.
    # Select only the columns that exist in the DataFrame
    all_features = academic_features + lifestyle_features + health_features
    existing_features = [col for col in all_features if col in df.columns]

    if not existing_features:
        print("Error: No specified feature columns were found in the DataFrame.")
        print("Please check the column names in your datasets and update the feature lists in preprocess_data.")
        return None, None, None, None, None # Return None values to indicate failure

    X = df[existing_features]

    existing_targets = [col for col in target_variables if col in df.columns]
    y = df[existing_targets]

    if not existing_targets:
        print("Error: No specified target columns were found in the DataFrame.")
        print("Please check the column names in your datasets and update the target lists in preprocess_data.")
        return None, None, None, None, None # Return None values to indicate failure


    # Split data into training and testing sets (e.g., 80:20 ratio).[5]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Identify numerical and categorical features.
    numerical_features = X.select_dtypes(include=np.number).columns
    categorical_features = X.select_dtypes(include='object').columns

    # Create preprocessing pipelines for numerical and categorical data.
    # Imputation handles missing values.[5]
    numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), # Corrected initialization and indentation
                                            ('scaler', StandardScaler())]) # Corrected indentation

    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), # Corrected initialization and indentation
                                              ('onehot', OneHotEncoder(handle_unknown='ignore'))]) # Corrected indentation

    # Combine transformers using ColumnTransformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    return preprocessor, X_train, X_test, y_train, y_test

# --- Corrected Step 3: Handling Class Imbalance (e.g., with SMOTE) ---
# Mental health datasets are often imbalanced. SMOTE is a common technique to mitigate this issue.[1, 6]

def apply_smote(X_train, y_train):
    """Applies SMOTE to the training data to address class imbalance."""
    smote = SMOTE(random_state=42)
    # The target variable must be a 2D array. We reshape it here.
    # NOTE: Assuming 'Depression' is a target variable and is suitable for SMOTE.
    # You may need to adjust this based on which target variable you want to resample.
    if 'Depression' not in y_train.columns:
         print("Error: 'Depression' column not found in the target variables for SMOTE.")
         return X_train, y_train # Return original data if target not found

    X_resampled, y_resampled = smote.fit_resample(X_train, y_train['Depression'])
    return X_resampled, y_resampled

# --- Corrected Step 4: Model Training and Evaluation ---
# The abstract lists five models: Logistic Regression, Random Forest, SVM, XGBoost, and an ANN.
# We will evaluate each using the specified metrics.[7]

def train_and_evaluate_models(preprocessor, X_train, X_test, y_train, y_test):
    """Trains and evaluates the proposed machine learning models."""
    # Check if preprocessing was successful and data is available
    if X_train is None or y_train is None or X_test is None or y_test is None or preprocessor is None:
        print("Model training skipped due to missing data or preprocessor.")
        return {}

    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000), # Good baseline model.[8]
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42), # Robust and high-performing.[9, 6]
        'SVM': SVC(probability=True, random_state=42), # Effective in high-dimensional spaces.[10]
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), # Superior performance is consistently reported.[6]
        'ANN': Sequential([ # Corrected indentation
            Dense(64, activation='relu'), # Corrected indentation
            Dense(32, activation='relu'), # Corrected indentation
            Dense(1, activation='sigmoid') # Sigmoid for binary classification (stress/depression). # Corrected indentation
        ])
    }

    results = {}

    # NOTE: The original code assumed a single target ('depression') for training and evaluation.
    # Since the target variables are now 'stress_level' and 'Depression', this needs to be handled.
    # For simplicity and to avoid a multi-label classification setup which requires significant changes,
    # I will train and evaluate models for 'Depression' as the target variable, similar to the original code's structure.
    # If you need to predict both 'stress_level' and 'Depression' jointly, a multi-label approach is required.

    target_column_for_training = 'Depression' # Change to 'stress_level' if needed

    if target_column_for_training not in y_train.columns or target_column_for_training not in y_test.columns:
        print(f"Error: Target column '{target_column_for_training}' not found in the target variables in either train or test sets.")
        return {}

    # Drop rows with NaN in the target variable from training data
    train_data = pd.concat([X_train, y_train], axis=1)
    train_data.dropna(subset=[target_column_for_training], inplace=True)
    X_train_cleaned = train_data[X_train.columns]
    y_train_cleaned = train_data[target_column_for_training]


    for name, model in models.items():
        print(f"Training {name}...")

        # Create a pipeline that first preprocesses, then trains the model.
        # This ensures consistency in the data transformation process.
        if name == 'ANN':
            # ANN requires a different training approach.
            X_train_processed = preprocessor.fit_transform(X_train_cleaned)
            X_test_processed = preprocessor.transform(X_test)

            # Dynamically set the input shape for the ANN based on the preprocessed data
            models[name] = Sequential([
                Dense(64, activation='relu', input_shape=(X_train_processed.shape[1],)),
                Dense(32, activation='relu'),
                Dense(1, activation='sigmoid')
            ])
            model = models[name] # Update the model variable to the new Sequential model

            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            # Train on the selected target column
            model.fit(X_train_processed, y_train_cleaned, epochs=10, batch_size=32, verbose=0)
            y_pred_proba = model.predict(X_test_processed).flatten()
            y_pred = (y_pred_proba > 0.5).astype(int)
        else:
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('classifier', model)])

            # Train on the selected target column
            pipeline.fit(X_train_cleaned, y_train_cleaned)
            y_pred = pipeline.predict(X_test)
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline.named_steps['classifier'], 'predict_proba') else None

        # --- Model Evaluation ---
        # Evaluate using the metrics from the abstract: accuracy, precision, recall, F1-score, and ROC-AUC.[7]
        # These metrics provide a "balanced view of model reliability".[7]

        # Align y_test and y_pred by dropping rows with NaN in y_test
        y_test_cleaned = y_test[target_column_for_training].dropna()
        y_pred_cleaned = pd.Series(y_pred, index=y_test.index).loc[y_test_cleaned.index]
        y_pred_proba_cleaned = pd.Series(y_pred_proba, index=y_test.index).loc[y_test_cleaned.index] if y_pred_proba is not None else None


        # Evaluate against the selected target column
        accuracy = accuracy_score(y_test_cleaned, y_pred_cleaned)
        precision = precision_score(y_test_cleaned, y_pred_cleaned, zero_division=0)
        recall = recall_score(y_test_cleaned, y_pred_cleaned, zero_division=0)
        f1 = f1_score(y_test_cleaned, y_pred_cleaned, zero_division=0)
        roc_auc = roc_auc_score(y_test_cleaned, y_pred_proba_cleaned) if y_pred_proba_cleaned is not None else 'N/A'


        results[name] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'ROC-AUC': roc_auc
        }

    return results

# --- Main Execution Block ---
if __name__ == '__main__':
    # You must update this list with your actual file paths and ensure the columns match the code.
    file_paths = ['/content/student_depression_dataset.csv', '/content/StressLevelDataset.csv', '/content/Student_Mental_Stress_and_Coping_Mechanisms.csv', '/content/Mental Health Dataset.csv']

    # Load and preprocess the data.
    df = load_and_merge_datasets(file_paths)
    preprocessor, X_train, X_test, y_train, y_test = preprocess_data(df)

    # Train and evaluate the models.
    results = train_and_evaluate_models(preprocessor, X_train, X_test, y_train, y_test)

    # Print the performance results.
    print("Model Performance Results:")
    for model, metrics in results.items():
        print(f"\n--- {model} ---")
        for metric, value in metrics.items():
            # Check if the value is a number before formatting
            if isinstance(value, (int, float)):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}: value")

Training Logistic Regression...


 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.


Training Random Forest...


 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.


Training SVM...


 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.


Training XGBoost...


 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.


Training ANN...


 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
 'mental_health_history']. At least one non-missing value is needed for imputation with strategy='mean'.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2014/2014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Model Performance Results:

--- Logistic Regression ---
Accuracy: 0.7394
Precision: 0.7572
Recall: 0.8043
F1-score: 0.7800
ROC-AUC: 0.8030

--- Random Forest ---
Accuracy: 0.6803
Precision: 0.7144
Recall: 0.7386
F1-score: 0.7263
ROC-AUC: 0.7271

--- SVM ---
Accuracy: 0.7344
Precision: 0.7209
Recall: 0.8774
F1-score: 0.7915
ROC-AUC: 0.7865

--- XGBoost ---
Accuracy: 0.7312
Precision: 0.7510
Recall: 0.7959
F1-score: 0.7728
ROC-AUC: 0.7890

--- ANN ---
Accuracy: 0.7392
Precision: 0.7448
Recall: 0.8307
F1-score: 0.7854
ROC-AUC: 0.8027
