<a href="https://colab.research.google.com/github/Hari-Priya-18/B6_PFDS_1372/blob/main/Project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# --- Step 1: Data Acquisition and Merging ---
# This framework assumes multiple survey-based datasets are available in CSV format.
# A key challenge is the heterogeneity of data sources.[3, 4]
# This step involves loading and carefully merging them based on common features.

def load_and_merge_datasets(file_paths):
    """Load several CSV files and stack them row-wise into one DataFrame.

    Args:
        file_paths: A single path (``str``) or an iterable of paths that
            ``pd.read_csv`` can open.

    Returns:
        A DataFrame holding the rows of every file, in the order given, with
        a fresh ``0..n-1`` index. The columns are the union of the columns of
        all files; a column missing from a file is NaN for that file's rows.
        An empty DataFrame is returned when ``file_paths`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # NOTE: column names are NOT harmonised here. Differently spelled columns
    # (e.g. 'Depression' vs 'depression') stay separate after the merge, so
    # rename them before calling this function's consumers if they should be
    # treated as the same variable.
    frames = [pd.read_csv(path) for path in file_paths]

    # pd.concat raises on an empty list; report "nothing loaded" as an empty
    # frame so downstream column checks can fail gracefully instead.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Example usage:
# file_paths = ['dataset_1.csv', 'dataset_2.csv', 'dataset_3.csv']
# df = load_and_merge_datasets(file_paths)

# --- Step 2: Data Preprocessing and Feature Engineering ---
# The abstract mentions survey-based academic, lifestyle, behavioral, and health features.
# Research confirms the importance of a holistic view of student life.[5]
# This step involves cleaning the data and preparing it for the models.

def preprocess_data(df, *, feature_columns=None, target_columns=None,
                    test_size=0.2, random_state=42):
    """Select features/targets, split the data and build an unfitted preprocessor.

    Args:
        df: Merged DataFrame containing feature and target columns.
        feature_columns: Optional list of feature column names. When omitted,
            the built-in academic/lifestyle/health placeholder lists are used.
        target_columns: Optional list of target column names. Defaults to
            ``['stress_level', 'Depression']``.
        test_size: Fraction of rows held out for testing (passed to
            ``train_test_split``).
        random_state: Seed for the train/test split.

    Returns:
        ``(preprocessor, X_train, X_test, y_train, y_test)``. The preprocessor
        is an unfitted ``ColumnTransformer``; nothing is fitted here. If none
        of the requested feature columns, or none of the requested target
        columns, exist in ``df``, a message is printed and a tuple of five
        ``None`` values is returned.
    """
    if feature_columns is None:
        # Placeholder names: replace them (or pass ``feature_columns``) so
        # they match the actual columns of the merged datasets.
        academic_features = ['academic_pressure_placeholder', 'workload_placeholder', 'cgpa_placeholder']
        lifestyle_features = ['sleep_patterns_placeholder', 'eating_habits_placeholder', 'social_time_placeholder']
        health_features = ['Anxiety_Level', 'self_esteem_placeholder', 'mental_health_history_placeholder']
        feature_columns = academic_features + lifestyle_features + health_features

    if target_columns is None:
        # Target variables for joint prediction.
        target_columns = ['stress_level', 'Depression']

    # Only keep the requested columns that are really present in the frame.
    existing_features = [col for col in feature_columns if col in df.columns]
    if not existing_features:
        print("Error: No specified feature columns were found in the DataFrame.")
        print("Please check the column names in your datasets and update the feature lists in preprocess_data.")
        return None, None, None, None, None  # Signal failure to the caller.

    existing_targets = [col for col in target_columns if col in df.columns]
    if not existing_targets:
        print("Error: No specified target columns were found in the DataFrame.")
        print("Please check the column names in your datasets and update the target lists in preprocess_data.")
        return None, None, None, None, None  # Signal failure to the caller.

    X = df[existing_features]
    y = df[existing_targets]

    # Hold out a test set (80:20 by default).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Columns are routed by dtype. Columns that are neither numeric nor
    # 'object' are not listed in either group, so the ColumnTransformer
    # (default remainder='drop') leaves them out.
    numerical_features = X.select_dtypes(include=np.number).columns
    categorical_features = X.select_dtypes(include='object').columns

    # Numeric columns: mean imputation followed by standardisation.
    numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                            ('scaler', StandardScaler())])

    # Categorical columns: mode imputation followed by one-hot encoding;
    # categories unseen during fit are encoded as all zeros.
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                              ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    return preprocessor, X_train, X_test, y_train, y_test

# --- Step 3: Handling Class Imbalance (e.g., with SMOTE) ---
# Mental health datasets are often imbalanced.[3]
# SMOTE is a common technique to mitigate this issue.[3, 8]

def apply_smote(X_train, y_train, target_column='Depression'):
    """Oversample the training data with SMOTE for a single target column.

    Args:
        X_train: Training features passed to ``SMOTE.fit_resample``.
        y_train: DataFrame of training targets; must expose ``.columns``.
        target_column: Name of the column in ``y_train`` to balance.
            Defaults to ``'Depression'``.

    Returns:
        ``(X_resampled, y_resampled)`` as returned by ``SMOTE.fit_resample``
        for the selected column. If ``target_column`` is not in ``y_train``,
        a message is printed and the inputs are returned unchanged.

    Note:
        Only one target is resampled; joint (multi-label) resampling would
        need a different approach.
    """
    if target_column not in y_train.columns:
        print(f"Error: '{target_column}' column not found in the target variables for SMOTE.")
        return X_train, y_train  # Nothing to resample against.

    # Build the sampler only once we know it will be used.
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train[target_column])
    return X_resampled, y_resampled

# --- Step 4: Model Training and Evaluation ---
# The abstract lists five models: Logistic Regression, Random Forest, SVM, XGBoost, and an ANN.
# We will evaluate each using the specified metrics.[9]

def _to_dense(matrix):
    """Return *matrix* as a dense array if it exposes ``toarray`` (e.g. SciPy sparse)."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


def _build_ann(input_dim):
    """Build the small feed-forward binary classifier reported as 'ANN'."""
    return Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')  # Sigmoid for binary classification (stress/depression).
    ])


def _score_predictions(y_true, y_pred, y_pred_proba):
    """Compute accuracy, precision, recall, F1 and ROC-AUC for one model."""
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1-score': f1_score(y_true, y_pred, zero_division=0),
        # ROC-AUC needs scores/probabilities; report 'N/A' when unavailable.
        'ROC-AUC': roc_auc_score(y_true, y_pred_proba) if y_pred_proba is not None else 'N/A',
    }


def train_and_evaluate_models(preprocessor, X_train, X_test, y_train, y_test,
                              target_column='Depression'):
    """Train the five models on one target column and score them on the test set.

    Models: Logistic Regression, Random Forest, SVM, XGBoost and a small
    Keras ANN. The scikit-learn style models are wrapped in a ``Pipeline``
    with ``preprocessor``; the ANN is trained on the preprocessor's
    (densified) output.

    Args:
        preprocessor: Unfitted transformer applied to the features. It is
            re-fitted for every model.
        X_train, X_test: Feature DataFrames.
        y_train, y_test: Target DataFrames containing ``target_column``.
        target_column: Target to predict. Defaults to ``'Depression'``. The
            metrics and the ANN's sigmoid output assume a binary target.

    Returns:
        ``{model_name: {'Accuracy', 'Precision', 'Recall', 'F1-score',
        'ROC-AUC'}}``. An empty dict is returned (with a printed message)
        when an input is ``None``, the target column is missing, or no rows
        with a known target remain.
    """
    # Check if preprocessing was successful and data is available.
    if any(item is None for item in (preprocessor, X_train, X_test, y_train, y_test)):
        print("Model training skipped due to missing data or preprocessor.")
        return {}

    # Validate the target before building any model.
    if target_column not in y_train.columns or target_column not in y_test.columns:
        print(f"Error: Target column '{target_column}' not found in the target variables.")
        return {}

    # Rows coming from datasets that lack this target carry NaN after the
    # merge; they cannot be used for fitting or scoring, so drop them.
    # Positional masks are used so the features only need matching length.
    train_mask = y_train[target_column].notna().to_numpy()
    test_mask = y_test[target_column].notna().to_numpy()
    if not train_mask.any() or not test_mask.any():
        print(f"Error: No rows with a known '{target_column}' value are available for training/testing.")
        return {}

    X_train = X_train.loc[train_mask]
    X_test = X_test.loc[test_mask]
    y_train_target = y_train[target_column].loc[train_mask]
    y_test_target = y_test[target_column].loc[test_mask]

    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),  # Good baseline model.
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(probability=True, random_state=42),  # probability=True enables predict_proba.
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    }

    results = {}

    for name, model in models.items():
        print(f"Training {name}...")

        # Preprocess and classify in one pipeline so train and test data go
        # through exactly the same transformation.
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', model)])
        pipeline.fit(X_train, y_train_target)

        y_pred = pipeline.predict(X_test)
        if hasattr(pipeline.named_steps['classifier'], 'predict_proba'):
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
        else:
            y_pred_proba = None

        results[name] = _score_predictions(y_test_target, y_pred, y_pred_proba)

    # --- ANN ---
    # The network is trained outside the sklearn Pipeline. Its input width
    # must match the number of columns AFTER preprocessing (one-hot encoding
    # can add columns), and the transformer output may be sparse.
    print("Training ANN...")
    X_train_processed = _to_dense(preprocessor.fit_transform(X_train))
    X_test_processed = _to_dense(preprocessor.transform(X_test))

    ann = _build_ann(X_train_processed.shape[1])
    ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    ann.fit(X_train_processed, y_train_target.to_numpy(), epochs=10, batch_size=32, verbose=0)

    ann_proba = ann.predict(X_test_processed).flatten()
    ann_pred = (ann_proba > 0.5).astype(int)
    results['ANN'] = _score_predictions(y_test_target, ann_pred, ann_proba)

    return results

# Final step: Print the results.
# print("Model Performance Results:")
# for model, metrics in results.items():
#     print(f"\n--- {model} ---")
#     for metric, value in metrics.items():
#         print(f"{metric}: {value:.4f}")

In [12]:
# Example usage of the functions:
# Load the survey datasets and merge them into one DataFrame.
# Replace with your actual file paths
file_paths = ['/content/student_depression_dataset.csv', '/content/StressLevelDataset.csv', '/content/Student_Mental_Stress_and_Coping_Mechanisms.csv', '/content/Mental Health Dataset.csv']
df = load_and_merge_datasets(file_paths)

# Print column names to help identify correct features and targets
print("DataFrame columns:")
print(df.columns)

# Preprocess the data (returns five None values if the expected columns are missing)
preprocessor, X_train, X_test, y_train, y_test = preprocess_data(df)

# Apply SMOTE (optional, uncomment if needed and adjust for multi-label if necessary)
# X_resampled, y_resampled = apply_smote(X_train, y_train)

# Train and evaluate the models (returns an empty dict if preprocessing failed)
results = train_and_evaluate_models(preprocessor, X_train, X_test, y_train, y_test)

# Print the results
print("Model Performance Results:")
for model, metrics in results.items():
    print(f"\n--- {model} ---")
    for metric, value in metrics.items():
        # ROC-AUC may be the string 'N/A', which cannot take a float format spec.
        if isinstance(value, (int, float)):
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}: {value}")

DataFrame columns:
Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression',
       'anxiety_level', 'self_esteem', 'mental_health_history', 'depression',
       'headache', 'blood_pressure', 'sleep_quality', 'breathing_problem',
       'noise_level', 'living_conditions', 'safety', 'basic_needs',
       'academic_performance', 'study_load', 'teacher_student_relationship',
       'future_career_concerns', 'social_support', 'peer_pressure',
       'extracurricular_activities', 'bullying', 'stress_level', 'Student ID',
       'Academic Performance (GPA)', 'Study Hours Per Week',
       'Social Media Usage (Hours per day)',
       'Sleep Duration (Hours per night)',
       'Physical Exercise (Hours per week)