<a href="https://colab.research.google.com/github/Hari-Priya-18/B6_PFDS_1372/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Label -> DataFrame registry iterated by the training loop below.
# NOTE(review): df1..df4 are not defined in this cell; they are presumably
# loaded by an earlier notebook cell -- confirm before running in isolation.
datasets = dict(
    Depression=df1,
    Stress=df2,
    Coping=df3,
    MentalHealth=df4,
)

# Label -> name of the column to predict; keys mirror those of `datasets`.
# The "Coping" and "MentalHealth" names are still unverified (the training
# loop skips both datasets for that reason) -- TODO confirm against the data.
target_columns = dict(
    Depression="Depression",            # TODO confirm column name
    Stress="Stress_Level",              # already corrected for the Stress dataset
    Coping="Stress Coping Mechanisms",  # TODO confirm column name
    MentalHealth="treatment",           # TODO confirm column name
)

# Train and evaluate one SMOTE + RandomForest pipeline per dataset.
#
# For every (label, DataFrame) pair in `datasets` the loop:
#   1. drops rows whose target is missing,
#   2. splits 80/20 (stratified whenever the class counts allow it),
#   3. grid-searches a preprocess -> SMOTE -> RandomForest pipeline (3-fold CV),
#   4. prints accuracy, the classification report and the best parameters.

# Loop-invariant settings, built once instead of on every iteration.
skipped_datasets = {"Coping", "MentalHealth"}
test_size = 0.2
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
}

for name, data in datasets.items():
    # Skip processing for Coping and MentalHealth datasets for now
    if name in skipped_datasets:
        print(f"\n===== Skipping {name} Dataset due to unresolved target column name =====")
        continue

    print(f"\n===== {name} Dataset =====")
    target = target_columns[name]

    # A wrong target name would otherwise surface as a KeyError inside dropna.
    if target not in data.columns:
        print(f"Target column '{target}' not found in {name} dataset; skipping.")
        continue

    # Drop rows with missing target
    data = data.dropna(subset=[target])
    if data.empty:
        print(f"No rows with a non-missing '{target}' in {name} dataset; skipping.")
        continue

    X = data.drop(columns=[target])
    y = data[target]

    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # SMOTE rejects NaN inputs and StandardScaler lets NaNs pass through, so
    # numeric gaps are median-imputed before scaling. With no missing values
    # the imputer is a no-op and results match the plain scaler.
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Create a column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
        remainder='passthrough',  # Keep other columns (if any)
    )

    # Stratify only when scikit-learn's preconditions hold: every class has at
    # least two members and both partitions can hold one sample per class.
    # Otherwise fall back to a plain random split instead of raising.
    class_counts = y.value_counts()
    n_classes = len(class_counts)
    n_test = int(np.ceil(test_size * len(y)))
    can_stratify = (
        class_counts.min() >= 2
        and n_test >= n_classes
        and len(y) - n_test >= n_classes
    )

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=42,
        stratify=y if can_stratify else None,
    )

    # Build pipeline with preprocessor, SMOTE, and Random Forest.
    # The imblearn pipeline applies SMOTE during fit only, so validation folds
    # and the test set are never resampled.
    pipe = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('clf', RandomForestClassifier(random_state=42)),
    ])

    # Perform GridSearchCV
    grid_search = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the best model (refit on the full training split by GridSearchCV)
    best_model = grid_search.best_estimator_

    # Predict with the best model
    y_pred = best_model.predict(X_test)

    # Accuracy + classification report
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    # zero_division=0 reports 0.0 for classes that were never predicted
    # instead of emitting an undefined-metric warning.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Best parameters found:", grid_search.best_params_)


===== Depression Dataset =====
