<a href="https://colab.research.google.com/github/Hari-Priya-18/B6_PFDS_1372/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Datasets keyed by a short label. df1..df4 are the DataFrames read from CSV
# in the data-loading cell, which has to be executed before this one.
datasets = {
    "Depression": df1,
    "Stress": df2,
    "Coping": df3,
    "MentalHealth": df4,
}

# Label column for each dataset. pandas column lookups are case-sensitive, so
# every value must match the CSV header exactly.
target_columns = {
    "Depression": "Depression",
    # Lower-case, matching the column the XGBoost cell trains on for this same
    # frame; "Stress_Level" is not found by dropna(subset=[...]) -> KeyError.
    "Stress": "stress_level",
    # NOTE(review): the two names below are not exercised by any recorded run;
    # confirm them against the CSV headers.
    "Coping": "Stress Coping Mechanisms",
    "MentalHealth": "treatment",
}

for name, data in datasets.items():
    # For every dataset: preprocess, oversample with SMOTE, grid-search a
    # random forest, then report hold-out metrics. The names bound here
    # (X_train, X_test, y_train, y_test, ...) stay visible at notebook level
    # after the loop and hold the values from the last dataset processed.
    print(f"\n===== {name} Dataset =====")
    target = target_columns[name]

    # Rows without a label cannot be used for supervised training.
    data = data.dropna(subset=[target])
    y = data[target]
    X = data.drop(columns=[target])

    # Hold out 20% for the final evaluation. The split is intentionally not
    # stratified: stratifying caused trouble with multi-class targets and the
    # small per-class counts left after dropping unlabeled rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )

    # Route columns by dtype: numeric columns are standardised, object /
    # category columns are one-hot encoded, anything else passes through.
    numerical_features = X.select_dtypes(include=np.number).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    preprocessor = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
    )

    # imblearn's pipeline is used so the SMOTE step sits between the
    # preprocessing and the classifier.
    pipe = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('clf', RandomForestClassifier(random_state=42)),
    ])

    # Candidate forest sizes and depths, evaluated with 3-fold CV on accuracy.
    param_grid = {
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [None, 10, 20, 30],
    }
    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Score the best configuration on the held-out rows.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print("Best parameters found:", grid_search.best_params_)

In [3]:
import pandas as pd

# Read the four source CSVs from the Colab /content directory, in order, into
# df1..df4 (depression, stress level, coping mechanisms, mental health).
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/student_depression_dataset.csv',
        '/content/StressLevelDataset.csv',
        '/content/Student_Mental_Stress_and_Coping_Mechanisms.csv',
        '/content/Mental Health Dataset.csv',
    )
)

In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

def run_xgboost(dataset, target_col, name):
    """Train an XGBoost classifier on ``dataset`` and print its test metrics.

    Rows whose ``target_col`` is missing are dropped and 20% of the remaining
    rows are held out (unstratified, seed 42). Numeric columns are
    standardised and object/category columns are one-hot encoded, with the
    preprocessing fitted on the training rows only. Accuracy and a
    classification report for the held-out rows are printed.

    Args:
        dataset: DataFrame containing the feature columns and the target.
        target_col: Name of the label column in ``dataset``.
        name: Label shown in the printed header.

    Returns:
        None. Results are printed, not returned.
    """
    print(f"\n===== XGBoost Results for {name} =====")

    labelled = dataset.dropna(subset=[target_col])
    features = labelled.drop(columns=[target_col], errors="ignore")
    labels = labelled[target_col]

    # Column routing by dtype; columns that are neither numeric nor
    # object/category are passed through untouched.
    numeric_cols = features.select_dtypes(include=np.number).columns
    categorical_cols = features.select_dtypes(include=['object', 'category']).columns
    encoder = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ],
    )

    # Not stratified: stratifying was removed because of potential issues with
    # multi-class targets and small sample sizes after dropping NaNs.
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, random_state=42, test_size=0.2
    )

    # Fit the preprocessing on the training rows only, then reuse it for test.
    train_matrix = encoder.fit_transform(train_X)
    test_matrix = encoder.transform(test_X)

    xgb_params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "eval_metric": "logloss",
        "random_state": 42,
    }
    model = XGBClassifier(**xgb_params)
    model.fit(train_matrix, train_y)
    predictions = model.predict(test_matrix)

    print("Accuracy:", accuracy_score(test_y, predictions))
    print(classification_report(test_y, predictions))

# Depression dataset: the label column is "Depression".
run_xgboost(dataset=df1, target_col="Depression", name="Depression")

# Stress dataset: the label column is the lower-case "stress_level".
run_xgboost(dataset=df2, target_col="stress_level", name="Stress")


===== XGBoost Results for Depression =====
Accuracy: 0.83049632682315
              precision    recall  f1-score   support

           0       0.81      0.78      0.79      2343
           1       0.84      0.87      0.86      3238

    accuracy                           0.83      5581
   macro avg       0.83      0.82      0.83      5581
weighted avg       0.83      0.83      0.83      5581


===== XGBoost Results for Stress =====
Accuracy: 0.8863636363636364
              precision    recall  f1-score   support

           0       0.90      0.84      0.87        76
           1       0.86      0.90      0.88        73
           2       0.90      0.92      0.91        71

    accuracy                           0.89       220
   macro avg       0.89      0.89      0.89       220
weighted avg       0.89      0.89      0.89       220



In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Randomized hyper-parameter search for a random forest. X_train and y_train
# must already exist in the notebook session; this cell uses whatever values
# they currently hold.

# Split the columns by dtype so each group gets the right transformer.
numerical_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Standardise numeric columns, one-hot encode object/category columns, and
# pass any remaining columns through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)

# Preprocessing and classifier are bundled so each CV fold refits both.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Search space; the "clf__" prefix addresses the classifier step of the pipeline.
param_dist = {
    "clf__n_estimators": [100, 200, 300, 500],
    "clf__max_depth": [None, 10, 20, 30],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2"],
}

# 20 sampled configurations, each scored by 5-fold CV accuracy.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
print("Best CV accuracy:", search.best_score_)

In [None]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Soft-voting ensemble of a random forest and an XGBoost model.
# RandomForestClassifier, XGBClassifier and the X_train / X_test / y_train /
# y_test split are expected to exist already from earlier cells.
voting = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=300, learning_rate=0.05, random_state=42)),
    ],
    voting='soft'
)

# The raw feature frame may contain string-valued (object/category) columns,
# which the tree models cannot be fitted on directly. Apply the same
# dtype-based preprocessing the other model cells use: standardise numeric
# columns, one-hot encode categorical ones, pass everything else through.
voting_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(),
         X_train.select_dtypes(include=np.number).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'),
         X_train.select_dtypes(include=['object', 'category']).columns),
    ],
    remainder='passthrough'
)

# Bundling preprocessing with the ensemble means the encoder is fitted on the
# training rows only and then re-applied, unchanged, when scoring X_test.
voting_pipe = Pipeline([
    ('preprocessor', voting_preprocessor),
    ('voting', voting),
])

voting_pipe.fit(X_train, y_train)
print("Ensemble Accuracy:", voting_pipe.score(X_test, y_test))
