<a href="https://colab.research.google.com/github/Hari-Priya-18/B6_PFDS_1372/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
import numpy as np

datasets = {
    "Depression": df1,
    "Stress": df2,

}

target_columns = {
    "Depression": "Depression",   # change to actual target column name
    "Stress": "stress_level",     # Corrected target column name for Stress dataset
}

for name, data in datasets.items():
    # Skip processing for Coping and MentalHealth datasets for now
    # if name in ["Coping", "MentalHealth"]:
    #     print(f"\n===== Skipping {name} Dataset due to unresolved target column name =====")
    #     continue

    print(f"\n===== {name} Dataset =====")
    target = target_columns[name]

    # Drop rows with missing target
    data = data.dropna(subset=[target])

    X = data.drop(columns=[target])
    y = data[target]

    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Create a column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough' # Keep other columns (if any)
    )

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42 # Removed stratify due to potential issues with multi-class targets and small sample sizes after dropping NaNs
    )

    # Build pipeline with preprocessor, SMOTE, and Random Forest
    pipe = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('clf', RandomForestClassifier(random_state=42))
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [None, 10, 20, 30]
    }

    # Perform GridSearchCV
    grid_search = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict with the best model
    y_pred = best_model.predict(X_test)

    # Accuracy + classification report
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print("Best parameters found:", grid_search.best_params_)


===== Depression Dataset =====
Accuracy: 0.8288837125963089
              precision    recall  f1-score   support

           0       0.81      0.78      0.79      2343
           1       0.84      0.86      0.85      3238

    accuracy                           0.83      5581
   macro avg       0.83      0.82      0.82      5581
weighted avg       0.83      0.83      0.83      5581

Best parameters found: {'clf__max_depth': 20, 'clf__n_estimators': 200}

===== Stress Dataset =====
Accuracy: 0.8636363636363636
              precision    recall  f1-score   support

           0       0.84      0.87      0.85        76
           1       0.91      0.85      0.88        73
           2       0.85      0.87      0.86        71

    accuracy                           0.86       220
   macro avg       0.87      0.86      0.86       220
weighted avg       0.87      0.86      0.86       220

Best parameters found: {'clf__max_depth': 10, 'clf__n_estimators': 300}


In [2]:
import pandas as pd

# Load the datasets
df1 = pd.read_csv('/content/student_depression_dataset.csv')
df2 = pd.read_csv('/content/StressLevelDataset.csv')

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

def run_xgboost(dataset, target_col, name):
    print(f"\n===== XGBoost Results for {name} =====")

    dataset = dataset.dropna(subset=[target_col])
    X = dataset.drop(columns=[target_col], errors="ignore")
    y = dataset[target_col]

    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Create a column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough' # Keep other columns (if any)
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42 # Removed stratify due to potential issues with multi-class targets and small sample sizes after dropping NaNs
    )

    # Apply preprocessing
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)


    model = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )

    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)

    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))

# Run on Depression dataset
run_xgboost(df1, "Depression", "Depression")

# Run on Stress dataset
run_xgboost(df2, "stress_level", "Stress")


===== XGBoost Results for Depression =====
Accuracy: 0.83049632682315
              precision    recall  f1-score   support

           0       0.81      0.78      0.79      2343
           1       0.84      0.87      0.86      3238

    accuracy                           0.83      5581
   macro avg       0.83      0.82      0.83      5581
weighted avg       0.83      0.83      0.83      5581


===== XGBoost Results for Stress =====
Accuracy: 0.8863636363636364
              precision    recall  f1-score   support

           0       0.90      0.84      0.87        76
           1       0.86      0.90      0.88        73
           2       0.90      0.92      0.91        71

    accuracy                           0.89       220
   macro avg       0.89      0.89      0.89       220
weighted avg       0.89      0.89      0.89       220



In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Assuming X_train and y_train are already defined and contain the data

# Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
numerical_features = X_train.select_dtypes(include=np.number).columns

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)

# Create a pipeline with preprocessing and the RandomForestClassifier
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

param_dist = {
    "clf__n_estimators": [100, 200, 300, 500],
    "clf__max_depth": [None, 10, 20, 30],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2"]
}

search = RandomizedSearchCV(pipe, param_distributions=param_dist,
                            n_iter=20, cv=5, scoring="accuracy",
                            n_jobs=-1, random_state=42)

search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
print("Best CV accuracy:", search.best_score_)

Best parameters: {'clf__n_estimators': 300, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__max_depth': None}
Best CV accuracy: 0.8875


In [7]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier # Added import for XGBClassifier

voting = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=300, learning_rate=0.05, random_state=42)),
    ],
    voting='soft'
)

voting.fit(X_train, y_train)
print("Ensemble Accuracy:", voting.score(X_test, y_test))

Ensemble Accuracy: 0.8818181818181818


In [8]:
from sklearn.model_selection import cross_val_score
import numpy as np

model = XGBClassifier(
    n_estimators=500, max_depth=8, learning_rate=0.05,
    subsample=0.9, colsample_bytree=0.9, eval_metric="logloss", random_state=42
)

scores = cross_val_score(model, X, y, cv=10, scoring="accuracy")
print("Cross-validation accuracies:", scores)
print("Mean accuracy:", np.mean(scores))


Cross-validation accuracies: [0.9        0.9        0.88181818 0.9        0.80909091 0.92727273
 0.88181818 0.8        0.95454545 0.84545455]
Mean accuracy: 0.8800000000000001


In [11]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stack = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=500, learning_rate=0.05, random_state=42))
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

stack.fit(X_train, y_train)
print("Stacking Accuracy:", stack.score(X_test, y_test))


Stacking Accuracy: 0.8636363636363636


In [29]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer # Import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline # Import Pipeline for consistency
import numpy as np
# from xgboost.callback import EarlyStopping # Import EarlyStopping
import pandas as pd # Import pandas

def run_xgboost_tuned(dataset, target_col, name):
    print(f"\n===== Tuned XGBoost Results for {name} =====")

    # Create a copy to avoid modifying the original dataframe
    data = dataset.copy()

    # Drop rows with missing target
    data = data.dropna(subset=[target_col])
    X = data.drop(columns=[target_col], errors="ignore")
    y = data[target_col]

    # --- Feature Engineering ---
    if name == "Depression":
        # Ensure all relevant columns are numeric
        cols_to_convert = ['Work/Study Hours', 'Sleep Duration', 'Social Media Usage (Hours per day)', 'Financial Stress', 'Academic Pressure', 'Extracurricular Activities', 'Peer Pressure']
        for col in cols_to_convert:
            if col in X.columns:
                X[col] = pd.to_numeric(X[col], errors='coerce')

        # Example Feature Engineering for Depression dataset
        if 'Work/Study Hours' in X.columns and 'Sleep Duration' in X.columns:
            X['study_sleep_ratio'] = X['Work/Study Hours'] / (X['Sleep Duration'] + 1) # Add 1 to avoid division by zero
        if 'Social Media Usage (Hours per day)' in X.columns and 'Work/Study Hours' in X.columns:
             X['social_study_ratio'] = X['Social Media Usage (Hours per day)'] / (X['Work/Study Hours'] + 1)
        # Removed the problematic line causing TypeError
        # if 'Financial Stress' in X.columns and 'Academic Pressure' in X.columns:
        #      X['financial_academic_interaction'] = X['Financial Stress'] * X['Academic Pressure']
        # Additional Features for Depression
        if 'Academic Pressure' in X.columns and 'Extracurricular Activities' in X.columns:
            X['academic_extracurricular_balance'] = X['Academic Pressure'] - X['Extracurricular Activities']
        if 'Social Media Usage (Hours per day)' in X.columns and 'Peer Pressure' in X.columns:
            X['social_peer_interaction'] = X['Social Media Usage (Hours per day)'] * X['Peer Pressure']


    elif name == "Stress":
        # Ensure all relevant columns are numeric
        cols_to_convert = ['anxiety_level', 'self_esteem', 'sleep_duration', 'physical_activity', 'depression', 'headache', 'blood_pressure', '2d_echo', 'cardiac_stress_test', 'sugar_level', 'bmi', 'heart_rate', 'lung_capacity', 'cholesterol_level', 'income_level', 'future_career_concerns', 'social_support', 'peer_pressure', 'extracurricular_activities', 'bullying']
        for col in cols_to_convert:
            if col in X.columns:
                 X[col] = pd.to_numeric(X[col], errors='coerce')

        # Example Feature Engineering for Stress dataset
        if 'anxiety_level' in X.columns and 'self_esteem' in X.columns:
            X['anxiety_self_esteem_ratio'] = X['anxiety_level'] / (X['self_esteem'] + 1)
        if 'sleep_duration' in X.columns and 'physical_activity' in X.columns:
             X['sleep_activity_interaction'] = X['sleep_duration'] * X['physical_activity']
        if 'anxiety_level' in X.columns and 'depression' in X.columns:
             X['anxiety_depression_sum'] = X['anxiety_level'] + X['depression']
        # Additional Features for Stress
        if 'heart_rate' in X.columns and 'blood_pressure' in X.columns:
            X['cardio_indicator'] = X['heart_rate'] * X['blood_pressure']
        if 'future_career_concerns' in X.columns and 'income_level' in X.columns:
            X['career_income_stress'] = X['future_career_concerns'] / (X['income_level'] + 1)
        if 'social_support' in X.columns and 'peer_pressure' in X.columns:
             X['social_peer_balance'] = X['social_support'] - X['peer_pressure']


    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 1 else None
    )

    # Identify and drop columns with all NaN values AFTER train/test split and feature engineering
    cols_with_all_nan = X_train.columns[X_train.isnull().all()].tolist()
    if cols_with_all_nan:
        print(f"Dropping columns with all NaN values: {cols_with_all_nan}")
        X_train = X_train.drop(columns=cols_with_all_nan)
        X_test = X_test.drop(columns=cols_with_all_nan) # Drop same columns from test set

    # Identify categorical and numerical features AFTER dropping columns
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
    numerical_features = X_train.select_dtypes(include=np.number).columns

    # Create a preprocessor with imputation and scaling for numerical features
    numerical_transformer = ImbPipeline([ # Use ImbPipeline here for consistency if needed later, otherwise Pipeline is fine
        ('imputer', SimpleImputer(strategy='mean')), # Impute missing values
        ('scaler', StandardScaler())
    ])

    # Create a column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features), # Use the numerical pipeline
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Apply preprocessing
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Handle imbalance if classes are skewed (only for binary classification)
    scale_pos_weight = float(np.sum(y_train == 0)) / np.sum(y_train == 1) if len(np.unique(y_train)) == 2 else 1
    # Set eval_metric based on the number of classes
    eval_metric_name = "logloss" if len(np.unique(y)) <= 2 else "mlogloss"


    # Tuned XGBoost
    model = XGBClassifier(
        n_estimators=1500,        # Increased estimators
        learning_rate=0.02,       # Slightly decreased learning rate
        max_depth=9,              # Increased depth
        subsample=0.9,
        colsample_bytree=0.8,     # Slightly decreased colsample
        gamma=1,                  # regularization
        reg_lambda=2,             # L2 regularization
        scale_pos_weight=scale_pos_weight,
        eval_metric=eval_metric_name, # Use selected eval_metric
        random_state=42,
        use_label_encoder=False
    )

    # Early stopping - Removed early_stopping_rounds as it's not supported
    model.fit(
        X_train_processed, y_train,
        eval_set=[(X_test_processed, y_test)],
        # early_stopping_rounds=30 # Use early_stopping_rounds argument - REMOVED
    )

    y_pred = model.predict(X_test_processed)

    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))

# Run on Depression dataset
run_xgboost_tuned(df1, "Depression", "Depression")

# Run on Stress dataset
run_xgboost_tuned(df2, "stress_level", "Stress")


===== Tuned XGBoost Results for Depression =====
Dropping columns with all NaN values: ['Sleep Duration', 'study_sleep_ratio']
[0]	validation_0-logloss:0.68306
[1]	validation_0-logloss:0.67387
[2]	validation_0-logloss:0.66463
[3]	validation_0-logloss:0.65612
[4]	validation_0-logloss:0.64746
[5]	validation_0-logloss:0.63932
[6]	validation_0-logloss:0.63259
[7]	validation_0-logloss:0.62523
[8]	validation_0-logloss:0.61784
[9]	validation_0-logloss:0.61048
[10]	validation_0-logloss:0.60338


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[11]	validation_0-logloss:0.59692
[12]	validation_0-logloss:0.59044
[13]	validation_0-logloss:0.58399
[14]	validation_0-logloss:0.57877
[15]	validation_0-logloss:0.57277
[16]	validation_0-logloss:0.56697
[17]	validation_0-logloss:0.56142
[18]	validation_0-logloss:0.55612
[19]	validation_0-logloss:0.55211
[20]	validation_0-logloss:0.54706
[21]	validation_0-logloss:0.54218
[22]	validation_0-logloss:0.53773
[23]	validation_0-logloss:0.53300
[24]	validation_0-logloss:0.52840
[25]	validation_0-logloss:0.52412
[26]	validation_0-logloss:0.51981
[27]	validation_0-logloss:0.51571
[28]	validation_0-logloss:0.51166
[29]	validation_0-logloss:0.50770
[30]	validation_0-logloss:0.50406
[31]	validation_0-logloss:0.50089
[32]	validation_0-logloss:0.49743
[33]	validation_0-logloss:0.49471
[34]	validation_0-logloss:0.49181
[35]	validation_0-logloss:0.48848
[36]	validation_0-logloss:0.48538
[37]	validation_0-logloss:0.48268
[38]	validation_0-logloss:0.48010
[39]	validation_0-logloss:0.47758
[40]	validatio

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[65]	validation_0-mlogloss:0.41010
[66]	validation_0-mlogloss:0.40608
[67]	validation_0-mlogloss:0.40218
[68]	validation_0-mlogloss:0.39839
[69]	validation_0-mlogloss:0.39458
[70]	validation_0-mlogloss:0.39101
[71]	validation_0-mlogloss:0.38728
[72]	validation_0-mlogloss:0.38386
[73]	validation_0-mlogloss:0.38066
[74]	validation_0-mlogloss:0.37759
[75]	validation_0-mlogloss:0.37435
[76]	validation_0-mlogloss:0.37138
[77]	validation_0-mlogloss:0.36815
[78]	validation_0-mlogloss:0.36486
[79]	validation_0-mlogloss:0.36206
[80]	validation_0-mlogloss:0.35925
[81]	validation_0-mlogloss:0.35673
[82]	validation_0-mlogloss:0.35413
[83]	validation_0-mlogloss:0.35146
[84]	validation_0-mlogloss:0.34865
[85]	validation_0-mlogloss:0.34589
[86]	validation_0-mlogloss:0.34340
[87]	validation_0-mlogloss:0.34072
[88]	validation_0-mlogloss:0.33843
[89]	validation_0-mlogloss:0.33626
[90]	validation_0-mlogloss:0.33391
[91]	validation_0-mlogloss:0.33170
[92]	validation_0-mlogloss:0.32965
[93]	validation_0-ml