In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("students_mental_health_survey (1).csv")


In [3]:
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_data = pd.DataFrame({'Missing Values': missing_values, 'Percentage (%)': missing_percentage})
print(missing_data[missing_data['Missing Values'] > 0])


               Missing Values  Percentage (%)
CGPA                       12        0.170891
Substance_Use              15        0.213614


In [5]:
# Filling missing CGPA with mean
df['CGPA'].fillna(df['CGPA'].mean(), inplace=True)

# Filling missing Substance_Use with mode
df['Substance_Use'].fillna(df['Substance_Use'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CGPA'].fillna(df['CGPA'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Substance_Use'].fillna(df['Substance_Use'].mode()[0], inplace=True)


In [7]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Course', 'Gender', 'Relationship_Status', 'Sleep_Quality', 
                    'Physical_Activity', 'Diet_Quality', 'Social_Support', 
                    'Counseling_Service_Use', 'Chronic_Illness', 
                    'Extracurricular_Involvement', 'Residence_Type', 
                    'Substance_Use', 'Family_History']

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature Engineering
df['Mental_Health_Score'] = (df['Stress_Level'] + df['Depression_Score'] + df['Anxiety_Score']) / 3
df['Lifestyle_Score'] = (df['Sleep_Quality'] + df['Physical_Activity'] + df['Diet_Quality'] + df['Social_Support']) / 4
df['Academic_Stress'] = df['Semester_Credit_Load'] * df['Stress_Level']
df['Financial_Impact'] = df['Financial_Stress'] * (df['Depression_Score'] + df['Anxiety_Score']) / 2

# Target variable
y = (df['Mental_Health_Score'] > 2).astype(int)

# Features
df.drop(columns=['Mental_Health_Score', 'Stress_Level', 'Depression_Score', 'Anxiety_Score'], inplace=True)
X = df.copy()

# Scaling numerical features
scaler = StandardScaler()
numerical_cols = ['Lifestyle_Score', 'Academic_Stress', 'Financial_Impact', 'CGPA',
                  'Financial_Stress', 'Semester_Credit_Load', 'Age']
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Balancing the data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


In [9]:
!pip install lightgbm



In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

voting_model = VotingClassifier(
    estimators=[
        ('xgb', XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('lgbm', LGBMClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'  # or 'hard'
)

voting_model.fit(X_train, y_train)
voting_pred = voting_model.predict(X_test)
voting_acc = accuracy_score(y_test, voting_pred)

print(f"Voting Classifier Accuracy: {voting_acc:.4f}")


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 3150, number of negative: 3149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1799
[LightGBM] [Info] Number of data points in the train set: 6299, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500079 -> initscore=0.000318
[LightGBM] [Info] Start training from score 0.000318
Voting Classifier Accuracy: 0.9575
