Feature Engineering

In [5]:
import pandas as pd

In [6]:
data = pd.read_csv('/home/any1/Documents/I2_Data_Science/Final_Project/data/processed_v2_eda_student_depression.csv')

In [7]:
# Preview the data
data.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,2.0,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,2.0,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,1.0,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,3.0,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,2.0,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [8]:
# Create Total Pressure feature
data['Total Pressure'] = data['Academic Pressure'] + data['Work Pressure']

In [9]:
# Bin CGPA into categories
data['CGPA_Category'] = pd.cut(data['CGPA'], bins=[0, 5, 7, 10], labels=['Low', 'Medium', 'High'])

In [10]:
# Normalize CGPA
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data['CGPA_normalized'] = scaler.fit_transform(data[['CGPA']])

In [11]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode
categorical_columns = ['Gender', 'City', 'Profession', 'Dietary Habits', 
                       'Degree', 'Have you ever had suicidal thoughts ?', 
                       'Family History of Mental Illness']

# Apply Label Encoding
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Save encoder for reverse mapping


In [12]:
# Identify important features using mutual information
from sklearn.feature_selection import mutual_info_classif

In [13]:
X = data.drop(['Depression', 'id'], axis=1)
y = data['Depression']

In [14]:
print(X.dtypes)


Gender                                      int64
Age                                       float64
City                                        int64
Profession                                  int64
Academic Pressure                         float64
Work Pressure                             float64
CGPA                                      float64
Study Satisfaction                        float64
Job Satisfaction                          float64
Sleep Duration                            float64
Dietary Habits                              int64
Degree                                      int64
Have you ever had suicidal thoughts ?       int64
Work/Study Hours                          float64
Financial Stress                          float64
Family History of Mental Illness            int64
Total Pressure                            float64
CGPA_Category                            category
CGPA_normalized                           float64
dtype: object


In [15]:
# One-hot encode categorical variables
X_encoded = pd.get_dummies(X, drop_first=True)

In [16]:
# Calculate feature importance
importance = mutual_info_classif(X_encoded, y)
feature_importance = pd.Series(importance, index=X_encoded.columns)
print(feature_importance.sort_values(ascending=False))

Have you ever had suicidal thoughts ?    0.157901
Academic Pressure                        0.122964
Total Pressure                           0.121201
Financial Stress                         0.072932
Age                                      0.032080
Dietary Habits                           0.024210
Work/Study Hours                         0.023081
Study Satisfaction                       0.017627
Family History of Mental Illness         0.010202
Degree                                   0.008962
CGPA_Category_High                       0.007911
CGPA                                     0.005181
Sleep Duration                           0.004839
Gender                                   0.004579
Profession                               0.003299
City                                     0.002846
CGPA_Category_Medium                     0.002300
Job Satisfaction                         0.001145
CGPA_normalized                          0.000914
Work Pressure                            0.000000


In [17]:
import numpy as np

# Check for NaN or infinite values
print(np.any(np.isnan(X)))
print(np.any(np.isinf(X)))

# Replace problematic values if any
X = X.replace([np.inf, -np.inf], np.nan).dropna()


TypeError: Object with dtype category cannot perform the numpy op isnan

Machine Learning Implementation

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [19]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [20]:
# Train Logistic Regression model
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Train Decision Tree model
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)

In [22]:
# Evaluate Logistic Regression
y_pred_lr = model_lr.predict(X_test)
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_lr))
print(f"Logistic Regression ROC-AUC: {roc_auc_score(y_test, model_lr.predict_proba(X_test)[:, 1])}")


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.82      0.78      0.80      2343
           1       0.85      0.88      0.86      3238

    accuracy                           0.84      5581
   macro avg       0.83      0.83      0.83      5581
weighted avg       0.84      0.84      0.84      5581

Logistic Regression ROC-AUC: 0.9136414119885051


In [23]:
# Evaluate Decision Tree
y_pred_dt = model_dt.predict(X_test)
print("Decision Tree Report:")
print(classification_report(y_test, y_pred_dt))
print(f"Decision Tree ROC-AUC: {roc_auc_score(y_test, model_dt.predict_proba(X_test)[:, 1])}")


Decision Tree Report:
              precision    recall  f1-score   support

           0       0.70      0.71      0.71      2343
           1       0.79      0.78      0.79      3238

    accuracy                           0.75      5581
   macro avg       0.75      0.75      0.75      5581
weighted avg       0.75      0.75      0.75      5581

Decision Tree ROC-AUC: 0.7466046207053088


In [24]:
feature_importances = model_dt.feature_importances_
for name, importance in zip(X.columns, feature_importances):
    print(f"{name}: {importance:.2f}")


Gender: 0.01
Age: 0.07
City: 0.07
Profession: 0.00
Academic Pressure: 0.06
Work Pressure: 0.00
CGPA: 0.05
Study Satisfaction: 0.04
Job Satisfaction: 0.00
Sleep Duration: 0.03
Dietary Habits: 0.03
Degree: 0.06
Have you ever had suicidal thoughts ?: 0.30
Work/Study Hours: 0.06
Financial Stress: 0.07
Family History of Mental Illness: 0.02
Total Pressure: 0.08
CGPA_Category: 0.05
CGPA_normalized: 0.00


In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)


Best Parameters: {'max_depth': 5, 'min_samples_split': 5}
Best ROC-AUC: 0.8979697222548587


In [27]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Decision Tree
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Perform GridSearchCV
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

# Display best parameters and ROC-AUC score
print(f"Best Parameters: {grid.best_params_}")
print(f"Best ROC-AUC: {grid.best_score_}")


Best Parameters: {'max_depth': 5, 'min_samples_split': 10}
Best ROC-AUC: 0.8979697222548587
