In [13]:
# Core tools
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", palette="pastel")

# Modeling tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score

# Load dataset
df = pd.read_csv("Cleaned_Final.csv")

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2022 entries, 0 to 2021
Data columns (total 51 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Age                                                              2022 non-null   float64
 1   Academic_Year                                                    2022 non-null   int64  
 2   Current_CGPA                                                     2022 non-null   float64
 3   waiver_or_scholarship                                            2022 non-null   int64  
 4   PSS1                                                             2022 non-null   int64  
 5   PSS2                                                             2022 non-null   int64  
 6   PSS3                                                             2022 non-null   int64  
 7   PSS4                                      

In [15]:
# ===============================
# 🧪 Feature Selection & Train-Test Split
# ===============================

# 1. Drop PHQ items and leakage-related columns
phq_cols = [col for col in df.columns if col.startswith("PHQ")]
leakage_cols = [
    "Depression_Value", "Anxiety_Value", "Stress_Value",
    "Anxiety_Label", "Stress_Label"
]
columns_to_drop = [col for col in phq_cols + leakage_cols if col in df.columns]

# 2. Target before dropping
y = df["Depression_Label"]

# 3. Drop leakage columns
X = df.drop(columns=columns_to_drop + ["Depression_Label"])

# 4. Encode target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 5. Standardize features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 6. Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# 7. Class label mapping (for interpretation)
label_encoder.classes_


array(['Mild Depression', 'Minimal Depression', 'Moderate Depression',
       'Moderately Severe Depression', 'No Depression',
       'Severe Depression'], dtype=object)

In [16]:
import xgboost 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [17]:
# Initialize cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'f1_weighted': 'f1_weighted'
}

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': xgboost.XGBClassifier(random_state=42, use_label_encoder=False)
}

# Perform cross-validation for each model
cv_results = {}
for name, model in models.items():
    print(f"\n🔍 {name}")
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_scaled, y_encoded, cv=cv, scoring='accuracy')
    cv_f1 = cross_val_score(model, X_scaled, y_encoded, cv=cv, scoring='f1_weighted')
    
    print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Cross-validation F1 Score: {cv_f1.mean():.4f} (+/- {cv_f1.std() * 2:.4f})")
    
    # Store results
    cv_results[name] = {
        'accuracy_mean': cv_scores.mean(),
        'accuracy_std': cv_scores.std(),
        'f1_mean': cv_f1.mean(),
        'f1_std': cv_f1.std()
    }

# Create a DataFrame to display results
cv_results_df = pd.DataFrame(cv_results).T
print("\n📊 Cross-validation Results Summary:")
print(cv_results_df)


🔍 Logistic Regression
Cross-validation Accuracy: 0.4872 (+/- 0.0461)
Cross-validation F1 Score: 0.4783 (+/- 0.0482)

🔍 Random Forest
Cross-validation Accuracy: 0.5084 (+/- 0.0564)
Cross-validation F1 Score: 0.4991 (+/- 0.0587)

🔍 XGBoost
Cross-validation Accuracy: 0.5292 (+/- 0.0541)
Cross-validation F1 Score: 0.5244 (+/- 0.0584)

📊 Cross-validation Results Summary:
                     accuracy_mean  accuracy_std   f1_mean    f1_std
Logistic Regression       0.487164      0.023031  0.478324  0.024079
Random Forest             0.508437      0.028202  0.499095  0.029344
XGBoost                   0.529197      0.027040  0.524432  0.029217


In [18]:
# ===============================
# ⚙️ Handle Class Imbalance with Normalized Class Weights
# ===============================

# Calculate class weights
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded
)

# Normalize class weights to sum to 1
class_weights_normalized = class_weights / np.sum(class_weights)

# Create dictionaries of class weights (both original and normalized)
class_weight_dict = dict(zip(np.unique(y_encoded), class_weights))
class_weight_dict_normalized = dict(zip(np.unique(y_encoded), class_weights_normalized))

print("Original class weights:")
for class_label, weight in zip(label_encoder.classes_, class_weights):
    print(f"{class_label}: {weight:.2f}")

print("\nNormalized class weights:")
for class_label, weight in zip(label_encoder.classes_, class_weights_normalized):
    print(f"{class_label}: {weight:.2f}")

# Initialize models with normalized class weights
weighted_models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, 
        max_iter=1000,
        class_weight=class_weight_dict_normalized
    ),
    'Random Forest': RandomForestClassifier(
        random_state=42,
        class_weight=class_weight_dict_normalized
    ),
    'XGBoost': xgboost.XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        scale_pos_weight=len(y_encoded) / (2 * np.bincount(y_encoded)[1:].sum())  # XGBoost specific weight
    )
}

# Perform cross-validation with weighted models
weighted_cv_results = {}
for name, model in weighted_models.items():
    print(f"\n🔍 {name} (with normalized class weights)")
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_scaled, y_encoded, cv=cv, scoring='accuracy')
    cv_f1 = cross_val_score(model, X_scaled, y_encoded, cv=cv, scoring='f1_weighted')
    
    print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Cross-validation F1 Score: {cv_f1.mean():.4f} (+/- {cv_f1.std() * 2:.4f})")
    
    # Store results
    weighted_cv_results[name] = {
        'accuracy_mean': cv_scores.mean(),
        'accuracy_std': cv_scores.std(),
        'f1_mean': cv_f1.mean(),
        'f1_std': cv_f1.std()
    }

# Create a DataFrame to display results
weighted_cv_results_df = pd.DataFrame(weighted_cv_results).T
print("\n📊 Cross-validation Results with Normalized Class Weights:")
print(weighted_cv_results_df)

# Compare with previous results
print("\n📈 Performance Comparison (with vs without class weights):")
comparison_df = pd.DataFrame({
    'Without Weights': cv_results_df['accuracy_mean'],
    'With Normalized Weights': weighted_cv_results_df['accuracy_mean'],
    'Improvement': weighted_cv_results_df['accuracy_mean'] - cv_results_df['accuracy_mean']
})
print(comparison_df)

Original class weights:
Mild Depression: 0.82
Minimal Depression: 3.47
Moderate Depression: 0.74
Moderately Severe Depression: 0.66
No Depression: 7.66
Severe Depression: 0.67

Normalized class weights:
Mild Depression: 0.06
Minimal Depression: 0.25
Moderate Depression: 0.05
Moderately Severe Depression: 0.05
No Depression: 0.55
Severe Depression: 0.05

🔍 Logistic Regression (with normalized class weights)
Cross-validation Accuracy: 0.4278 (+/- 0.0366)
Cross-validation F1 Score: 0.4299 (+/- 0.0438)

🔍 Random Forest (with normalized class weights)
Cross-validation Accuracy: 0.5173 (+/- 0.0699)
Cross-validation F1 Score: 0.5088 (+/- 0.0729)

🔍 XGBoost (with normalized class weights)
Cross-validation Accuracy: 0.5292 (+/- 0.0541)
Cross-validation F1 Score: 0.5244 (+/- 0.0584)

📊 Cross-validation Results with Normalized Class Weights:
                     accuracy_mean  accuracy_std   f1_mean    f1_std
Logistic Regression       0.427808      0.018282  0.429877  0.021923
Random Forest      

## Here I drop the linear regression and random forest since they're less performant 

In [19]:
# ===============================
# 🚀 Final XGBoost Model Pipeline
# ===============================

# Import all necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

def prepare_data(df):
    """Prepare the data for modeling"""
    # 1. Drop PHQ items and leakage-related columns
    phq_cols = [col for col in df.columns if col.startswith("PHQ")]
    leakage_cols = [
        "Depression_Value", "Anxiety_Value", "Stress_Value",
        "Anxiety_Label", "Stress_Label"
    ]
    columns_to_drop = [col for col in phq_cols + leakage_cols if col in df.columns]
    
    # 2. Get target before dropping
    y = df["Depression_Label"]
    
    # 3. Drop leakage columns
    X = df.drop(columns=columns_to_drop + ["Depression_Label"])
    
    # 4. Encode target
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    # 5. Standardize features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    return X_scaled, y_encoded, label_encoder

def train_xgboost_model(X, y):
    """Train and evaluate XGBoost model"""
    # Initialize cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    
    # Initialize XGBoost model
    model = xgb.XGBClassifier(
        random_state=RANDOM_SEED,
        use_label_encoder=False,
        scale_pos_weight=len(y) / (2 * np.bincount(y)[1:].sum())
    )
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    cv_f1 = cross_val_score(model, X, y, cv=cv, scoring='f1_weighted')
    
    print("Cross-validation Results:")
    print(f"Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"F1 Score: {cv_f1.mean():.4f} (+/- {cv_f1.std() * 2:.4f})")
    
    # Train final model on full dataset
    model.fit(X, y)
    
    return model

def main():
    # Load data
    print("Loading data...")
    df = pd.read_csv("Cleaned_Final.csv")
    
    # Prepare data
    print("\nPreparing data...")
    X_scaled, y_encoded, label_encoder = prepare_data(df)
    
    # Train model
    print("\nTraining XGBoost model...")
    model = train_xgboost_model(X_scaled, y_encoded)
    
    # Print class labels for reference
    print("\nClass Labels:")
    for i, label in enumerate(label_encoder.classes_):
        print(f"{i}: {label}")
    
    return model, label_encoder

if __name__ == "__main__":
    model, label_encoder = main()

Loading data...

Preparing data...

Training XGBoost model...
Cross-validation Results:
Accuracy: 0.5292 (+/- 0.0541)
F1 Score: 0.5244 (+/- 0.0584)

Class Labels:
0: Mild Depression
1: Minimal Depression
2: Moderate Depression
3: Moderately Severe Depression
4: No Depression
5: Severe Depression


In [20]:
pip install streamlit

Note: you may need to restart the kernel to use updated packages.


In [21]:
import streamlit as st
import pandas as pd