In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# NLP and ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import joblib

# Download NLTK data.
# Best effort: each resource is attempted independently, so one failure does not
# skip the others, and failures are reported instead of being silently swallowed.
# print() is used rather than warnings.warn because warnings are filtered above.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    try:
        _downloaded = nltk.download(_resource, quiet=True)
    except Exception as exc:  # e.g. no network access
        _downloaded = False
        print(f"⚠ NLTK resource '{_resource}' could not be downloaded: {exc}")
    else:
        if not _downloaded:
            print(f"⚠ NLTK resource '{_resource}' could not be downloaded")

print("✓ All libraries imported successfully!\n")

✓ All libraries imported successfully!



In [5]:
# Step 2: Load and Explore Data
# Read the raw survey CSV, then show the first three rows and the (rows, columns) shape.
df = pd.read_csv("Pyschological_data.csv")
preview = df.head(3)
print(preview)
print("Dataset Shape: {}".format(df.shape))

   Age  Gender Education_Level Employment_Status  Sleep_Hours  Anxiety_Score  \
0   56    Male      Bachelor's           Retired          6.0              4   
1   69  Female      Bachelor's           Retired          8.8             18   
2   46  Female        Master's          Employed          5.3              5   

   Depression_Score  Stress_Level  Family_History_Mental_Illness  \
0                 2             9                              0   
1                 7             6                              0   
2                13             8                              0   

   Chronic_Illnesses Substance_Use  Financial_Stress  Work_Stress  \
0                  0          none                 4            3   
1                  0          none                 1            4   
2                  0          none                 8            7   

   Self_Esteem_Score  Loneliness_Score  \
0                  7                 1   
1                  7                 6   
2  

In [6]:
# Display basic info: the column names of the loaded frame as a plain list
column_names = df.columns.tolist()
print(f"\nColumn Names:\n{column_names}")


Column Names:
['Age', 'Gender', 'Education_Level', 'Employment_Status', 'Sleep_Hours', 'Anxiety_Score', 'Depression_Score', 'Stress_Level', 'Family_History_Mental_Illness', 'Chronic_Illnesses', 'Substance_Use', 'Financial_Stress', 'Work_Stress', 'Self_Esteem_Score', 'Loneliness_Score', 'Text']


In [7]:
# Step 3: Create Target Labels (Risk Categories)
def categorize_risk(row):
    """
    Map one record to 'High Risk', 'Moderate Risk' or 'Low Risk'.

    `row` is anything indexable by column name (a DataFrame row or a dict).
    Points are accumulated from the questionnaire scores and the total is
    bucketed: 8 or more is high, 4 to 7 is moderate, anything lower is low.
    """
    # Tiered scores: the first cutoff reached (highest first) decides the points.
    tiered_rules = (
        ('Depression_Score', ((10, 3), (6, 2), (3, 1))),
        ('Anxiety_Score', ((12, 3), (7, 2), (4, 1))),
    )

    total = 0
    for column, tiers in tiered_rules:
        value = row[column]
        for cutoff, points in tiers:
            if value >= cutoff:
                total += points
                break

    # Single-threshold indicators: (condition met?, points awarded).
    indicators = (
        (row['Stress_Level'] >= 7, 2),        # high general stress
        (row['Work_Stress'] >= 7, 2),         # high work stress
        (row['Sleep_Hours'] < 6, 1),          # short sleep
        (row['Self_Esteem_Score'] <= 4, 1),   # low self-esteem
        (row['Loneliness_Score'] >= 6, 1),    # pronounced loneliness
    )
    for triggered, points in indicators:
        if triggered:
            total += points

    # Bucket the total into a label.
    if total >= 8:
        return 'High Risk'
    if total >= 4:
        return 'Moderate Risk'
    return 'Low Risk'

In [8]:
# Label every row with its risk category, then show how many rows fall in each class.
risk_labels = df.apply(categorize_risk, axis=1)
df['Risk_Category'] = risk_labels

print("Risk Category Distribution:", df['Risk_Category'].value_counts(), sep="\n")

Risk Category Distribution:
Risk_Category
Moderate Risk    1413
High Risk        1030
Low Risk          595
Name: count, dtype: int64


In [9]:
# Step 4: Text Preprocessing (NLP)
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Clean and preprocess text data.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize with NLTK, then remove English stop words and tokens
    of one or two characters.

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (None / NaN) yield an empty string; other
        non-string values are converted with ``str`` first.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""

    # Coerce first so a non-string cell (e.g. a number) does not crash .lower()
    text = str(text).lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords; the set is cached so it is not rebuilt for every row
    stop_words = _english_stop_words()
    kept = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(kept)

In [10]:
# Clean every post, then show the first 100 characters of one before/after pair.
df['Processed_Text'] = df['Text'].apply(preprocess_text)
print("✓ Text preprocessing completed!")

sample_original = df['Text'].iloc[0][:100]
sample_processed = df['Processed_Text'].iloc[0][:100]
print("\nExample:")
print(f"Original: {sample_original}...")
print(f"Processed: {sample_processed}...")

✓ Text preprocessing completed!

Example:
Original: advice please  hey everyone please help me im so convinced i have a brain tumor okay so a week ago i...
Processed: advice please hey everyone please help convinced brain tumor okay week ago noticed head pressure mai...


In [11]:
# Step 5: Feature Engineering
# Text features
# Missing posts are treated as empty text (length 0, zero words), in line with
# preprocess_text, which maps missing values to "". Without this, len() raises
# on NaN and the word count sees the literal string "nan".
_raw_text = df['Text'].fillna('').astype(str)
df['Text_Length'] = _raw_text.str.len()
df['Word_Count'] = _raw_text.str.split().str.len()

# Mental health keywords
negative_words = ['anxious', 'depressed', 'stress', 'panic', 'worry', 'fear', 'pain',
                  'exhausted', 'tired', 'overwhelmed', 'sad', 'hopeless', 'lonely']

# Counts how many distinct keywords occur in the cleaned text. The check is a
# substring test, so 'stress' also matches 'stressed', and a short keyword such
# as 'sad' matches any longer word that contains it.
df['Negative_Word_Count'] = df['Processed_Text'].apply(
    lambda x: sum(1 for word in negative_words if word in x.lower())
)

print("✓ Feature engineering completed!")
print("\nNew features created: Text_Length, Word_Count, Negative_Word_Count")

✓ Feature engineering completed!

New features created: Text_Length, Word_Count, Negative_Word_Count


In [12]:
# Step 6: Prepare Features for ML Model
# Columns fed to the model as-is (raw scores plus the engineered text features).
numerical_features = [
    'Age',
    'Sleep_Hours',
    'Anxiety_Score',
    'Depression_Score',
    'Stress_Level',
    'Financial_Stress',
    'Work_Stress',
    'Self_Esteem_Score',
    'Loneliness_Score',
    'Text_Length',
    'Word_Count',
    'Negative_Word_Count',
]

# String-valued columns that are label-encoded before use.
categorical_features = [
    'Gender',
    'Education_Level',
    'Employment_Status',
    'Substance_Use',
]

In [13]:
# Encode categorical variables: one LabelEncoder per column, kept by column name
# so the same mapping can be reapplied to new data later.
label_encoders = {name: LabelEncoder() for name in categorical_features}
encoded_names = []
for name, encoder in label_encoders.items():
    encoded_name = f"{name}_Encoded"
    df[encoded_name] = encoder.fit_transform(df[name].astype(str))
    encoded_names.append(encoded_name)

# Prepare final feature set: numerical columns first, then the encoded categoricals
feature_columns = numerical_features + encoded_names
X = df[feature_columns]
y = df['Risk_Category']

# Encode target variable (class names -> integer ids)
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

n_features = len(feature_columns)
print("✓ Features prepared!")
print(f"Total features: {n_features}")
print(f"Feature names: {feature_columns}")

✓ Features prepared!
Total features: 16
Feature names: ['Age', 'Sleep_Hours', 'Anxiety_Score', 'Depression_Score', 'Stress_Level', 'Financial_Stress', 'Work_Stress', 'Self_Esteem_Score', 'Loneliness_Score', 'Text_Length', 'Word_Count', 'Negative_Word_Count', 'Gender_Encoded', 'Education_Level_Encoded', 'Employment_Status_Encoded', 'Substance_Use_Encoded']


In [14]:
# Step 7: Split Data and Scale Features
# 80/20 split, stratified on the encoded label so both parts keep the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [15]:
# Scale features: statistics are learned from the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_name, split_frame in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set size: {split_frame.shape}")

Training set size: (2430, 16)
Test set size: (608, 16)


In [16]:
# Step 8: Train ML Model
# Random forest with a fixed seed; class_weight='balanced' reweights the classes.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
).fit(X_train_scaled, y_train)

print("✓ Model training completed!")


✓ Model training completed!


In [17]:
# Step 9: Evaluate Model
# Score the held-out split: overall accuracy, then per-class precision/recall/F1.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=target_encoder.classes_,
)

print("Accuracy: {:.2%}\n".format(accuracy))
print("Classification Report:")
print(report)

Accuracy: 95.89%

Classification Report:
               precision    recall  f1-score   support

    High Risk       0.98      0.94      0.96       206
     Low Risk       1.00      0.92      0.96       119
Moderate Risk       0.93      0.99      0.96       283

     accuracy                           0.96       608
    macro avg       0.97      0.95      0.96       608
 weighted avg       0.96      0.96      0.96       608



In [18]:
# Feature importance: pair each feature name with the forest's importance value
# and rank from most to least important.
importance_table = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
             Feature  Importance
3   Depression_Score    0.197351
2      Anxiety_Score    0.162986
7  Self_Esteem_Score    0.130603
1        Sleep_Hours    0.127292
4       Stress_Level    0.111764
6        Work_Stress    0.082344
8   Loneliness_Score    0.066755
5   Financial_Stress    0.037519
0                Age    0.021177
9        Text_Length    0.013705


In [19]:
# Save model and preprocessing objects
import os

# Define the directory
directory = "C:\\burnout_depression_project\\models"

# Ensure the directory exists (exist_ok avoids the separate check-then-create step)
os.makedirs(directory, exist_ok=True)

# Everything the prediction step needs, keyed by file name. Both the dump loop and
# the printed summary read from this mapping, so the two cannot drift apart.
artifacts = {
    'burnout_model.pkl': model,
    'scaler.pkl': scaler,
    'target_encoder.pkl': target_encoder,
    'label_encoders.pkl': label_encoders,
    'feature_columns.pkl': feature_columns,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, os.path.join(directory, filename))

print("✓ Model and encoders saved successfully!")
print("Files saved:")
for filename in artifacts:
    print(f"  - models/{filename}")

✓ Model and encoders saved successfully!
Files saved:
  - models/burnout_model.pkl
  - models/scaler.pkl
  - models/target_encoder.pkl
  - models/label_encoders.pkl
  - models/feature_columns.pkl


In [20]:
# Step 11: Create RAG Knowledge Base for Recommendations
# Static lookup table: risk category label -> {section name -> list of advice strings}.
# The top-level keys match the three labels returned by categorize_risk.
# The set of sections differs per category; section names are later turned into
# headings by replacing underscores with spaces and title-casing them.
recommendations_db = {
    # Highest-risk tier: professional help and crisis resources come first.
    # NOTE(review): the phone numbers below appear to be US services — confirm they
    # are current and appropriate for the intended audience.
    'High Risk': {
        'immediate_actions': [
            "Seek professional help immediately from a licensed therapist or psychiatrist",
            "Contact a mental health crisis hotline if you're in immediate distress",
            "Talk to your doctor about your symptoms"
        ],
        'lifestyle_changes': [
            "Prioritize 7-9 hours of sleep per night",
            "Practice daily relaxation techniques (deep breathing, meditation)",
            "Reduce caffeine and alcohol intake",
            "Establish a daily routine with regular meal times"
        ],
        'support_resources': [
            "National Suicide Prevention Lifeline: 988",
            "Crisis Text Line: Text HOME to 741741",
            "SAMHSA National Helpline: 1-800-662-4357"
        ],
        'therapeutic_approaches': [
            "Cognitive Behavioral Therapy (CBT) - highly effective for anxiety and depression",
            "Dialectical Behavior Therapy (DBT) - helps with emotional regulation",
            "Mindfulness-Based Stress Reduction (MBSR)"
        ]
    },
    # Middle tier: early professional contact plus lifestyle, self-care and workplace advice.
    'Moderate Risk': {
        'immediate_actions': [
            "Schedule an appointment with a mental health professional",
            "Talk to someone you trust about how you're feeling",
            "Consider starting therapy or counseling"
        ],
        'lifestyle_changes': [
            "Maintain a consistent sleep schedule (7-8 hours)",
            "Exercise for 30 minutes, 3-5 times per week",
            "Practice stress management techniques daily",
            "Limit screen time, especially before bed",
            "Eat balanced meals at regular times"
        ],
        'self_care_activities': [
            "Engage in hobbies you enjoy",
            "Spend time in nature or outdoors",
            "Practice journaling to process emotions",
            "Connect with friends or family regularly"
        ],
        'workplace_strategies': [
            "Set clear boundaries between work and personal time",
            "Take regular breaks during work",
            "Discuss workload concerns with your supervisor",
            "Use vacation days to rest and recharge"
        ]
    },
    # Lowest tier: maintenance and prevention only.
    'Low Risk': {
        'maintenance_tips': [
            "Continue practicing good mental health habits",
            "Maintain your sleep routine",
            "Stay physically active",
            "Keep social connections strong"
        ],
        'preventive_measures': [
            "Learn to recognize early signs of stress",
            "Build a support network",
            "Practice gratitude daily",
            "Set realistic goals and expectations"
        ],
        'wellness_activities': [
            "Try new hobbies or activities",
            "Volunteer or help others",
            "Practice mindfulness or meditation",
            "Maintain work-life balance"
        ]
    }
}

In [21]:
# Save recommendations
import json

# Serialize the lookup table as indented JSON and write it next to the model files.
serialized_recommendations = json.dumps(recommendations_db, indent=2)
with open(r'C:\burnout_depression_project\models\recommendations_db.json', 'w') as f:
    f.write(serialized_recommendations)

print("✓ RAG knowledge base created and saved!")

✓ RAG knowledge base created and saved!


In [29]:
# Step 12: Test with New User (Prediction + RAG Recommendations)
def predict_and_recommend(new_user_data, models_dir=r"C:\burnout_depression_project\models"):
    """
    Predict the risk category for one user and look up the matching recommendations.

    Parameters
    ----------
    new_user_data : dict
        One user's answers. Must contain 'Text', the four categorical columns
        ('Gender', 'Education_Level', 'Employment_Status', 'Substance_Use') and
        every raw column named in the saved feature list. The dict is not modified.
    models_dir : str, optional
        Directory holding burnout_model.pkl, scaler.pkl, target_encoder.pkl,
        label_encoders.pkl, feature_columns.pkl and recommendations_db.json.
        Defaults to the directory used by the save cells of this notebook.

    Returns
    -------
    dict
        'risk_category' (str), 'confidence' (highest class probability, in percent)
        and 'recommendations' (the entry of the recommendations file for that category).

    Raises
    ------
    ValueError
        If a categorical value was not present when the encoders were fitted.
    KeyError
        If a required key is missing from ``new_user_data``.

    Notes
    -----
    Relies on the notebook-level names ``preprocess_text`` and ``negative_words``.
    """
    import json
    import os

    # Load saved models.
    # NOTE: joblib.load unpickles arbitrary objects; only point models_dir at
    # artifacts you produced yourself.
    def _load(filename):
        return joblib.load(os.path.join(models_dir, filename))

    model = _load('burnout_model.pkl')
    scaler = _load('scaler.pkl')
    target_encoder = _load('target_encoder.pkl')
    label_encoders = _load('label_encoders.pkl')
    feature_columns = _load('feature_columns.pkl')

    with open(os.path.join(models_dir, 'recommendations_db.json'), 'r') as f:
        recommendations_db = json.load(f)

    # Work on a copy so the caller's dict is not polluted with derived fields.
    user = dict(new_user_data)

    # Preprocess text; a missing text is treated as empty, as preprocess_text does.
    raw_text = user['Text']
    raw_text = "" if pd.isna(raw_text) else str(raw_text)
    user['Processed_Text'] = preprocess_text(raw_text)
    user['Text_Length'] = len(raw_text)
    user['Word_Count'] = len(raw_text.split())
    user['Negative_Word_Count'] = sum(
        1 for word in negative_words if word in user['Processed_Text'].lower()
    )

    # Encode categorical features. Training cast these columns with astype(str),
    # so the same cast is applied here before the lookup.
    categorical_features = ['Gender', 'Education_Level', 'Employment_Status', 'Substance_Use']
    for col in categorical_features:
        encoder = label_encoders[col]
        value = str(user[col])
        known_values = list(encoder.classes_)
        if value not in known_values:
            raise ValueError(
                f"Unknown value {value!r} for '{col}'; expected one of {known_values}"
            )
        user[col + '_Encoded'] = encoder.transform([value])[0]

    # Prepare features as a one-row frame with the training column names and order;
    # the scaler was fitted on a DataFrame, so this keeps the names consistent.
    features = pd.DataFrame(
        [[user[col] for col in feature_columns]],
        columns=feature_columns,
    )
    features_scaled = scaler.transform(features)

    # Predict
    prediction = model.predict(features_scaled)[0]
    risk_category = str(target_encoder.inverse_transform([prediction])[0])
    probabilities = model.predict_proba(features_scaled)[0]

    # Get recommendations from RAG
    recommendations = recommendations_db[risk_category]

    return {
        'risk_category': risk_category,
        'confidence': float(max(probabilities)) * 100,
        'recommendations': recommendations
    }

In [31]:
# Example new user
new_user = {
    'Age': 28,
    'Gender': 'Female',
    'Education_Level': 'Bachelor\'s',
    'Employment_Status': 'Employed',
    'Sleep_Hours': 5.0,
    'Anxiety_Score': 16,
    'Depression_Score': 12,
    'Stress_Level': 8,
    'Family_History_Mental_Illness': 1,
    'Chronic_Illnesses': 0,
    'Substance_Use': 'none',
    'Financial_Stress': 7,
    'Work_Stress': 9,
    'Self_Esteem_Score': 3,
    'Loneliness_Score': 8,
    'Text': 'I feel extremely overwhelmed at work constant anxiety and panic attacks cant sleep properly feeling hopeless and exhausted all the time'
}

# Run the prediction here so `result` exists on a fresh kernel; without this
# assignment the prints below raise NameError under Restart & Run All.
result = predict_and_recommend(new_user)

print(f"\n{'='*60}")
print("PREDICTION RESULTS FOR NEW USER")
print(f"{'='*60}\n")
print(f"Risk Category: {result['risk_category']}")
print(f"Confidence: {result['confidence']:.1f}%\n")



PREDICTION RESULTS FOR NEW USER

Risk Category: High Risk
Confidence: 91.6%



In [32]:
# Print each recommendation section as a title-cased heading followed by a numbered list.
divider = '=' * 60
print("PERSONALIZED RECOMMENDATIONS (RAG-BASED)")
print(f"{divider}\n")

for section_key, advice_list in result['recommendations'].items():
    heading = section_key.replace('_', ' ').title()
    print(f"\n{heading}:")
    for position, advice in enumerate(advice_list, start=1):
        print(f"  {position}. {advice}")

PERSONALIZED RECOMMENDATIONS (RAG-BASED)


Immediate Actions:
  1. Seek professional help immediately from a licensed therapist or psychiatrist
  2. Contact a mental health crisis hotline if you're in immediate distress
  3. Talk to your doctor about your symptoms

Lifestyle Changes:
  1. Prioritize 7-9 hours of sleep per night
  2. Practice daily relaxation techniques (deep breathing, meditation)
  3. Reduce caffeine and alcohol intake
  4. Establish a daily routine with regular meal times

Support Resources:
  1. National Suicide Prevention Lifeline: 988
  2. Crisis Text Line: Text HOME to 741741
  3. SAMHSA National Helpline: 1-800-662-4357

Therapeutic Approaches:
  1. Cognitive Behavioral Therapy (CBT) - highly effective for anxiety and depression
  2. Dialectical Behavior Therapy (DBT) - helps with emotional regulation
  3. Mindfulness-Based Stress Reduction (MBSR)
