In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib # For saving the model

In [None]:
# Option 1: Load processed data if saved
# processed_df = pd.read_csv('../data/processed/processed_student_mental_health.csv')
# X_encoded = processed_df.drop('Depression_Target', axis=1)
# y = processed_df['Depression_Target']

# Option 2: Re-run the minimal preprocessing here so this notebook works on a
# fresh kernel, without relying on variables left over from a previous notebook.
# In a real project, you'd either load from file or have preprocessing functions.

# Raw string: in a normal string literal the "\r" in "\raw" is a carriage
# return (and "\m", "\d", "\S" are invalid escapes), so the path never resolves.
# NOTE(review): this is a machine-specific absolute path; consider a path
# relative to the project (e.g. '../data/raw/...') once the layout is confirmed.
file_path = r'D:\mental-health-ai\data\raw\Student Mental health.csv'
df = pd.read_csv(file_path)

# Handle Age NaN: impute missing ages with the median age.
# Assign the result back instead of calling fillna(inplace=True) on the
# df['Age'] selection; the chained in-place form is deprecated and does not
# update df when pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Target variable: 'Yes' -> 1, 'No' -> 0.
# Any answer other than exactly 'Yes' or 'No' is mapped to NaN by Series.map.
df['Depression_Target'] = df['Do you have Depression?'].map({'Yes': 1, 'No': 0})
y = df['Depression_Target']

# CGPA mapping
def map_cgpa(cgpa_range):
    """Translate a CGPA range label into one representative number.

    Parameters
    ----------
    cgpa_range : object
        A survey answer such as '3.00 - 3.49'. Matching is by substring, so
        surrounding whitespace or extra text around the label is tolerated.

    Returns
    -------
    float
        The value assigned to the first matching band (roughly the band's
        midpoint), or NaN when the input is not a string or matches no band.
    """
    # Anything that is not text (NaN, None, numbers) cannot be matched.
    if not isinstance(cgpa_range, str):
        return np.nan

    # Checked top to bottom; the first label found inside the text wins.
    bands = (
        ('3.50 - 4.00', 3.75),
        ('3.00 - 3.49', 3.25),
        ('2.50 - 2.99', 2.75),
        ('2.00 - 2.49', 2.25),
        ('0 - 1.99', 1.00),
    )
    for label, representative in bands:
        if label in cgpa_range:
            return representative
    return np.nan
# Convert each CGPA range label to a number, then impute labels that could not
# be mapped (NaN) with the median of the mapped values.
df['CGPA_numeric'] = df['What is your CGPA?'].apply(map_cgpa)
# Assign back rather than fillna(inplace=True) on the column selection: the
# chained in-place form is deprecated and leaves df unchanged under
# pandas Copy-on-Write.
df['CGPA_numeric'] = df['CGPA_numeric'].fillna(df['CGPA_numeric'].median())

# Features
feature_cols = [
    'Choose your gender', 'Age', 'Your current year of study',
    'CGPA_numeric', 'Do you have Anxiety?', 'Do you have Panic attack?',
    'Did you seek any specialist for a mental condition?'
]
X = df[feature_cols].copy()

# Make sure Age is numeric BEFORE encoding. If Age were read as text, it would
# be picked up as a categorical column below and one-hot encoded, so coercing
# it afterwards would be too late. The median is taken from the coerced column,
# because the median of a text column raises instead of returning a number.
X['Age'] = pd.to_numeric(X['Age'], errors='coerce')
X['Age'] = X['Age'].fillna(X['Age'].median())

# One-hot encode the text/categorical answers. drop_first=True drops the first
# level of each encoded column.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Quick sanity check of the resulting feature matrix and target.
print("Features (X_encoded) head:\n", X_encoded.head())
print("\nTarget (y) head:\n", y.head())
print("\nShape of X_encoded:", X_encoded.shape)
print("Shape of y:", y.shape)

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/student_mental_health.csv'