# Explainable AI (XAI) Analysis with SHAP

This notebook demonstrates how to use SHAP to interpret machine learning predictions for student mental health risk. It provides global and individual explanations to support counselors and students.

## 1. Import Required Libraries
Import pandas, numpy, scikit-learn, SHAP, and visualization libraries.

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

## 2. Load and Prepare Survey Data
Load the student survey dataset, handle missing values, encode categorical variables, and separate the feature columns from the target (`risk_level`, falling back to `stress_level` when that column is absent).

In [None]:
# Load the survey data
# Replace 'student_mental_health.csv' with your actual file name
survey_raw = pd.read_csv('student_mental_health.csv')

# Preview the data. Only the last expression of a cell is rendered
# automatically, so a mid-cell preview has to go through display().
display(survey_raw.head())

# Handle missing values (simple example: drop rows with missing).
# The raw frame is kept under its own name so this cell can be re-run safely.
survey_df = survey_raw.dropna()
if survey_df.empty:
    # dropna() removes a row if ANY column is missing, which can wipe out the
    # whole dataset; fail here with a clear message instead of inside the model.
    raise ValueError('No rows left after dropping missing values; '
                     'consider imputing instead of dropping.')

# Define features and target.
# Target: prefer 'risk_level', fall back to 'stress_level'.
# Both candidate target columns are always excluded from the features.
target_column = 'risk_level' if 'risk_level' in survey_df.columns else 'stress_level'
feature_columns = [col for col in survey_df.columns if col not in ['stress_level', 'risk_level']]
X = survey_df[feature_columns].copy()  # copy so the encoding below leaves survey_df untouched
y = survey_df[target_column]

# Encode categorical variables: every feature column that is not numeric or
# boolean is replaced by integer category codes so tree models can consume it.
# NOTE(review): the codes impose an arbitrary (sorted-label) order; switch to
# pd.get_dummies if a column is nominal and that order would mislead the model.
categorical_columns = X.select_dtypes(exclude=['number', 'bool']).columns
for col in categorical_columns:
    X[col] = X[col].astype('category').cat.codes

In [None]:
## 3. Train Model and Compute SHAP Values
# All imports used here (train_test_split, XGBClassifier, LabelEncoder, shap,
# plt, pd) live in the notebook's import cell at the top.

# XGBoost expects class labels to be the consecutive integers 0..n_classes-1.
# LabelEncoder maps any label set (strings, 1-based ints, ...) onto that range
# and is the identity when the labels already are 0..n_classes-1.
label_encoder = LabelEncoder()
y_encoded = pd.Series(label_encoder.fit_transform(y), index=y.index, name=y.name)

# Hold out 20% of the rows as a test set; the model is fit on the other 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Train XGBoost model.
# - `use_label_encoder` is deprecated and ignored by current XGBoost releases.
# - `eval_metric` is left unset so XGBoost picks the default that matches the
#   objective (binary vs. multiclass) instead of hard-coding a binary metric.
model = XGBClassifier()
model.fit(X_train, y_train)

# Initialize SHAP explainer (X_train doubles as the background dataset)
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_train)

# Display global feature importance.
# For a multiclass model the explanation has one slice per class
# (rows x features x classes). summary_plot would read such a 3-D array as
# interaction values, so each class gets its own plot.
if shap_values.values.ndim == 3:
    for class_index in range(shap_values.values.shape[2]):
        class_label = label_encoder.classes_[class_index]
        shap.summary_plot(shap_values[:, :, class_index], X_train, show=False)
        plt.title(f'SHAP summary for class: {class_label}')
        plt.show()
else:
    shap.summary_plot(shap_values, X_train, show=False)
    plt.show()

In [None]:
# Check the class distribution of the target in the cleaned survey data
# (all rows of survey_df, i.e. before the train/test split).

# Use the same target rule as the data-preparation cell: 'risk_level' when it
# exists, otherwise 'stress_level'. Hard-coding 'risk_level' would raise a
# KeyError for datasets that only provide 'stress_level'.
target_column = 'risk_level' if 'risk_level' in survey_df.columns else 'stress_level'
class_counts = survey_df[target_column].value_counts()

# NOTE: value_counts() orders classes by descending count, so the colours are
# assigned by bar position (most frequent class first), not by class meaning.
fig, ax = plt.subplots(figsize=(6, 4))
class_counts.plot(kind='bar', color=['#22c55e', '#facc15', '#ef4444'], rot=0, ax=ax)
ax.set_title(f'Class Distribution: {target_column}')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

# Optional: Print counts
print(class_counts)