1. Installing the required libraries

In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn plotly streamlit joblib xgboost

Collecting plotly
  Downloading plotly-6.3.0-py3-none-any.whl.metadata (8.5 kB)
Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading plotly-6.3.0-py3-none-any.whl (9.8 MB)
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.8 MB ? eta -:--:--
   -- ------------------------------------- 0.5/9.8 MB 1.1 MB/s eta 0:00:09
   --- ------------------------------------ 0.8/9.8 MB 1.3 MB/s eta 0:00:07
   ------- -------------------------------- 1.8/9.8 MB 2.3 MB/s eta 0:00:04
   ------------ --------------------------- 3.1/9.8 MB 3.2 MB/s eta 0:00:03
   ------------------ --------------------- 4.5/9.8 MB 3.7 MB/s eta 0:00:02
   ----------------------- ---------------- 5.8/9.8 MB 4


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


2. Creating a simulated dataset

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Fix the global NumPy seed so the random draws below repeat on every run.
# NOTE: survey dates are anchored to datetime.now(), so they still shift with
# the wall-clock time at which this cell is executed.
np.random.seed(42)

# Cohort size
n_students = 1000

# Student IDs run sequentially from STD10000
student_ids = [f'STD{10000 + offset}' for offset in range(n_students)]

# School IDs: each student is assigned a school number drawn from 1..640
school_numbers = np.random.randint(1, 641, n_students)
school_ids = [f'SCH{1000 + number}' for number in school_numbers]

# Regions (8 regions in Kenya), drawn uniformly one student at a time
regions = [
    'Nairobi', 'Central', 'Coast', 'Eastern',
    'Nyanza', 'Rift Valley', 'Western', 'North Eastern',
]
student_regions = []
for _ in range(n_students):
    student_regions.append(np.random.choice(regions))

# Genders, drawn one student at a time with fixed category weights
genders = ['Male', 'Female', 'Other']
gender_weights = [0.48, 0.48, 0.04]
student_genders = []
for _ in range(n_students):
    student_genders.append(np.random.choice(genders, p=gender_weights))

# Ages are integers from 13 to 19 inclusive (upper bound of randint is exclusive)
student_ages = np.random.randint(13, 20, n_students)

# Survey dates fall within the 730 days preceding the moment this cell runs
survey_window_days = 730
start_date = datetime.now() - timedelta(days=survey_window_days)
survey_dates = []
for _ in range(n_students):
    days_after_start = np.random.randint(0, survey_window_days)
    survey_dates.append(start_date + timedelta(days=days_after_start))

# Well-being metrics: normal draws limited to the 1-10 range
growth_mindset = np.clip(np.random.normal(6.5, 1.5, n_students), 1, 10)
self_esteem = np.clip(np.random.normal(6.0, 1.8, n_students), 1, 10)
stress_levels = np.clip(np.random.normal(5.5, 2.0, n_students), 1, 10)
social_support = np.clip(np.random.normal(7.0, 1.5, n_students), 1, 10)
academic_pressure = np.clip(np.random.normal(6.5, 1.7, n_students), 1, 10)

# Intervention flag: 1 with probability 0.7, otherwise 0
intervention_status = np.random.choice([0, 1], size=n_students, p=[0.3, 0.7])

# Resilience score (target variable): protective factors add, stressors
# subtract, plus unit-variance Gaussian noise
resilience_noise = np.random.normal(0, 1, n_students)
raw_resilience = (
    0.3 * growth_mindset
    + 0.4 * self_esteem
    + 0.2 * social_support
    - 0.3 * stress_levels
    - 0.2 * academic_pressure
    + resilience_noise
)

# Min-max rescale to 0-10, then raise anything below 1 up to 1
lowest_raw = raw_resilience.min()
highest_raw = raw_resilience.max()
resilience_score = (
    (raw_resilience - lowest_raw) / (highest_raw - lowest_raw) * 10
).clip(1, 10)

# Assemble the table; continuous metrics are stored to one decimal place
column_values = {
    'student_id': student_ids,
    'school_id': school_ids,
    'region': student_regions,
    'gender': student_genders,
    'age': student_ages,
    'survey_date': survey_dates,
    'growth_mindset': growth_mindset.round(1),
    'self_esteem': self_esteem.round(1),
    'stress_levels': stress_levels.round(1),
    'social_support': social_support.round(1),
    'academic_pressure': academic_pressure.round(1),
    'intervention_status': intervention_status,
    'resilience_score': resilience_score.round(1),
}
data = pd.DataFrame(column_values)

# Risk classification (1 = high risk, 0 = low risk): rounded resilience below 5
data['risk_classification'] = data['resilience_score'].lt(5).astype(int)

# Save to CSV without the index column
data.to_csv('student_wellbeing_data.csv', index=False)

3. Data Analysis and Model Building

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib

# Load the saved survey data and convert survey_date from text to datetime
df = pd.read_csv('student_wellbeing_data.csv').assign(
    survey_date=lambda frame: pd.to_datetime(frame['survey_date'])
)

# Exploratory Data Analysis
def perform_eda(df):
    """Print a quick overview of ``df`` and save two diagnostic figures.

    Prints the first rows and the ``describe()`` summary, then writes
    ``metric_distributions.png`` (one histogram with KDE per metric in a
    2x3 grid) and ``correlation_matrix.png`` (annotated heatmap of the
    pairwise correlations between the metrics) to the working directory.

    Args:
        df: DataFrame holding the five well-being metric columns and
            ``resilience_score``.

    Returns:
        None.
    """
    metrics = [
        'growth_mindset', 'self_esteem', 'stress_levels',
        'social_support', 'academic_pressure', 'resilience_score',
    ]

    print("Dataset Overview:")
    print(df.head())
    print("\nSummary Statistics:")
    print(df.describe())

    # One histogram panel per metric, laid out on a 2x3 grid
    dist_fig = plt.figure(figsize=(15, 10))
    for panel_index, metric in enumerate(metrics, start=1):
        panel = dist_fig.add_subplot(2, 3, panel_index)
        sns.histplot(df[metric], kde=True, ax=panel)
        pretty_name = metric.replace("_", " ").title()
        panel.set_title(f'Distribution of {pretty_name}')
    dist_fig.tight_layout()
    dist_fig.savefig('metric_distributions.png')
    # Close the figure so it is only written to disk, not kept open
    plt.close(dist_fig)

    # Heatmap of pairwise correlations between the same metrics
    corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
    correlations = df[metrics].corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
    corr_ax.set_title('Correlation Matrix of Well-being Metrics')
    corr_fig.savefig('correlation_matrix.png')
    plt.close(corr_fig)

# Perform EDA: prints the overview tables and writes metric_distributions.png
# and correlation_matrix.png to the working directory
perform_eda(df)

# Prepare data for modeling
def prepare_data(df):
    """Build the feature matrix and target vector for the risk classifier.

    The input frame is not modified. Earlier versions added ``year`` and
    ``month`` columns to the caller's DataFrame even though neither was
    used as a feature; that side effect has been removed.

    Args:
        df: DataFrame containing the five well-being metric columns,
            ``age``, ``gender`` and ``risk_classification``.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the numeric features plus one
        indicator column per distinct ``gender`` value, and ``y`` is the
        ``risk_classification`` column.

    Raises:
        KeyError: if any required column is missing from ``df``.
    """
    # Select features and target
    features = ['growth_mindset', 'self_esteem', 'stress_levels',
                'social_support', 'academic_pressure', 'age', 'gender']
    target = 'risk_classification'

    # One-hot encode categorical variables; get_dummies returns a new frame,
    # so the caller's DataFrame is left untouched
    X = pd.get_dummies(df[features], columns=['gender'])
    y = df[target]

    return X, y

X, y = prepare_data(df)

# Split data: hold out 20% of the rows for testing; the fixed random_state
# makes the split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train model
def build_model(X_train, y_train):
    """Fit and return a two-step scaler + XGBoost classification pipeline.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``Pipeline`` with steps named ``'scaler'`` and
        ``'classifier'``.
    """
    # Hyperparameters for the gradient-boosted classifier
    booster_settings = dict(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )

    scaling_step = ('scaler', StandardScaler())
    classifier_step = ('classifier', xgb.XGBClassifier(**booster_settings))
    pipeline = Pipeline(steps=[scaling_step, classifier_step])

    pipeline.fit(X_train, y_train)
    return pipeline

# Fit the scaler + classifier pipeline on the training split
model = build_model(X_train, y_train)

# Evaluate model
def evaluate_model(model, X_test, y_test):
    """Print test-set metrics and save a feature-importance bar chart.

    Prints the classification report, confusion matrix and accuracy for the
    predictions on ``X_test``, then writes ``feature_importance.png`` to the
    working directory.

    Args:
        model: Fitted pipeline exposing ``predict`` and a step named
            ``'classifier'`` that has ``feature_importances_``.
        X_test: DataFrame of held-out features; its column names label the
            importance chart.
        y_test: True labels for ``X_test``.

    Returns:
        None.
    """
    y_pred = model.predict(X_test)

    print("Model Evaluation:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nAccuracy:", accuracy_score(y_test, y_pred))

    # Feature importance
    plt.figure(figsize=(10, 6))
    feature_importances = model.named_steps['classifier'].feature_importances_
    # Take the feature names from the X_test argument rather than the
    # notebook-global X_train, so the function depends only on its parameters
    features = X_test.columns
    importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values('Importance', ascending=False)

    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

# Report held-out performance and write feature_importance.png
evaluate_model(model, X_test, y_test)

# Persist the trained pipeline
import joblib

# Bundle the fitted pipeline with the list of feature columns it was trained on
model_data = dict(
    model=model,
    expected_columns=list(X_train.columns),
)

# Save the bundle; the list of written filenames is the cell's output
joblib.dump(model_data, 'wellbeing_model.pkl')

Dataset Overview:
  student_id school_id       region  gender  age                survey_date  \
0   STD10000   SCH1103      Central    Male   14 2024-05-23 06:00:21.305069   
1   STD10001   SCH1436        Coast  Female   19 2024-05-26 06:00:21.305069   
2   STD10002   SCH1271  Rift Valley  Female   16 2023-09-26 06:00:21.305069   
3   STD10003   SCH1107      Central    Male   14 2024-02-22 06:00:21.305069   
4   STD10004   SCH1072      Western    Male   17 2024-01-28 06:00:21.305069   

   growth_mindset  self_esteem  stress_levels  social_support  \
0             5.2          5.3            3.6             5.6   
1             7.9          3.6            6.5             7.6   
2             6.8          6.2            5.6             8.9   
3             6.2          4.7            9.1            10.0   
4             2.6          6.8            3.4             7.5   

   academic_pressure  intervention_status  resilience_score  \
0                8.6                    1            

['wellbeing_model.pkl']