Simulate Labels (since synthetic data lacks ground truth):

In [2]:
# Build proxy (simulated) binary labels from the raw columns of `df`.
# NOTE: `df` must already be loaded by an earlier cell; the saved output of
# this cell is a NameError because it was run without `df` in scope.

# Academic Success: 1 if Academic_Performance is strictly above its 75th percentile, else 0
academic_cutoff = df['Academic_Performance'].quantile(0.75)
df['Academic_Success'] = (df['Academic_Performance'] > academic_cutoff).astype(int)

# Wellbeing Decline: 1 if k6_overall is strictly above its 75th percentile (higher distress)
distress_cutoff = df['k6_overall'].quantile(0.75)
df['Wellbeing_Decline'] = (df['k6_overall'] > distress_cutoff).astype(int)

# Peer Collaboration: 1 if the number of listed friends is strictly above the median.
# `Friends` is parsed as a ', '-separated string. Missing values (NaN/None) and
# blank strings count as 0 friends: NaN is truthy, so a plain `if x` check would
# stringify it to 'nan' and count it as one friend.
friends_text = df['Friends'].fillna('').astype(str).str.strip()
df['Friends_Count'] = (
    friends_text.str.count(', ')          # number of separators ...
    .add(1)                               # ... plus one gives the number of names
    .where(friends_text.ne(''), 0)        # empty / missing entries have no friends
    .astype(int)
)
friends_median = df['Friends_Count'].median()
df['Positive_Peer_Collab'] = (df['Friends_Count'] > friends_median).astype(int)

NameError: name 'df' is not defined

Train Models:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, train_test_split
from xgboost import XGBClassifier

MODEL_SEED = 42   # same seed value the clustering cells use
CV_FOLDS = 5      # folds used to produce out-of-fold probabilities

# Features (adjust based on your dataset)
features = [
    'Academic_Performance', 'isolated', 'WomenDifferent', 'language',
    'pwi_wellbeing', 'GrowthMindset', 'k6_overall', 'Manbox5_overall',
    'Masculinity_contrained', 'School_support_engage6', 'School_support_engage'
]

# The proxy labels are thresholds on single raw columns
# (Academic_Success <- Academic_Performance, Wellbeing_Decline <- k6_overall).
# Leaving that column in the inputs lets the model read the label straight off
# the feature, so each model gets the feature list minus its label's source.
academic_features = [col for col in features if col != 'Academic_Performance']
wellbeing_features = [col for col in features if col != 'k6_overall']

# Design matrices and targets
X = df[features]                      # full feature set (used by the peer model)
X_academic = df[academic_features]
X_wellbeing = df[wellbeing_features]
y_academic = df['Academic_Success']
y_wellbeing = df['Wellbeing_Decline']
y_peer = df['Positive_Peer_Collab']

# Train models on all rows so fitted estimators are available for later use.
# NOTE: academic_model / wellbeing_model expect X_academic / X_wellbeing
# (not X) at prediction time.
academic_model = XGBClassifier(random_state=MODEL_SEED).fit(X_academic, y_academic)
wellbeing_model = RandomForestClassifier(random_state=MODEL_SEED).fit(X_wellbeing, y_wellbeing)
peer_model = XGBClassifier(random_state=MODEL_SEED).fit(X, y_peer)

# Predict probabilities out-of-fold: every row is scored by a model that did
# not see it during training, instead of scoring the training rows themselves.
# predict_proba columns follow the sorted class labels, i.e. [P(0), P(1)].
academic_proba = cross_val_predict(
    XGBClassifier(random_state=MODEL_SEED), X_academic, y_academic,
    cv=CV_FOLDS, method='predict_proba',
)
wellbeing_proba = cross_val_predict(
    RandomForestClassifier(random_state=MODEL_SEED), X_wellbeing, y_wellbeing,
    cv=CV_FOLDS, method='predict_proba',
)
peer_proba = cross_val_predict(
    XGBClassifier(random_state=MODEL_SEED), X, y_peer,
    cv=CV_FOLDS, method='predict_proba',
)

df['Academic_Risk'] = academic_proba[:, 0]    # P(not succeeding)
df['Wellbeing_Risk'] = wellbeing_proba[:, 1]  # P(decline)
df['Peer_Score'] = peer_proba[:, 1]           # P(positive collaboration)

# Clustering-Based Allocation Engine
Group students into classes of 30 using their features and predicted risks/scores.

Step 1: Feature Engineering

In [None]:
# Network feature: the friend count stands in for degree centrality
# (simplified example -- no graph is built here).
df['Degree_Centrality'] = df['Friends_Count']

# Clustering inputs = raw model features + the three predicted scores
# + the network feature. A new list is built, so `features` is left untouched.
cluster_features = [
    *features,
    'Academic_Risk',
    'Wellbeing_Risk',
    'Peer_Score',
    'Degree_Centrality',
]

Step 2: K-Means Clustering


In [None]:
import math

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

CLASS_SIZE = 30  # target number of students per class

# Scale features so every column contributes on a comparable scale
scaler = StandardScaler()
X_cluster = scaler.fit_transform(df[cluster_features])

# One cluster per class, rounded UP so leftover students still get a class
# (e.g. 10,000 students / 30 -> 334 classes; floor division would give 333,
# and 0 for fewer than CLASS_SIZE students).
# NOTE(review): KMeans does not cap cluster sizes, so the resulting classes
# are not guaranteed to contain exactly CLASS_SIZE students.
n_clusters = math.ceil(len(df) / CLASS_SIZE)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Class'] = kmeans.fit_predict(X_cluster)

Step 3: Spectral Clustering (Refinement)


In [None]:
from sklearn.cluster import SpectralClustering

# Second clustering pass over the same scaled matrix, using a
# nearest-neighbours affinity and the same number of clusters as the
# K-Means step. It is fitted on X_cluster from scratch: the K-Means labels
# in df['Class'] are not an input here.
spectral = SpectralClustering(
    affinity='nearest_neighbors',
    n_clusters=n_clusters,
    random_state=42,
)
spectral.fit(X_cluster)
df['Class_Refined'] = spectral.labels_