In [35]:
# %% [markdown]
# # Enhanced GMM Clustering with Balanced Components
# **Objective**: Identify meaningful student groups with balanced cluster sizes

# %% [markdown]
# ## 1. Environment Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.mixture import BayesianGaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import joblib
from sklearn.feature_selection import mutual_info_classif



In [36]:
# Set visualization style
plt.style.use('seaborn-v0_8')
sns.set_palette('viridis')
%matplotlib inline



In [37]:
# %% [markdown]
# ## 2. Data Loading & Preparation
# Load preprocessed data
df = pd.read_csv("../data/processed/preprocessed_students.csv")

# Select relevant features
features = ['Quiz_Scores', 'Time_Spent_on_Videos', 
            'Forum_Participation', 'Assignment_Completion_Rate',
            'Engagement_Level', 'Final_Exam_Score']
X = df[features]

# Add engineered features
df['Efficiency'] = df['Quiz_Scores'] / (df['Time_Spent_on_Videos'] + 1)
df['Consistency'] = df['Quiz_Scores'].rolling(3, min_periods=1).std().fillna(0)
features += ['Efficiency', 'Consistency']
X = df[features]



KeyError: "['Engagement_Level'] not in index"

In [None]:
# %% [markdown]
# ## 3. Feature Scaling & Selection
# Robust scaling
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Feature importance
mi_scores = mutual_info_classif(X_scaled, df['Dropout_Likelihood'])
pd.Series(mi_scores, index=features).sort_values().plot.barh()
plt.title('Feature Importance Scores')
plt.show()



In [None]:
# %% [markdown]
# ## 4. Optimized GMM Clustering
# Initialize Bayesian GMM
bgmm = BayesianGaussianMixture(
    n_components=4,
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=0.01,  # Controls cluster balance
    covariance_type='full',
    random_state=42,
    n_init=3,
    max_iter=500
)

# Fit model
clusters = bgmm.fit_predict(X_scaled)

# Evaluate
score = silhouette_score(X_scaled, clusters)
ch_score = calinski_harabasz_score(X_scaled, clusters)

print(f"Silhouette Score: {score:.3f}")
print(f"Calinski-Harabasz Score: {ch_score:.0f}")
print("\nCluster Distribution:")
print(pd.Series(clusters).value_counts().sort_index())



In [None]:
# %% [markdown]
# ## 5. Cluster Analysis
df['Cluster'] = clusters

# Visualize component weights
plt.figure(figsize=(10,4))
plt.subplot(121)
plt.bar(range(bgmm.n_components), bgmm.weights_)
plt.title('Cluster Component Weights')
plt.xlabel('Cluster')
plt.ylabel('Weight')

# Visualize cluster means
cluster_means = df.groupby('Cluster')[features].mean()
plt.subplot(122)
sns.heatmap(cluster_means.T, annot=True, cmap='Blues')
plt.title('Normalized Feature Means by Cluster')
plt.tight_layout()
plt.show()



In [None]:
# %% [markdown]
# ## 6. Dimensionality Reduction & Visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(12,6))
sns.scatterplot(
    x=X_pca[:,0], y=X_pca[:,1],
    hue=df['Cluster'],
    palette='viridis',
    alpha=0.7,
    s=50,
    style=df['Cluster'],
    legend='full'
)

# Add cluster centers
centers_pca = pca.transform(bgmm.means_)
plt.scatter(centers_pca[:,0], centers_pca[:,1],
            c='red', marker='X', s=200, label='Centers')

plt.title(f'Student Clusters\nSilhouette: {score:.2f}, Calinski-Harabasz: {ch_score:.0f}')
plt.xlabel('PC1 ({:.1f}% variance)'.format(pca.explained_variance_ratio_[0]*100))
plt.ylabel('PC2 ({:.1f}% variance)'.format(pca.explained_variance_ratio_[1]*100))
plt.legend(bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.savefig("../reports/figures/balanced_clusters.png", dpi=300)
plt.show()



In [None]:
# %% [markdown]
# ## 7. Results Export
# Save clustered data
df.to_csv("../data/processed/balanced_student_clusters.csv", index=False)

# Save model pipeline
joblib.dump({
    'scaler': scaler,
    'model': bgmm,
    'features': features
}, "../models/balanced_gmm_pipeline.pkl")

print("\n=== Results Saved ===")
print("Clustered data: data/processed/balanced_student_clusters.csv")
print("Model pipeline: models/balanced_gmm_pipeline.pkl")
print("Visualization: reports/figures/balanced_clusters.png")