In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

: 

In [None]:
# Read the preprocessed student table from disk, report its (rows, columns)
# shape, and end the cell with a preview of the first rows.
df = pd.read_csv(
    "../data/processed/preprocessed_students.csv",
)
print(f"Data shape: {df.shape}")
df.head(n=5)

: 

In [None]:
# Build the feature matrix used for PCA and clustering: every float64/int64
# column of df.
# 'Cluster' is removed before the dtype selection because a later cell writes
# the (int64) cluster labels back onto df; without this guard, re-running this
# cell would feed the previous labels into the model as a feature.
# NOTE(review): any integer identifier column in the CSV would also be picked
# up here - confirm the preprocessed file contains features only.
feature_frame = df.drop(columns='Cluster', errors='ignore')
numerical_cols = feature_frame.select_dtypes(include=['float64', 'int64']).columns
X = df[numerical_cols]
print("Selected features:", list(numerical_cols))

In [None]:
# Two-component PCA of the feature matrix. The fitted `pca` object and the
# projected coordinates `X_pca` are both kept for the plotting cells below.
pca = PCA(
    random_state=42,
    n_components=2,
)
X_pca = pca.fit_transform(X)

In [None]:
# Bar chart of the fraction of total variance captured by each principal
# component of the fitted PCA.
variance_ratio = pca.explained_variance_ratio_
# Bar positions come from the fitted PCA rather than a hard-coded range(1, 3),
# so the chart stays correct if n_components is changed in the PCA cell.
component_ids = list(range(1, len(variance_ratio) + 1))

plt.figure(figsize=(8,4))
plt.bar(component_ids, variance_ratio)
plt.xticks(component_ids)  # one integer tick per component
plt.title('PCA Explained Variance')
plt.xlabel('Principal Components')
plt.ylabel('Variance Ratio')
plt.show()

In [None]:
# Model selection: fit one full-covariance Gaussian mixture per candidate
# component count and keep the fit whose hard assignments achieve the highest
# silhouette score on X. Defines best_gmm, best_n and best_score, and writes
# the winning labels to df['Cluster'].
n_components_range = range(2, 6)
best_score = -np.inf  # sentinel below any attainable silhouette value
best_gmm = None
best_n = None

for n in n_components_range:
    # Fit GMM (fixed seed so the initialisation is reproducible)
    gmm = GaussianMixture(
        n_components=n,
        covariance_type='full',
        random_state=42
    )
    clusters = gmm.fit_predict(X)

    # Hard assignments may leave components empty. silhouette_score raises
    # ValueError unless 2 <= n_labels <= n_samples - 1, so such fits are
    # skipped instead of aborting the whole search.
    n_labels = np.unique(clusters).size
    if not 2 <= n_labels <= len(clusters) - 1:
        print(f"Components={n}: skipped ({n_labels} non-empty cluster(s))")
        continue

    # Evaluate
    score = silhouette_score(X, clusters)
    print(f"Components={n}: Silhouette={score:.3f}")

    if score > best_score:
        best_score = score
        best_gmm = gmm
        best_n = n

# Fail with an explicit message rather than an AttributeError on None below.
if best_gmm is None:
    raise ValueError(
        "No candidate GMM produced a clustering that could be scored; "
        "check the feature matrix X."
    )

# Final clustering
df['Cluster'] = best_gmm.predict(X)
print(f"\nOptimal components: {best_n} (Silhouette={best_score:.3f})")

In [None]:
# Number of students assigned to each cluster, ordered by cluster label
cluster_dist = df['Cluster'].value_counts().sort_index()
print("Cluster distribution:", cluster_dist, sep="\n")

# Per-cluster average of every clustering feature, shown as a shaded table
students_by_cluster = df.groupby('Cluster')
cluster_means = students_by_cluster[numerical_cols].mean()
print("\nCluster characteristics:")
cluster_means.style.background_gradient(cmap='Blues')

In [None]:
# ## 6. Visualization
# Scatter plot of the 2-D PCA projection, coloured by GMM cluster, with the
# mixture means overlaid; the figure is also written to disk.
figure_path = Path("../reports/figures/gmm_clusters.png")
# Create the output directory up front so savefig does not fail on a fresh
# checkout where it does not exist yet.
figure_path.parent.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10,6))
sns.scatterplot(
    x=X_pca[:,0], y=X_pca[:,1],
    hue=df['Cluster'],
    palette='viridis',
    alpha=0.7,
    s=50,
    edgecolor='w',
    linewidth=0.5
)

# Add cluster centers
# The PCA was fitted on a DataFrame, so the component means (a plain ndarray)
# are wrapped with the same column names before projecting; this avoids
# sklearn's "X does not have valid feature names" warning.
centers_frame = pd.DataFrame(best_gmm.means_, columns=X.columns)
centers_pca = pca.transform(centers_frame)
plt.scatter(
    centers_pca[:,0], centers_pca[:,1],
    c='red', marker='X', s=200,
    label='Cluster Centers'
)

plt.title(f'Student Clusters (GMM)\nSilhouette Score: {best_score:.2f}')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Persist the clustered student table and the selected mixture model.
clustered_path = Path("../data/processed/students_clustered.csv")
model_path = Path("../models/gmm_model.pkl")

# Make sure both target directories exist; to_csv and joblib.dump do not
# create missing parent directories themselves.
for target in (clustered_path, model_path):
    target.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(clustered_path, index=False)

# Save model
joblib.dump(best_gmm, model_path)
print("Results saved to data/processed/ and models/")