In [None]:
# Cell 1: Install Dependencies
# Uses the %pip magic (rather than !pip) so packages are installed into the
# environment of the running kernel; -q keeps pip's output quiet.
# NOTE(review): versions are unpinned, so a fresh run may resolve different
# releases — consider pinning (pkg==x.y.z) for reproducibility.
# NOTE(review): chromadb is installed here but not imported in the import cell —
# confirm it is still needed. pandas, numpy and joblib are imported below but not
# listed here; presumably they arrive as transitive dependencies — verify.
%pip install -q sentence-transformers scikit-learn chromadb matplotlib

In [None]:
# Cell 2: Import Libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import random
import joblib
from sklearn.decomposition import PCA

In [None]:
# Cell 3: Load and Split Dataset
# Reads the raw CSV, makes a reproducible 80/20 train/test split, and builds the
# combined "role + job description" text column that is embedded for clustering.
data = pd.read_csv("resume_screening_train.csv")

# Seeded 80% sample for training; every remaining row becomes the test set.
train = data.sample(frac=0.8, random_state=42)
test = data.drop(train.index)

# The two partitions must cover the dataset exactly once.
assert len(train) + len(test) == len(data), "train/test split lost or duplicated rows"

# Reset index so positional indices line up with the embedding arrays built later.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)


def _role_with_jobdesc(frame):
    """Return 'Role' and 'job_description' joined by a single space.

    Missing values are treated as empty strings: pandas string concatenation
    propagates NaN, and a NaN entry cannot be encoded as text downstream.
    """
    return frame['Role'].fillna("") + " " + frame['job_description'].fillna("")


# Concatenate Role and Job Description
train['role_jobdesc'] = _role_with_jobdesc(train)
test['role_jobdesc'] = _role_with_jobdesc(test)

In [None]:
# Cell 4: Load SBERT and Encode Embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def _embed(frame, column):
    """Encode the text in `frame[column]` with the SBERT model, showing a progress bar."""
    texts = frame[column].tolist()
    return model.encode(texts, show_progress_bar=True)


# Role + job-description text: the only input used for clustering.
role_jobdesc_embeddings_train = _embed(train, 'role_jobdesc')
role_jobdesc_embeddings_test = _embed(test, 'role_jobdesc')

# Resume text: embedded only for the later cosine-similarity evaluation,
# never fed to the clustering model.
resume_embeddings_train = _embed(train, 'resume')
resume_embeddings_test = _embed(test, 'resume')

# Clustering features are aliases of the job-description embeddings.
X_train_embeddings, X_test_embeddings = (
    role_jobdesc_embeddings_train,
    role_jobdesc_embeddings_test,
)

In [None]:
# Cell 5: Elbow Method to Determine Best k (using only job description embeddings)
# Sweeps candidate cluster counts, recording inertia (for the elbow plot) and the
# silhouette score (used to actually pick best_k) for each k.
n_train_samples = len(X_train_embeddings)

# silhouette_score requires 2 <= n_clusters <= n_samples - 1, so cap the sweep
# for small training sets; for 15 or more samples this is the usual range(2, 15).
k_values = range(2, min(15, n_train_samples))
if len(k_values) == 0:
    raise ValueError(
        f"Need at least 3 training samples to select k, got {n_train_samples}."
    )

inertias = []
silhouette_scores = []

# Fit each candidate once and collect both metrics from the same model; the
# fixed random_state makes a second fit per k redundant.
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_train_embeddings)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_train_embeddings, labels))

# Plot elbow curve
plt.plot(k_values, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

# The elbow plot is for visual inspection only; best_k is the k with the
# highest silhouette score.
best_k = k_values[int(np.argmax(silhouette_scores))]
print(f"Best k selected: {best_k}")

In [None]:
# Cell 6: Train Final K-Means Model (on job descriptions only)
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing bare name keeps the fitted estimator as the cell's displayed output.
kmeans = KMeans(n_clusters=best_k, random_state=42).fit(X_train_embeddings)
kmeans

In [None]:
# Cell 7: Cluster Visualization after Training (using job descriptions only)
# Projects the training embeddings and the fitted centroids to 2-D with PCA and
# draws them coloured by cluster assignment.

# Seed PCA so the projection (and therefore the figure) is reproducible even
# when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
train_2d = pca.fit_transform(X_train_embeddings)  # PCA on job description embeddings

# Get cluster assignments and project the centroids with the same fitted PCA.
train_clusters = kmeans.predict(X_train_embeddings)
centroids_2d = pca.transform(kmeans.cluster_centers_)

# 'tab10' only has 10 distinct colours; with more clusters several would share
# a colour, so switch to the 20-colour palette in that case.
cluster_cmap = 'tab10' if kmeans.n_clusters <= 10 else 'tab20'

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(train_2d[:, 0], train_2d[:, 1], c=train_clusters, cmap=cluster_cmap, alpha=0.6, s=30)
ax.scatter(centroids_2d[:, 0], centroids_2d[:, 1], c='red', marker='X', s=200, label='Centroids')
ax.set_title("Cluster Visualization (Training Data - Job Descriptions)")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.legend()
plt.show()

In [None]:
# Cell 8: Evaluation on Training Data (job descriptions only)
# Assign every training embedding to a cluster of the fitted model, then
# score the resulting partition with the silhouette coefficient.
train_labels = kmeans.predict(X_train_embeddings)
train_silhouette = silhouette_score(
    X=X_train_embeddings,
    labels=train_labels,
)
print(f"Silhouette Score (Train): {train_silhouette:.4f}")

In [None]:
# Cell 9: Testing Phase (predict clusters for test job descriptions)
# The model is not refitted here: held-out embeddings are only assigned to the
# clusters learned on the training split, and that assignment is then scored.
test_labels = kmeans.predict(X_test_embeddings)
test_silhouette = silhouette_score(
    X=X_test_embeddings,
    labels=test_labels,
)
print(f"Silhouette Score (Test): {test_silhouette:.4f}")

In [None]:
# Cell 10: Cosine Similarity Evaluation (between resumes and job descriptions - for evaluation only)
# Computes the cosine similarity of each test resume with its own role + job
# description, prints a few sampled pairs, then the test-set average.
n_test_pairs = len(resume_embeddings_test)
if n_test_pairs == 0:
    raise ValueError("Test set is empty; cannot evaluate resume/job-description similarity.")

# Row-wise cosine similarity for all pairs in one vectorized pass instead of
# one pairwise call per row.
resume_matrix = np.asarray(resume_embeddings_test, dtype=np.float64)
role_jobdesc_matrix = np.asarray(role_jobdesc_embeddings_test, dtype=np.float64)
dot_products = (resume_matrix * role_jobdesc_matrix).sum(axis=1)
norm_products = (
    np.linalg.norm(resume_matrix, axis=1) * np.linalg.norm(role_jobdesc_matrix, axis=1)
)
# A zero-norm embedding gets similarity 0 rather than a division-by-zero NaN.
all_similarities = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products),
    where=norm_products > 0,
)

# Seeded local RNG: the same samples are shown on every run, without touching
# the global random state. min() keeps this valid for test sets under 5 rows.
sample_indices = random.Random(42).sample(range(n_test_pairs), min(5, n_test_pairs))

for idx in sample_indices:
    sim_score = all_similarities[idx]
    print(f"Sample {idx} | Cosine Similarity (Resume ↔ Role+JD): {sim_score:.4f}")

# Average similarity
print(f"Average Cosine Similarity (Resume ↔ Role+JD, Test Set): {np.mean(all_similarities):.4f}")

In [None]:
# Cell 11: Save Artifacts
# Serialise the fitted clustering model to the working directory with joblib.
joblib.dump(
    value=kmeans,
    filename="kmeans_model.pkl",
)
print("KMeans model saved as kmeans_model.pkl")