# Embedding Analysis for Resumes

## 1. Setup & Imports

In [1]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import plotly.express as px

## 2. Load Resume Dataset

In [2]:
data_path = "../data/UpdatedResumeDataSet.csv"
df = pd.read_csv(data_path)

st_col = "text" if "text" in df.columns else df.columns[0]  # fallback
df = df.dropna(subset=[st_col])

print("Total resumes loaded:", len(df))
df.head()

Total resumes loaded: 962


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


## 3. Vectorize Resume Text (TF-IDF)

In [3]:
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df[st_col]).toarray()

print("TF-IDF Embedding shape:", X.shape)

TF-IDF Embedding shape: (962, 34)


## 4. Dimensionality Reduction

In [4]:
# PCA or TSNE for 2D visualization
reducer = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = reducer.fit_transform(X)

## 5. Clustering (KMeans)

In [5]:
k = 5  # Number of clusters (can tune)
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

df['cluster'] = clusters
df['x'] = X_embedded[:, 0]
df['y'] = X_embedded[:, 1]

## 6. Visualize Clusters

In [6]:
fig = px.scatter(df, x='x', y='y', color='cluster',
                 hover_data=[st_col],
                 title="Resume Clusters (TSNE + KMeans)",
                 color_continuous_scale='Viridis')
fig.show()

## 7. Analyze Cluster Keywords

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def extract_top_keywords(texts, n=10):
    cv = CountVectorizer(stop_words='english')
    word_matrix = cv.fit_transform(texts)
    sum_words = word_matrix.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

for i in range(k):
    cluster_texts = df[df['cluster'] == i][st_col]
    keywords = extract_top_keywords(cluster_texts)
    print(f"\n🔹 Cluster {i} Top Keywords:")
    for word, freq in keywords:
        print(f"{word}: {freq}")


🔹 Cluster 0 Top Keywords:
developer: 224
java: 84
python: 48
etl: 40
dotnet: 28
sap: 24

🔹 Cluster 1 Top Keywords:
testing: 96
automation: 26

🔹 Cluster 2 Top Keywords:
hr: 44

🔹 Cluster 3 Top Keywords:
engineer: 144
devops: 55
web: 45
designing: 45
data: 40
science: 40
mechanical: 40
sales: 40
operations: 40
manager: 40

🔹 Cluster 4 Top Keywords:
hadoop: 42


## 8. Save Embeddings + Clusters

In [8]:

df[['x', 'y', 'cluster']].to_csv("../data/resume_embeddings.csv", index=False)
print("Embedding file saved.")


Embedding file saved.
