In [None]:
# 1. Imports & Paths
import importlib.util
import os, sys, re, pandas as pd
sys.path.append(os.path.join(os.pardir, "utils"))

# Cleaned input tables, located one directory above the working directory.
DATA_DIR = os.path.join(os.pardir, "data")
resume_path = os.path.join(DATA_DIR, "resumes_cleaned.csv")
jobs_path = os.path.join(DATA_DIR, "jobs_cleaned.csv")

# Precomputed embedding matrices. Built on top of DATA_DIR (rather than a
# second hard-coded "data/embeddings" string) so the two locations cannot
# drift apart and no path separator is baked into a literal.
EMB_DIR = os.path.join(DATA_DIR, "embeddings")
remb_path = os.path.join(EMB_DIR, "resume_embeddings.npy")
jemb_path = os.path.join(EMB_DIR, "job_embeddings.npy")

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): this star import is the only visible source for several names
# the cells below use without importing them (np, MinMaxScaler, KMeans, PCA,
# plt, sns, DOMAIN_KEYWORDS_REFERENCE). Presumably utils re-exports them;
# confirm, and consider replacing this with explicit imports.
from utils import *


In [None]:
# Load the cleaned tables and their precomputed embedding matrices.
resume_df = pd.read_csv(resume_path)
job_posts_df = pd.read_csv(jobs_path)
resume_embeddings = np.load(remb_path)
job_embeddings = np.load(jemb_path)

# Later cells stack each embedding matrix side by side with per-row domain
# features computed from the matching table, so the row counts must agree.
# Check that here, where the cause is obvious, instead of failing later with
# a shape error inside np.hstack.
assert resume_embeddings.shape[0] == len(resume_df), (
    f"resume embeddings have {resume_embeddings.shape[0]} rows "
    f"but resume_df has {len(resume_df)}"
)
assert job_embeddings.shape[0] == len(job_posts_df), (
    f"job embeddings have {job_embeddings.shape[0]} rows "
    f"but job_posts_df has {len(job_posts_df)}"
)

In [None]:
def domain_score_vector(text, domain_keywords=DOMAIN_KEYWORDS_REFERENCE):
    """Count, per domain, how many of its keywords occur in ``text``.

    Matching is a case-insensitive substring test: each keyword contributes
    at most 1 to its domain no matter how often it occurs, and a keyword
    also matches when it appears inside a longer word.

    Args:
        text: Document to score. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as empty text so a
            missing cell is not scored as the literal words "none"/"nan".
        domain_keywords: Mapping of domain name to an iterable of keyword
            strings.

    Returns:
        dict mapping every domain name in ``domain_keywords`` to the number
        of its keywords found in ``text``.
    """
    # NaN is the only float that is not equal to itself.
    is_missing = text is None or (isinstance(text, float) and text != text)
    text_lower = "" if is_missing else str(text).lower()
    return {
        # Lower-case the keyword too: the text is lower-cased, so a keyword
        # containing any capital letter could otherwise never match.
        domain: sum(kw.lower() in text_lower for kw in keywords)
        for domain, keywords in domain_keywords.items()
    }

def detect_domain(text, domain_keywords=DOMAIN_KEYWORDS_REFERENCE):
    """Return the domain whose keywords match ``text`` most often.

    Args:
        text: Document to classify; passed unchanged to
            ``domain_score_vector``.
        domain_keywords: Mapping of domain name to an iterable of keyword
            strings.

    Returns:
        The name of the highest-scoring domain (the first one in mapping
        order on ties), or ``"other"`` when no keyword matches or when
        ``domain_keywords`` is empty.
    """
    scores = domain_score_vector(text, domain_keywords)
    # An empty mapping yields no scores; max() on it would raise ValueError.
    if not scores:
        return "other"
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "other"

In [None]:
# NOTE(review): no visible cell reads this instance. The resume and job
# feature cells create their own scalers, and `scaler` is reassigned in the
# job-clustering cell. It looks like a leftover; confirm before removing.
scaler = MinMaxScaler()

In [None]:
# Score every cleaned resume against each domain's keyword list and collect
# the per-domain counts in a frame that shares resume_df's index.
resume_domain_vectors = list(map(domain_score_vector, resume_df["Resume_clean"]))
resume_domain_df = pd.DataFrame(resume_domain_vectors, index=resume_df.index)

# Min-max scale the keyword counts with a scaler dedicated to the resumes.
resume_scaler = MinMaxScaler()
resume_scaler.fit(resume_domain_df)
resume_domain_scaled = resume_scaler.transform(resume_domain_df)

# Hybrid feature matrix: embedding columns followed by scaled domain columns.
resume_features = np.hstack((resume_embeddings, resume_domain_scaled))

# Partition the resumes into 6 clusters (fixed seed) and record each
# resume's cluster id on the frame.
kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(resume_features)
resume_labels = kmeans.labels_
resume_df["cluster"] = resume_labels



In [None]:
# Attach the raw domain counts to the resume table (without the ID column,
# if there is one) and average every numeric column within each cluster.
resume_with_domains = resume_df.drop(columns=["ID"], errors="ignore").join(resume_domain_df)
resume_cluster_summary = resume_with_domains.groupby("cluster").mean(numeric_only=True)

print(resume_cluster_summary)

# Domain columns considered when naming a cluster.
domain_cols = [
    "Tech & IT",
    "Finance & Accounting",
    "Business & Sales",
    "Law & Advocacy",
    "Healthcare",
    "HR & Operations",
    "Creative & Design",
    "Education",
    "Manufacturing & Construction",
    "Agriculture & Environment",
    "Hospitality & Food",
    "Other Services",
]

# Name each cluster after the domain with the highest mean keyword count.
top_domain = resume_cluster_summary[domain_cols].idxmax(axis=1)
for cluster_id, domain_name in top_domain.items():
    print(f"Resume Cluster {cluster_id}: primarily {domain_name.upper()}")

In [None]:
# --- 10. Visualize clusters (PCA 2D) ---
# Project the hybrid resume features onto two principal components purely
# for plotting; the clustering itself was done on the full feature matrix.
pca = PCA(n_components=2, random_state=42)
reduced = pca.fit_transform(resume_features)

# Explicit figure/axes interface, with labelled axes so the figure can be
# read on its own.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=reduced[:, 0],
    y=reduced[:, 1],
    hue=resume_labels,
    palette="tab10",
    s=10,
    ax=ax,
)
ax.set_title("Resume Clusters (Hybrid Embedding + Domain Features)")
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
ax.legend(title="Cluster")
plt.show()

# --- 11. Inspect domain distribution per cluster ---
# Label each resume with its single best-matching domain ("other" when no
# keyword matches) and cross-tabulate those labels against the cluster ids.
domain_labels = [detect_domain(t) for t in resume_df["Resume_clean"]]
resume_df["domain_label"] = domain_labels
domain_cluster_ct = pd.crosstab(resume_df["domain_label"], resume_df["cluster"])
print(domain_cluster_ct)

In [None]:
# --- 12. Domain keyword counts for every cleaned job post ---
job_domain_vectors = list(map(domain_score_vector, job_posts_df["job_text_clean"]))
job_domain_df = pd.DataFrame(job_domain_vectors, index=job_posts_df.index)

# --- 13. Scale domain features (separate scaler) ---
# The job posts get their own scaler instead of reusing the resume one.
job_scaler = MinMaxScaler()
job_scaler.fit(job_domain_df)
job_domain_scaled = job_scaler.transform(job_domain_df)

# --- 14. Combine embeddings + domain features ---
# Embedding columns first, scaled domain columns appended on the right.
job_features = np.hstack((job_embeddings, job_domain_scaled))

In [None]:
# --- Compute domain features ---
# Give the domain frame the same index as job_posts_df: the summary below
# concatenates the two along axis=1, which aligns rows by index label, so a
# default RangeIndex here would mispair rows whenever job_posts_df's index
# is not 0..n-1.
job_domain_vectors = [domain_score_vector(t) for t in job_posts_df["job_text_clean"]]
job_domain_df = pd.DataFrame(job_domain_vectors, index=job_posts_df.index)

# --- Normalize domain scores ---
scaler = MinMaxScaler()
job_domain_scaled = scaler.fit_transform(job_domain_df)

# --- Combine embeddings + domain features ---
job_features = np.hstack([job_embeddings, job_domain_scaled])

# --- Cluster ---
# 6 clusters with a fixed seed, mirroring the resume clustering.
kmeans_jobs = KMeans(n_clusters=6, random_state=42)
job_labels = kmeans_jobs.fit_predict(job_features)
job_posts_df["cluster"] = job_labels

# --- Summarize by domain ---
# Defined once and reused for both the summary and the top-domain lookup.
domain_cols = [
    "Tech & IT",
    "Finance & Accounting",
    "Business & Sales",
    "Law & Advocacy",
    "Healthcare",
    "HR & Operations",
    "Creative & Design",
    "Education",
    "Manufacturing & Construction",
    "Agriculture & Environment",
    "Hospitality & Food",
    "Other Services",
]
# Mean raw (unscaled) keyword count per domain within each job cluster.
job_cluster_summary = (
    pd.concat([job_posts_df["cluster"], job_domain_df], axis=1)
      .groupby("cluster")[domain_cols]
      .mean()
)
print(job_cluster_summary.round(2))

# --- Identify top domain per job cluster ---
# The summary already contains exactly the domain_cols columns, in that
# order, so idxmax can run on it directly.
top_domain = job_cluster_summary.idxmax(axis=1)
for c, dom in top_domain.items():
    print(f"Job Cluster {c}: primarily {dom.upper()}")


In [None]:
# Project the hybrid job features onto two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
job_reduced = pca.fit_transform(job_features)

# Drawn the same way as the resume cluster plot (seaborn scatter, tab10
# palette, cluster legend, labelled axes) so the two figures are directly
# comparable and each colour can be traced back to a cluster id.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=job_reduced[:, 0],
    y=job_reduced[:, 1],
    hue=job_labels,
    palette="tab10",
    s=10,
    ax=ax,
)
ax.set_title("Job Post Clusters (Hybrid Embedding + Domain Features)")
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
ax.legend(title="Cluster")
plt.show()
