In [None]:
# 1. Imports & Paths
import importlib.util

import os, sys, re, pandas as pd
sys.path.append(os.path.join(os.pardir, "utils"))

# CSV inputs, addressed relative to the notebook's parent directory.
# NOTE(review): a later cell assigns these same names again (and points
# resume_path at 'resumes_clustered.csv'); whichever cell ran last wins.
DATA_DIR = os.path.join(os.pardir, "data")
resume_path = os.path.join(DATA_DIR, "resumes_cleaned.csv")
jobs_path = os.path.join(DATA_DIR, "jobs_cleaned.csv")

# Embedding matrices (.npy) live in an 'embeddings' folder under DATA_DIR.
# The folder is derived from DATA_DIR rather than from a literal
# "data/embeddings" so that no path separator is hard-coded and relocating
# DATA_DIR moves the embeddings folder with it.
EMB_DIR = os.path.join(DATA_DIR, "embeddings")
remb_path = os.path.join(EMB_DIR, "resume_embeddings.npy")
jemb_path = os.path.join(EMB_DIR, "job_embeddings.npy")

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing each cell so edits to helper modules take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2
# Star import of the project's `utils` module.
# NOTE(review): later cells use infer_keyword_domain and
# DOMAIN_KEYWORDS_REFERENCE, neither of which is defined in this notebook, so
# they presumably come from this import - confirm against utils.
from utils import *


In [None]:
# 7clustering.ipynb — Resume–Job Domain Alignment + Visualization + Matching

# =============================================
# 1️⃣ Imports & Paths
# =============================================
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity



# Locations of the tabular inputs and of the pre-computed embedding matrices.
DATA_DIR = os.path.join(os.pardir, 'data')
EMB_DIR = os.path.join(DATA_DIR, 'embeddings')

# CSV tables sit directly in DATA_DIR ...
resume_path, jobs_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('resumes_clustered.csv', 'jobs_cleaned.csv')
)
# ... while the .npy embedding files sit in its 'embeddings' sub-folder.
remb_path, jemb_path = (
    os.path.join(EMB_DIR, npy_name)
    for npy_name in ('resume_embeddings.npy', 'job_embeddings.npy')
)

# =============================================
# 2️⃣ Load Data and Embeddings
# =============================================
# Embedding matrices are read from the .npy paths; the two tables from CSV.
job_emb = np.load(jemb_path)
resume_emb = np.load(remb_path)

jobs = pd.read_csv(jobs_path)
resumes = pd.read_csv(resume_path)

# Quick sanity line so the reader can see how much data was loaded.
print(f"Loaded {len(resumes)} resumes and {len(jobs)} job posts.")

# =============================================
# 3️⃣ K-Means Clustering for Job Embeddings
# =============================================
# Partition the job embeddings into `n_clusters` groups and store each
# posting's cluster id in a new 'PredictedCluster' column.
n_clusters = 12

# n_init is pinned explicitly. scikit-learn changed the default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning about it in 1.2/1.3), so
# leaving it implicit makes the cluster assignments depend on the installed
# version even though random_state is fixed.
job_kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
jobs['PredictedCluster'] = job_kmeans.fit_predict(job_emb)

# =============================================
# 4️⃣ Keyword-Based Domain Labeling for Jobs
# =============================================
# Reference implementation, left commented out. The live infer_keyword_domain
# is not defined in this notebook.
# NOTE(review): presumably it is supplied by `from utils import *` - confirm.
# def infer_keyword_domain(text, keywords_dict):
#     text = str(text).lower()
#     scores = {domain: sum(kw in text for kw in kws) for domain, kws in keywords_dict.items()}
#     return max(scores, key=scores.get) if scores else 'Unknown'

# Tag every posting with a keyword-derived domain.
jobs['KeywordDomain'] = jobs['job_text_clean'].apply(
    lambda job_text: infer_keyword_domain(job_text, DOMAIN_KEYWORDS_REFERENCE)
)

# Each K-Means cluster is named after the keyword domain that occurs most
# often among its member postings.
cluster_labels = {
    cluster_id: members['KeywordDomain'].value_counts().idxmax()
    for cluster_id, members in jobs.groupby('PredictedCluster')
}

# Propagate the cluster-level name back onto every posting.
jobs['JobClusterDomainLabel'] = jobs['PredictedCluster'].map(cluster_labels)

# =============================================
# 5️⃣ Compute Cross-Domain Similarity
# =============================================
# Distinct domain labels on each side, in order of first appearance.
job_domains = jobs['JobClusterDomainLabel'].unique()
resume_domains = resumes['ClusterDomainLabel'].unique()

# One centroid per domain: the mean of the embedding rows whose table row
# carries that label (the boolean mask selects rows of the embedding matrix).
resume_centroids = [
    resume_emb[resumes['ClusterDomainLabel'] == label].mean(axis=0)
    for label in resume_domains
]
job_centroids = [
    job_emb[jobs['JobClusterDomainLabel'] == label].mean(axis=0)
    for label in job_domains
]

# Rows follow resume_domains, columns follow job_domains.
similarity = cosine_similarity(resume_centroids, job_centroids)

# =============================================
# 6️⃣ Visualize Cross-Domain Alignment (Heatmap)
# =============================================
# Annotated heatmap of the centroid similarities: one row per resume domain,
# one column per job domain.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    similarity,
    annot=True,
    fmt='.2f',
    cmap='YlGnBu',
    xticklabels=job_domains,
    yticklabels=resume_domains,
    ax=ax,
)
ax.set_title('Resume‚ÄìJob Domain Similarity (Cosine)')
ax.set_xlabel('Job Domains')
ax.set_ylabel('Resume Domains')
plt.show()

# =============================================
# 7️⃣ Visualization: Top 3 Matches per Resume Domain (Bar Chart)
# =============================================
# For every resume domain keep the (up to) three job domains with the highest
# centroid similarity, as (resume_domain, job_domain, score) records.
summary_data = [
    (resume_domain, job_domains[j], similarity[i, j])
    for i, resume_domain in enumerate(resume_domains)
    for j in np.argsort(similarity[i])[::-1][:3]
]

summary_df = pd.DataFrame(summary_data, columns=['ResumeDomain', 'JobDomain', 'Similarity'])

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=summary_df, x='ResumeDomain', y='Similarity', hue='JobDomain', palette='Set2', ax=ax)
ax.set_title('Top 3 Matching Job Domains for Each Resume Domain')
ax.set_ylabel('Cosine Similarity')

# The axis normally starts at 0.6 to make small differences visible. A fixed
# floor would clip or hide any bar whose similarity is below 0.6, so the floor
# is lowered (to the next tenth below the smallest bar) only when needed.
lowest_bar = summary_df['Similarity'].min()
y_floor = round(np.floor(lowest_bar * 10) / 10 - 0.1, 1) if lowest_bar < 0.6 else 0.6
ax.set_ylim(y_floor, 1.0)

# Legend is placed outside the axes so it does not cover the bars.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# =============================================
# 8️⃣ Text Summary of Domain Alignment
# =============================================
# Print, for each resume domain, its best and second-best matching job domain.
for i, resume_domain in enumerate(resume_domains):
    # A single stable descending ranking supplies both the best and the
    # runner-up. Ranking once guarantees the two indices differ even when
    # scores tie (separate argmax / argsort calls could pick the same column),
    # and ranked[0] still equals np.argmax because ties keep column order.
    ranked = np.argsort(-similarity[i], kind='stable')

    top_idx = ranked[0]
    best_match = job_domains[top_idx]
    score = similarity[i, top_idx]
    line = f"{resume_domain} resumes best match {best_match} jobs (similarity={score:.3f})"

    # With only one job domain there is no runner-up to report.
    if len(ranked) > 1:
        second_idx = ranked[1]
        second_best = job_domains[second_idx]
        second_score = similarity[i, second_idx]
        line += f", followed by {second_best} ({second_score:.3f})"

    print(line + ".")

# =============================================
# 9️⃣ Individual Resume-to-Job Matching
# =============================================
# For each resume vector, find the closest job-domain centroid.
# The resume table and the embedding matrix are paired row by row, so a length
# mismatch would silently mislabel matches; fail early instead.
if len(resumes) != len(resume_emb):
    raise ValueError(
        f"resumes has {len(resumes)} rows but resume_emb has {len(resume_emb)} vectors"
    )

# One vectorised call replaces a per-resume cosine_similarity call in a Python
# loop. Result shape: (number of resumes, number of job domains).
resume_job_sims = cosine_similarity(resume_emb, np.asarray(job_centroids))
best_job_idx = resume_job_sims.argmax(axis=1)
row_ids = np.arange(len(resume_emb))

match_df = pd.DataFrame({
    'ResumeIndex': row_ids,
    'ResumeDomain': resumes['ClusterDomainLabel'].to_numpy(),
    'MatchedJobDomain': np.asarray(job_domains)[best_job_idx],
    'Similarity': resume_job_sims[row_ids, best_job_idx],
})

# Kept for backward compatibility: the same records as a list of tuples.
resume_to_job = list(match_df.itertuples(index=False, name=None))

# Save the mapping
match_out_path = os.path.join(DATA_DIR, 'resume_job_matches.csv')
match_df.to_csv(match_out_path, index=False)
print(f'‚úÖ Saved individual resume‚Üíjob domain matches to {match_out_path}')

# =============================================
# 🔚 Summary
# =============================================
# Closing recap of what the cell above produced, one line per step.
recap_lines = (
    "\nPipeline complete:",
    "1. Generated job domain clusters.",
    "2. Compared resume‚Üîjob semantic centroids (heatmap + bar chart).",
    "3. Saved per-resume best job domain matches for future analysis.",
)
for recap_line in recap_lines:
    print(recap_line)

In [None]:
# Reload the saved per-resume matches from disk. The path is built from
# DATA_DIR (the same folder the matches file is written to) instead of
# repeating a "../data/..." literal, so it cannot drift if DATA_DIR changes.
matches = pd.read_csv(os.path.join(DATA_DIR, "resume_job_matches.csv"))
matches.head()

In [None]:
idx = 0  # change this number to see other examples

# Look up the saved match record and the resume text that share row label idx.
match_row = matches.loc[idx]
resume_domain = match_row["ResumeDomain"]
job_domain = match_row["MatchedJobDomain"]
resume_text = resumes.loc[idx, "Resume_clean"]

print(
    f"Resume Domain: {resume_domain}",
    f"Matched Job Domain: {job_domain}",
    "---- Resume ----",
    sep="\n",
)
print(resume_text[:600])  # show first 600 chars


In [None]:
# For every reference domain, show the first job posting whose cleaned text
# contains at least one of that domain's keywords as a whole term.
for domain_name, keywords in DOMAIN_KEYWORDS_REFERENCE.items():
    # Keywords are literal strings, so escape regex metacharacters; otherwise
    # a keyword such as "c++" or ".net" is an invalid or over-broad pattern.
    # Empty keywords are dropped because they would match every posting.
    literal_terms = [re.escape(kw) for kw in keywords if kw]

    if literal_terms:
        # (?<!\w) ... (?!\w) behaves like \b ... \b for keywords that start
        # and end with a word character, and still works when a keyword starts
        # or ends with punctuation (where \b would fail to match).
        pattern = "|".join(rf"(?<!\w){term}(?!\w)" for term in literal_terms)
        has_keyword = jobs['job_text_clean'].str.contains(pattern, case=False, na=False, regex=True)
        keyword_matches = jobs[has_keyword]
    else:
        # No usable keywords: report "no match" rather than matching all rows.
        keyword_matches = jobs.iloc[0:0]

    if not keyword_matches.empty:
        print(f"\n---- Matching Job Posting for {domain_name} ----")
        print(keyword_matches.iloc[0]['job_text_clean'][:600])
    else:
        print(f"\nNo matching job posting found for {domain_name}")


In [None]:
# Show top 5 strongest matches overall
top_matches = match_df.sort_values('Similarity', ascending=False).head(5)

for _, row in top_matches.iterrows():
    r_idx = row['ResumeIndex']
    resume_text = resumes.loc[r_idx, "Resume_clean"][:400]

    # Pick one example posting from the matched job domain. The fixed
    # random_state means the same posting is shown for a given domain each time.
    job_domain = row['MatchedJobDomain']
    job_matches = jobs[jobs["JobClusterDomainLabel"] == job_domain]

    # Guard against a domain with no postings (sampling from an empty frame
    # raises), using the same placeholder as the sampled-matches cell below.
    if not job_matches.empty:
        job_match = job_matches.sample(1, random_state=42)
        job_text = job_match.iloc[0]["job_text_clean"][:400]
    else:
        job_text = "[No job text found for this domain]"

    print(f"\nResume #{r_idx} ‚Äî {row['ResumeDomain']} ‚Üí {job_domain} (Similarity: {row['Similarity']:.3f})")
    print("-" * 90)
    print("Resume snippet:")
    print(resume_text)
    print()
    print("Matched job posting snippet:")
    print(job_text)
    print("=" * 100)


In [None]:
# =============================================
# 🔎 Inspect Sample Resume-Job Matches
# =============================================

# Five resume/job pairs drawn with a fixed seed so the selection is repeatable.
sample = match_df.sample(5, random_state=42)

for _, row in sample.iterrows():
    r_idx = row['ResumeIndex']
    job_domain = row['MatchedJobDomain']

    # First 400 characters of the resume behind this match.
    resume_text = resumes.loc[r_idx, "Resume_clean"][:400]

    # One example posting from the matched job domain, or a placeholder when
    # that domain has no postings.
    job_matches = jobs[jobs["JobClusterDomainLabel"] == job_domain]
    job_text = (
        "[No job text found for this domain]"
        if job_matches.empty
        else job_matches.sample(1, random_state=42).iloc[0]["job_text_clean"][:400]
    )

    print(
        f"\nResume #{r_idx} ‚Äî {row['ResumeDomain']} ‚Üí {job_domain} (Similarity: {row['Similarity']:.3f})",
        "-" * 90,
        "Resume snippet:",
        resume_text,
        "",
        "Matched job posting snippet:",
        job_text,
        "=" * 100,
        sep="\n",
    )


In [None]:
# List the column labels of the reloaded matches table.
match_columns = matches.columns
print(match_columns)
