In [6]:
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

In [7]:
# Read the annotated CV dump; the 'utf-8-sig' codec skips a leading BOM if present.
with open('linkedin-cvs-annotated.json', mode='r', encoding='utf-8-sig') as cv_file:
    data = json.loads(cv_file.read())

In [8]:
def build_profile_text(person_jobs, only_active=True, max_jobs=5):
    """Summarise one person's jobs as a single ' | '-separated string.

    Parameters
    ----------
    person_jobs : sequence of dict
        Job records; the keys read here are "status", "startDate",
        "position" and "organization".
    only_active : bool
        When true, restrict to jobs whose "status" equals "ACTIVE". If no
        such job exists, every job is used instead.
    max_jobs : int
        Number of jobs (after sorting) that are considered at all.

    Returns
    -------
    str
        "<position> at <organization>" (or just "<position>" when the
        organization is missing) for each considered job that has a
        position, newest first. Empty string when nothing qualifies.
    """
    candidates = person_jobs
    if only_active:
        # Fall back to the complete history when nothing is marked ACTIVE.
        candidates = [job for job in person_jobs if job.get("status") == "ACTIVE"] or person_jobs

    def _start_key(job):
        # A missing/empty start date sorts as "", i.e. last in descending order.
        return job.get("startDate") or ""

    def _describe(job):
        title = (job.get("position") or "").strip()
        employer = (job.get("organization") or "").strip()
        if not title:
            # Jobs without a position contribute nothing, even if an organization is set.
            return ""
        return f"{title} at {employer}" if employer else title

    newest_first = sorted(candidates, key=_start_key, reverse=True)
    # The max_jobs cut is applied before position-less jobs are dropped.
    descriptions = [_describe(job) for job in newest_first[:max_jobs]]
    return " | ".join(text for text in descriptions if text)


In [9]:
# One record per person; the position in `data` doubles as person_id.
rows = [
    {
        "person_id": person_id,
        "profile_text": build_profile_text(person_jobs, only_active=True, max_jobs=5),
    }
    for person_id, person_jobs in enumerate(data)
]

df = pd.DataFrame(rows)


In [10]:
def extract_labels(data):
    """Collect the distinct department and seniority labels found in `data`.

    `data` is iterated as a collection of per-person job lists; each job's
    "department" and "seniority" values are read. Falsy or whitespace-only
    values are ignored and the rest are stringified and stripped.

    Returns
    -------
    tuple[list[str], list[str]]
        (sorted department labels, sorted seniority labels).
    """
    def _clean(raw):
        # Falsy values (None, "", 0) and whitespace-only strings carry no label.
        if not raw:
            return None
        text = str(raw).strip()
        return text or None

    all_jobs = [job for person_jobs in data for job in person_jobs]

    departments = {_clean(job.get("department")) for job in all_jobs}
    seniorities = {_clean(job.get("seniority")) for job in all_jobs}

    # None is the placeholder for "no usable label" and must not be returned.
    departments.discard(None)
    seniorities.discard(None)

    return sorted(departments), sorted(seniorities)

# Label vocabularies come from every job in the dataset.
department_labels, seniority_labels = extract_labels(data)

for _heading, _values in (
    ("Department labels:", department_labels),
    ("Seniority labels:", seniority_labels),
):
    print(_heading, _values)


Department labels: ['Administrative', 'Business Development', 'Consulting', 'Customer Support', 'Human Resources', 'Information Technology', 'Marketing', 'Other', 'Project Management', 'Purchasing', 'Sales']
Seniority labels: ['Director', 'Junior', 'Lead', 'Management', 'Professional', 'Senior']


In [11]:
# Wrap each bare label in a short sentence; these sentences are what gets embedded.
dept_prompts = [
    f"This job belongs to the {label} department."
    for label in department_labels
]
sen_prompts = [
    f"This job has {label} seniority level."
    for label in seniority_labels
]


In [12]:
# A single encoder instance produces every embedding below.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Label prompts are embedded first (departments, then seniorities).
dept_emb, sen_emb = (
    model.encode(prompts, normalize_embeddings=True)
    for prompts in (dept_prompts, sen_prompts)
)

# Profile texts; a missing text is embedded as the empty string.
texts = df["profile_text"].fillna("").tolist()
text_emb = model.encode(texts, normalize_embeddings=True)

# Similarity matrices: one row per profile, one column per label prompt.
dept_sim = cosine_similarity(text_emb, dept_emb)
sen_sim = cosine_similarity(text_emb, sen_emb)

# Column index of the most similar prompt for every profile.
dept_idx = dept_sim.argmax(axis=1)
sen_idx = sen_sim.argmax(axis=1)

df["predicted_department"] = [department_labels[col] for col in dept_idx]
df["predicted_seniority"] = [seniority_labels[col] for col in sen_idx]

# The winning similarity itself is kept as a confidence-style score.
df["dept_score"] = dept_sim.max(axis=1)
df["sen_score"] = sen_sim.max(axis=1)


In [13]:
def get_ground_truth(person_jobs):
    """Return the (department, seniority) labels of a person's latest ACTIVE job.

    Labels are normalised the same way `extract_labels` builds the label
    vocabulary: falsy or whitespace-only values become None, everything else
    is stringified and stripped. Without this, a label with stray whitespace
    could never equal a predicted label, and an empty-string label would
    survive a later `dropna` and always be scored as a miss.

    Parameters
    ----------
    person_jobs : sequence of dict
        Job records; the keys read here are "status", "startDate",
        "department" and "seniority".

    Returns
    -------
    tuple
        (department, seniority); each element is a stripped string or None.
        (None, None) when the person has no job with status "ACTIVE".
    """
    def _normalize(label):
        # Mirror extract_labels: skip falsy values, strip the rest.
        if not label:
            return None
        text = str(label).strip()
        return text or None

    active = [job for job in person_jobs if job.get("status") == "ACTIVE"]
    if not active:
        return None, None

    # "Latest" means the greatest startDate value; a missing date compares as "".
    # NOTE(review): assumes startDate strings order chronologically (e.g. ISO-style) - confirm.
    # max() returns the first maximal item, the same pick as sorted(..., reverse=True)[0].
    latest = max(active, key=lambda job: job.get("startDate") or "")
    return _normalize(latest.get("department")), _normalize(latest.get("seniority"))


In [14]:
# One (department, seniority) pair per person, in the same order as `data` / `df`.
truth_pairs = [get_ground_truth(person_jobs) for person_jobs in data]

true_dept = [department for department, _ in truth_pairs]
true_sen = [seniority for _, seniority in truth_pairs]

df["true_department"] = true_dept
df["true_seniority"] = true_sen


In [15]:
# Score only the people for whom both ground-truth labels are present.
has_both_labels = df["true_department"].notna() & df["true_seniority"].notna()
eval_df = df[has_both_labels]

dept_acc = accuracy_score(
    eval_df["true_department"],
    eval_df["predicted_department"],
)
sen_acc = accuracy_score(
    eval_df["true_seniority"],
    eval_df["predicted_seniority"],
)

print(f"Department accuracy: {dept_acc:.3f}")
print(f"Seniority accuracy:  {sen_acc:.3f}")


Department accuracy: 0.207
Seniority accuracy:  0.301
