In [1]:
import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report


In [2]:
with open('linkedin-cvs-annotated.json', 'r', encoding='utf-8-sig') as f:
        data = json.load(f)


In [3]:
dataset = []

for person_jobs in data:
    active_jobs = [j for j in person_jobs if j.get('status') == 'ACTIVE']
    active_jobs.sort(key=lambda x: x.get('startDate') or "", reverse=True)

    if active_jobs:
        current_job = active_jobs[0]
        current_title = current_job.get('position', "")
        current_department = current_job.get('department', "Unknown")
        current_seniority = current_job.get('seniority', "Unknown")
    else:
        current_title = ""
        current_department = "Unemployed/Unknown"
        current_seniority = "Unknown"

    dataset.append({
        "profile_text": current_title,          # برای zero-shot می‌تونی بعداً غنی‌ترش کنی
        "current_title": current_title,
        "current_department": current_department,
        "current_seniority": current_seniority,
    })

X = pd.DataFrame(dataset)
print("X shape:", X.shape)
X.head()

X shape: (609, 4)


Unnamed: 0,profile_text,current_title,current_department,current_seniority
0,Prokurist,Prokurist,Other,Management
1,Solutions Architect,Solutions Architect,Information Technology,Professional
2,Medizintechnik Beratung,Medizintechnik Beratung,Consulting,Professional
3,Director expansión de negocio.,Director expansión de negocio.,Business Development,Director
4,"APL-ansvarig, samordning","APL-ansvarig, samordning",Administrative,Lead


In [None]:
dept_df = pd.read_csv("department-v2.csv")     
sen_df  = pd.read_csv("seniority-v2.csv")      

dept_labels = sorted(dept_df["label"].astype(str).unique())
sen_labels  = sorted(sen_df["label"].astype(str).unique())

print("Num dept labels:", len(dept_labels))
print("Num seniority labels:", len(sen_labels))
dept_labels[:10], sen_labels[:10]


Num dept labels: 11
Num seniority labels: 5


(['Administrative',
  'Business Development',
  'Consulting',
  'Customer Support',
  'Human Resources',
  'Information Technology',
  'Marketing',
  'Other',
  'Project Management',
  'Purchasing'],
 ['Director', 'Junior', 'Lead', 'Management', 'Senior'])

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")
dept_label_emb = model.encode(dept_labels, convert_to_numpy=True, normalize_embeddings=True)
sen_label_emb  = model.encode(sen_labels,  convert_to_numpy=True, normalize_embeddings=True)

dept_label_emb.shape, sen_label_emb.shape


((11, 384), (5, 384))

In [None]:

texts = X["profile_text"].fillna("").astype(str).tolist()

text_emb = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

# similarity: (n_samples x n_labels)
dept_sim = cosine_similarity(text_emb, dept_label_emb)
sen_sim  = cosine_similarity(text_emb, sen_label_emb)


dept_pred_idx = dept_sim.argmax(axis=1)
sen_pred_idx  = sen_sim.argmax(axis=1)

X["pred_dept_zero_shot"] = [dept_labels[i] for i in dept_pred_idx]
X["pred_sen_zero_shot"]  = [sen_labels[i]  for i in sen_pred_idx]

X[["current_title", "pred_dept_zero_shot", "pred_sen_zero_shot"]].head(10)


Unnamed: 0,current_title,pred_dept_zero_shot,pred_sen_zero_shot
0,Prokurist,Administrative,Director
1,Solutions Architect,Project Management,Management
2,Medizintechnik Beratung,Information Technology,Management
3,Director expansión de negocio.,Consulting,Director
4,"APL-ansvarig, samordning",Sales,Senior
5,Kaufmännischer Leiter,Administrative,Junior
6,Lab-Supervisor,Project Management,Management
7,Human Resources Generalist,Human Resources,Management
8,Managing Director,Project Management,Director
9,,Other,Lead


In [8]:
print("Dept accuracy:", accuracy_score(X["current_department"], X["pred_dept_zero_shot"]))
print("Sen accuracy:", accuracy_score(X["current_seniority"], X["pred_sen_zero_shot"]))

print("\n--- Department report ---")
print(classification_report(X["current_department"], X["pred_dept_zero_shot"], zero_division=0))

print("\n--- Seniority report ---")
print(classification_report(X["current_seniority"], X["pred_sen_zero_shot"], zero_division=0))


Dept accuracy: 0.18719211822660098
Sen accuracy: 0.25287356321839083

--- Department report ---
                        precision    recall  f1-score   support

        Administrative       0.03      0.60      0.07        10
  Business Development       0.18      0.33      0.24        18
            Consulting       0.28      0.57      0.37        28
      Customer Support       0.16      0.67      0.26         6
       Human Resources       0.41      0.75      0.53        16
Information Technology       0.52      0.24      0.33        55
             Marketing       0.44      0.42      0.43        19
                 Other       0.05      0.03      0.04       246
    Project Management       0.32      0.65      0.43        31
            Purchasing       0.38      0.67      0.48        12
                 Sales       0.56      0.38      0.45        37
    Unemployed/Unknown       0.00      0.00      0.00       131

              accuracy                           0.19       609
      