In [5]:
# download resume dataset
import kagglehub
import shutil

download_path = kagglehub.dataset_download("shamimhasan8/resume-vs-job-description-matching-dataset")
target_folder = "../dataset"

shutil.move(download_path, target_folder)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shamimhasan8/resume-vs-job-description-matching-dataset?dataset_version_number=1...


100%|██████████| 927k/927k [00:00<00:00, 5.36MB/s]

Extracting files...





'../dataset/1'

In [6]:
download_path = kagglehub.dataset_download("surendra365/recruitement-dataset")

shutil.move(download_path, target_folder)

Downloading from https://www.kaggle.com/api/v1/datasets/download/surendra365/recruitement-dataset?dataset_version_number=2...


100%|██████████| 1.58M/1.58M [00:00<00:00, 9.29MB/s]

Extracting files...





'../dataset/2'

In [7]:
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
import torch

# use gpu is possible
device = "mps" if torch.backends.mps.is_available() else "cpu"

# load model on CPU first (to avoid meta tensor error)
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
model = model.to(device)

In [8]:
import pandas as pd
file_path = "../dataset/resume_job_matching_dataset.csv"
df = pd.read_csv(file_path)

# normalize similarity score (1-5) linearly from 0 to 1
df['normal_score'] = (df['match_score'] - 1) / 4

In [9]:
import pandas as pd
file_path2 = "../dataset/job_applicant_dataset.csv"
df2 = pd.read_csv(file_path2)

In [10]:
print(df2)

     Job Applicant Name  Age  Gender             Race   Ethnicity  \
0          Daisuke Mori   29    Male  Mongoloid/Asian  Vietnamese   
1        Taichi Shimizu   31    Male  Mongoloid/Asian    Filipino   
2          Sarah Martin   46  Female  White/Caucasian       Dutch   
3          Keith Hughes   43    Male    Negroid/Black   Caribbean   
4           James Davis   49    Male  White/Caucasian     English   
...                 ...  ...     ...              ...         ...   
9995      Jada Williams   30  Female    Negroid/Black    Ghanaian   
9996       Jaden Carter   52    Male    Negroid/Black    Nigerian   
9997         Mia Foster   25  Female  White/Caucasian      German   
9998       Stella Green   51  Female  White/Caucasian       Irish   
9999        Ryo Nishida   46    Male  Mongoloid/Asian        Thai   

                                                 Resume             Job Roles  \
0     Proficient in Injury Prevention, Motivation, N...         Fitness Coach   
1     Pro

In [11]:
# make sure values are from 0 - 1
import numpy as np
df['normal_score'].describe()
df['normal_score'] = np.where(df['normal_score'] >= 0.5, 1, 0)
print(df['normal_score'])

0       1
1       1
2       1
3       1
4       1
       ..
9995    1
9996    0
9997    0
9998    0
9999    1
Name: normal_score, Length: 10000, dtype: int64


In [12]:
#combine 2 datasets
df2_renamed = df2.rename(columns={
    'Best Match': 'normal_score',
    'Job Description': 'job_description',
    "Resume" : 'resume'
})

columns_to_keep = ['job_description', 'resume', 'normal_score']
df2_sub = df2_renamed[columns_to_keep].copy()

for col in df.columns:
    if col not in df2_sub.columns:
        df2_sub.loc[:, col] = pd.NA

# Now concatenate
df_combined = pd.concat([df, df2_sub], ignore_index=True, sort=False)

In [13]:
df_combined

Unnamed: 0,job_description,resume,match_score,normal_score
0,"Data Analyst needed with experience in SQL, Ex...","Experienced professional skilled in SQL, Power...",4,1
1,Data Scientist needed with experience in Stati...,"Experienced professional skilled in Python, De...",4,1
2,Software Engineer needed with experience in Sy...,"Experienced professional skilled in wait, Git,...",5,1
3,"ML Engineer needed with experience in Python, ...","Experienced professional skilled in return, De...",4,1
4,Software Engineer needed with experience in RE...,"Experienced professional skilled in REST APIs,...",5,1
...,...,...,...,...
19995,A Biomedical Engineer designs and develops med...,"Proficient in Biology, Regulatory Compliance, ...",,0
19996,A Teacher shapes the future of students by del...,"Proficient in Communication, Teamwork, Lesson ...",,0
19997,"Diagnose and treat illnesses, prescribe medica...","Proficient in Medical Terminology, Critical Th...",,0
19998,A Fitness Coach is responsible for helping cl...,"Proficient in Exercise Programming, Motivation...",,1


In [14]:
import re

def clean_text(text):
    text = text.lower()
    # collapse multiple spaces/newlines into a single space
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z0-9\s.,\-+#/]', '', text)
    return text.strip()

In [15]:
df_combined['job_description'] = df_combined['job_description'].apply(clean_text)
df_combined['resume'] = df_combined['resume'].apply(clean_text)

In [16]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df_combined,
    test_size=0.2,
    stratify=df_combined["normal_score"],  # keeps balance of 0/1
    random_state=42
)

In [17]:
from sentence_transformers import InputExample

examples = []
for _, row in df_combined.iterrows():
    if pd.notna(row['normal_score']):  # skip NaN rows
        examples.append(
            InputExample(
                texts=[row['resume'], row['job_description']], 
                label=float(row['normal_score'])
            )
        )

train_examples = [
    InputExample(texts=[row['resume'], row['job_description']], label=float(row['normal_score']))
    for _, row in train_df.iterrows()
]

test_examples = [
    InputExample(texts=[row['resume'], row['job_description']], label=float(row['normal_score']))
    for _, row in test_df.iterrows()
]

In [18]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

model = SentenceTransformer("all-MiniLM-L6-v2")

train_loss = losses.CosineSimilarityLoss(model=model)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss
500,0.1919
1000,0.1747
1500,0.1704
2000,0.1592
2500,0.1623
3000,0.1591
3500,0.1594
4000,0.1563
4500,0.1502
5000,0.1548


In [19]:
from sklearn.metrics import roc_auc_score, accuracy_score

job_texts = [ex.texts[1] for ex in test_examples]
resume_texts = [ex.texts[0] for ex in test_examples]
labels = [ex.label for ex in test_examples]

job_embs = model.encode(job_texts, convert_to_numpy=True, show_progress_bar=True)
resume_embs = model.encode(resume_texts, convert_to_numpy=True, show_progress_bar=True)

# cosine similarity
cosine_scores = np.sum(job_embs * resume_embs, axis=1) / (
    np.linalg.norm(job_embs, axis=1) * np.linalg.norm(resume_embs, axis=1)
)

# metrics
roc_auc = roc_auc_score(labels, cosine_scores)
preds = [1 if s > 0.5 else 0 for s in cosine_scores]  # threshold at 0.5
acc = accuracy_score(labels, preds)

print(f"ROC AUC: {roc_auc:.4f}")
print(f"Accuracy: {acc:.4f}")

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

ROC AUC: 0.8050
Accuracy: 0.6970


In [22]:
model.save("../model/all_minilm_finetuned")