In [None]:
!pip install transformers



In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Download the NLTK data packages this notebook requests: the Punkt tokenizer
# tables, the stop-word lists and WordNet.
# The last call is the cell's final expression, so its return value is what the
# notebook displays below the download log.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Location of the applicant dataset inside the mounted Google Drive.
from pathlib import Path

DATA_PATH = Path(r"/content/drive/MyDrive/Colab Notebooks/job_applicant_dataset.csv")

# On a fresh runtime Drive is not mounted yet, so the CSV is unreachable and
# read_csv would raise FileNotFoundError. Mount on demand so this cell works
# under "Restart & Run All" regardless of which other cells have been run.
if not DATA_PATH.exists():
    from google.colab import drive
    drive.mount('/content/drive')

# Load the dataset and print its schema (column names, non-null counts, dtypes).
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Applicant Name  10000 non-null  object
 1   Age                 10000 non-null  int64 
 2   Gender              10000 non-null  object
 3   Race                10000 non-null  object
 4   Ethnicity           10000 non-null  object
 5   Resume              10000 non-null  object
 6   Job Roles           10000 non-null  object
 7   Job Description     10000 non-null  object
 8   Best Match          10000 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 703.3+ KB


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): the dataset CSV under /content/drive is read in an earlier cell;
# Drive has to be mounted before that read can succeed on a fresh runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Job Applicant Name,Age,Gender,Race,Ethnicity,Resume,Job Roles,Job Description,Best Match
0,Daisuke Mori,29,Male,Mongoloid/Asian,Vietnamese,"Proficient in Injury Prevention, Motivation, N...",Fitness Coach,A Fitness Coach is responsible for helping cl...,0
1,Taichi Shimizu,31,Male,Mongoloid/Asian,Filipino,"Proficient in Healthcare, Pharmacology, Medica...",Physician,"Diagnose and treat illnesses, prescribe medica...",0
2,Sarah Martin,46,Female,White/Caucasian,Dutch,"Proficient in Forecasting, Financial Modelling...",Financial Analyst,"As a Financial Analyst, you will be responsibl...",0
3,Keith Hughes,43,Male,Negroid/Black,Caribbean,"Proficient in Budgeting, Supply Chain Optimiza...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1
4,James Davis,49,Male,White/Caucasian,English,"Proficient in Logistics, Negotiation, Procurem...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
from collections import Counter
import re
import numpy as np
import pandas as pd


# Resume text: drop the boilerplate "Proficient in" lead-in, then collapse every
# run of whitespace to a single space and trim both ends.
df['Resume'] = (
    df['Resume']
    .str.replace(r"\bProficient in\b", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# Job descriptions only need the whitespace normalisation.
df['Job Description'] = (
    df['Job Description']
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# Show the cleaned resume column.
df['Resume']

Unnamed: 0,Resume
0,"Injury Prevention, Motivation, Nutrition, Heal..."
1,"Healthcare, Pharmacology, Medical Terminology,..."
2,"Forecasting, Financial Modelling, Excel, Budge..."
3,"Budgeting, Supply Chain Optimization, Risk Man..."
4,"Logistics, Negotiation, Procurement, Supply Ch..."
...,...
9995,"Biology, Regulatory Compliance, Product Develo..."
9996,"Communication, Teamwork, Lesson Planning, Moti..."
9997,"Medical Terminology, Critical Thinking, Surgic..."
9998,"Exercise Programming, Motivation, Physical The..."


In [None]:
def tokenize(text):
    """Split ``text`` into lowercase alphanumeric tokens.

    The text is lowercased, every character that is not an ASCII letter, a
    digit or whitespace is deleted (not replaced by a space, so "dev-ops"
    becomes "devops"), and the remainder is split on whitespace.

    Args:
        text: Input string.

    Returns:
        List of token strings; empty if nothing survives the filtering.
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return cleaned.split()

# One input string per applicant: the resume followed by the job description.
# tokenize() deletes the brackets, so the two markers end up as the ordinary
# tokens "resume" and "job".
df['combined_text'] = "[RESUME] " + df['Resume'] + " [JOB] " + df['Job Description']
tokenized_texts = list(map(tokenize, df['combined_text']))

#Building Vocabulary
# NOTE(review): token counts are taken over every row, i.e. before the
# train/test split that happens in a later cell.
max_vocab_size = 20000
all_tokens = [token for tokens in tokenized_texts for token in tokens]

# The most frequent words receive ids starting at 2; ids 0 and 1 are reserved
# for the padding and unknown-word entries added right after.
vocab = {
    word: word_id
    for word_id, (word, _count) in enumerate(
        Counter(all_tokens).most_common(max_vocab_size), start=2
    )
}
vocab.update({"<PAD>": 0, "<UNK>": 1})

def encode(tokens):
    """Convert a token list into a 1-D ``torch.long`` tensor of vocabulary ids.

    Ids are looked up in the module-level ``vocab``; a token that is not in it
    is mapped to id 1, the ``<UNK>`` entry.

    Args:
        tokens: Iterable of token strings.

    Returns:
        ``torch.Tensor`` of dtype ``torch.long`` with one id per token.
    """
    unk_id = 1
    ids = [vocab.get(token, unk_id) for token in tokens]
    return torch.tensor(ids, dtype=torch.long)

# One id tensor per document, in the same order as tokenized_texts.
encoded_texts = list(map(encode, tokenized_texts))

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Binary target column as a plain Python list.
labels = df['Best Match'].tolist()

#Train-Test Split
# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_texts, labels, random_state=42, test_size=0.2
)

# The id sequences are stored in object arrays, the labels in plain numeric arrays.
# NOTE(review): np.array(..., dtype=object) on a list of tensors presumably only
# stays 1-D because the sequences differ in length - confirm before relying on it.
X_train, X_test = (np.array(seqs, dtype=object) for seqs in (X_train, X_test))
y_train, y_test = map(np.array, (y_train, y_test))

class ResumeDataset(Dataset):
    """Dataset of (token-id sequence, label) pairs.

    Args:
        texts: Iterable of sliceable id sequences; each one is cut to its
            first ``max_len`` entries.
        labels: Sequence of numeric labels, stored as a ``float32`` tensor.
        max_len: Maximum number of ids kept per sequence.
    """

    def __init__(self, texts, labels, max_len=256):
        truncated = []
        for sequence in texts:
            truncated.append(sequence[:max_len])  # truncate
        self.texts = truncated
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at ``idx``."""
        sequence = self.texts[idx]
        label = self.labels[idx]
        return sequence, label

def collate_fn(batch):
    """Merge a list of ``(sequence, label)`` samples into one padded batch.

    Args:
        batch: Non-empty list of ``(1-D id tensor, scalar label tensor)`` pairs.

    Returns:
        Tuple ``(padded, labels)``: ``padded`` has one row per sample,
        right-padded with 0 to the longest sequence in the batch; ``labels``
        is the stack of the per-sample label tensors.
    """
    sequences, targets = map(list, zip(*batch))
    padded = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=0
    )
    stacked_targets = torch.stack(targets)
    return padded, stacked_targets

#DataLoaders
# Both loaders use the same batch size and the padding collate function; only
# the training loader shuffles.
loader_kwargs = dict(batch_size=2, collate_fn=collate_fn)

train_loader = DataLoader(ResumeDataset(X_train, y_train), shuffle=True, **loader_kwargs)

test_loader = DataLoader(ResumeDataset(X_test, y_test), **loader_kwargs)


In [None]:
#LSTM Model
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM with learned attention pooling for binary classification.

    Token id 0 is the padding id: its embedding is fixed to zero and padded
    positions receive no attention weight.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=3):
        super(LSTMClassifier, self).__init__()
        # The sizes come from the constructor arguments; previously they were
        # hard-coded (100 / 128 / 3) and the arguments were silently ignored.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
            num_layers=num_layers,
        )
        self.dropout = nn.Dropout(0.2)

        # Scores each time step for the attention pooling. The attribute name
        # is kept unchanged so parameter names in state_dict stay the same.
        self.attain = nn.Linear(hidden_dim * 2, 1)

        self.fc = nn.Linear(hidden_dim * 2, 512)
        self.output = nn.Linear(512, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one match probability per row of ``x``.

        Args:
            x: ``(batch, seq_len)`` tensor of token ids, padded with 0.

        Returns:
            ``(batch,)`` tensor of probabilities in ``[0, 1]``.
        """
        emb = self.embedding(x)
        out, _ = self.lstm(emb)                      # (batch, seq_len, 2 * hidden_dim)

        scores = self.attain(out)                    # (batch, seq_len, 1)
        # Exclude padded positions from the softmax. A large finite negative is
        # used instead of -inf so an all-padding row yields uniform weights
        # rather than NaN.
        pad_mask = (x == 0).unsqueeze(-1)
        scores = scores.masked_fill(pad_mask, torch.finfo(scores.dtype).min)
        attn_weights = torch.softmax(scores, dim=1)

        context_vector = torch.sum(attn_weights * out, dim=1)

        out = self.dropout(context_vector)
        out = F.relu(self.fc(out))
        out = self.output(out)

        # Drop only the trailing size-1 dim so a batch of one keeps shape (1,).
        return self.sigmoid(out).squeeze(-1)

In [None]:
#Training
# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(len(vocab), embed_dim=50, hidden_dim=64)
model = model.to(device)
# The model already outputs probabilities, hence BCELoss rather than a logits loss.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

#Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_x)
        # Probabilities of at least 0.5 count as the positive class.
        preds = (outputs >= 0.5).float()
        correct += preds.eq(batch_y).sum().item()
        total += batch_y.size(0)

print(f"Test Accuracy: {correct/total:.2f}")


Epoch 1, Loss: 0.6937
Epoch 2, Loss: 0.6944
Epoch 3, Loss: 0.6930
Epoch 4, Loss: 0.6875
Epoch 5, Loss: 0.6846
Epoch 6, Loss: 0.6830
Epoch 7, Loss: 0.6793
Epoch 8, Loss: 0.6860
Epoch 9, Loss: 0.6825
Epoch 10, Loss: 0.6796
Test Accuracy: 0.53


**Using BERT Transformers**

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [None]:
# Configuration for the BERT fine-tuning run.
MODEL_NAME = "bert-base-uncased"   # pretrained checkpoint to fine-tune
MAX_LEN = 256                      # tokens per example after truncation/padding
BATCH_SIZE = 8
EPOCHS = 3
# Fine-tuning a pretrained BERT needs a small step size: the BERT authors
# recommend 2e-5 to 5e-5. The previous 1e-3 is far above that range.
LR = 2e-5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Train-test split
# Raw combined strings and their labels, 80/20 with a fixed seed.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['combined_text'].to_list(),
    df['Best Match'].to_list(),
    random_state=42,
    test_size=0.2,
)

In [None]:
# Load the pretrained tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class ResumeJobDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of numeric labels, aligned with ``texts``.
        tokenizer: Callable accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors of shape ``(1, max_len)``.
        max_len: Length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return model-ready tensors.

        Returns:
            Dict with ``"input_ids"`` and ``"attention_mask"`` of shape
            ``(max_len,)`` and a scalar float ``"labels"`` tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Remove only the leading batch dim added by return_tensors="pt".
        # A bare squeeze() would also drop the sequence dim when max_len == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

In [None]:
# Wrap each split in a dataset that tokenizes on access.
train_dataset = ResumeJobDataset(
    train_texts, train_labels, tokenizer=tokenizer, max_len=MAX_LEN
)
test_dataset = ResumeJobDataset(
    test_texts, test_labels, tokenizer=tokenizer, max_len=MAX_LEN
)

# Only the training loader shuffles. These names replace the loaders that were
# created for the LSTM model.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

In [None]:
# A single output unit: the one logit per example is trained with a binary
# cross-entropy-with-logits loss in the training loop.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
model.to(DEVICE)

optimizer = AdamW(params=model.parameters(), lr=LR)
loss_fn = torch.nn.BCEWithLogitsLoss()


In [None]:
# Fine-tune for EPOCHS passes over the training loader and report the mean
# batch loss after each pass.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Drop only the trailing size-1 class dim so that a batch holding a
        # single example still has shape (1,), matching `labels`.
        logits = outputs.logits.squeeze(-1)

        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}")

In [None]:
# Evaluate on the held-out split: collect thresholded predictions and the true
# labels batch by batch, then print accuracy and the per-class report.
model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids, attention_mask=attention_mask)
        # squeeze(-1) keeps the batch dimension even for a batch of one, so the
        # arrays passed to extend() below are always 1-D and iterable.
        probs = torch.sigmoid(outputs.logits.squeeze(-1))
        predictions = (probs > 0.5).long()

        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print("Accuracy:", accuracy_score(true_labels, preds))
print(classification_report(true_labels, preds))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 1000/1000 [06:23<00:00,  2.61it/s]


Epoch 1 - Loss: 0.6940


Epoch 2/3: 100%|██████████| 1000/1000 [06:22<00:00,  2.61it/s]


Epoch 2 - Loss: 0.6826


Epoch 3/3: 100%|██████████| 1000/1000 [06:22<00:00,  2.61it/s]


Epoch 3 - Loss: 0.6781
Accuracy: 0.5275
              precision    recall  f1-score   support

         0.0       0.55      0.44      0.49      1023
         1.0       0.51      0.62      0.56       977

    accuracy                           0.53      2000
   macro avg       0.53      0.53      0.52      2000
weighted avg       0.53      0.53      0.52      2000

