In [2]:
#Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification, pipeline
from torch.utils.data import DataLoader, Dataset
import torch
import pickle



In [4]:
# Step 2: Load and Prepare Data
# Load data from JSON files (Entity Recognition in Resumes, IT Job Descriptions, Salaries).
with open('data/json/Entity Recognition in Resumes.json', 'r') as file:
    resume_data = json.load(file)

with open('data/json/IT Job Desc Annotated Detailed.json', 'r') as file:
    job_desc_data = json.load(file)

with open('/data/json/salaries.json', 'r') as file:
    salaries_data = json.load(file)

JSONDecodeError: Extra data: line 2 column 1 (char 3520)

In [None]:
# Step 3: Exploratory Data Analysis (EDA)
# Inspecting the structure of each dataset and combining relevant information for modeling.
print("Number of resumes loaded:", len(resume_data))
print("Number of job descriptions loaded:", len(job_desc_data))
print("Number of salary records loaded:", len(salaries_data))

In [None]:
# Step 4: Preprocessing Data
# Create a dataset that combines resume information, job descriptions, and salary insights.
def preprocess_resume_data(resume_data):
    processed_data = []
    for entry in resume_data:
        processed_data.append({
            'text': entry['content'],
            'skills': ' '.join([annot['text'] for annot in entry['annotation'] if annot['label'] == ['Skills']])
        })
    return processed_data

resume_dataset = preprocess_resume_data(resume_data)

# Convert to DataFrame for easier handling.
resume_df = pd.DataFrame(resume_dataset)
job_desc_df = pd.DataFrame(job_desc_data['annotations'])
salaries_df = pd.DataFrame(salaries_data)

In [None]:
# Step 5: Train-Test Split
# Use the resume and job description data for building an NLP model.
X = resume_df['text'] + ' ' + resume_df['skills']
y = job_desc_df['IT SKILLS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 6: Tokenization and Model Preparation
# We will use DistilBERT for tokenizing and creating embeddings for the text.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = CustomDataset(X_train, y_train, tokenizer, max_length=128)
test_dataset = CustomDataset(X_test, y_test, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
# Step 7: Model Training with DistilBERT
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(set(y)))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

# Adding a progress bar for training.
epochs = 3
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=loss.item())

In [None]:
# Step 8: Evaluate the Model
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("Classification Report:")
print(classification_report(all_labels, all_predictions))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_predictions))

In [None]:
# Step 9: Integration with RAG
# Implementing a RAG model to enhance the retrieval capabilities during job requirement analysis.
retriever = pipeline('question-answering', model='distilbert-base-uncased', tokenizer=tokenizer)

def retrieve_information(question, context):
    result = retriever(question=question, context=context)
    return result['answer']

# Example of using RAG to find specific details in job descriptions.
example_question = "What are the required IT skills for this job?"
example_context = X_test.iloc[0]
print("RAG Answer:", retrieve_information(example_question, example_context))

In [None]:
# Step 10: Ensemble Model
# Combine DistilBERT and RandomForest for better performance.
# Get DistilBERT embeddings for RandomForest model.
def get_embeddings(texts, tokenizer, model):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in texts:
            encoding = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=128
            )
            output = model(**encoding)
            embeddings.append(output.last_hidden_state.mean(1).squeeze().cpu().numpy())
    return np.array(embeddings)

X_train_embeddings = get_embeddings(X_train, tokenizer, model)
X_test_embeddings = get_embeddings(X_test, tokenizer, model)

# Train RandomForest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_embeddings, y_train)

# Evaluate RandomForest
rf_predictions = rf_model.predict(X_test_embeddings)
print("Random Forest Ensemble Accuracy:", accuracy_score(y_test, rf_predictions))

In [None]:
# Step 11: Conclusion and Next Steps
# - The model utilizes both BERT and Random Forest as an ensemble.
# - We successfully implemented a retrieval-augmented generation (RAG) for more interactive requirement analysis.
# - Further work could include hyperparameter tuning, experimenting with other transformer-based models (like BERT-large), and expanding dataset diversity.

# Save the final model
with open('recruitment_nlp_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print("Model saved successfully.")