In [5]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

In [15]:
df=pd.read_csv("final.csv")
df.head(2)


Unnamed: 0,Sentences,Labels
0,The Inside Services Sales Manager will work on...,job_functions
1,The candidate will possess knowledge of the so...,required_qualifications


In [16]:
unique_labels_count = df['Labels'].nunique()

# Alternatively, to see the count of each unique label
unique_labels_frequency = df['Labels'].value_counts()

print(f"Number of unique labels: {unique_labels_count}")
print("Frequency of each label:")
print(unique_labels_frequency)

Number of unique labels: 6
Frequency of each label:
job_functions              46392
required_qualifications    40209
about_company              17185
benefits                    9408
recruiting_process          6761
equal_opportunity           3520
Name: Labels, dtype: int64


In [17]:
start=0
texts=df["Sentences"].values[:100]
labels=df["Labels"].values[:100]

In [18]:

# Convert labels to numerical values (0, 1, 2, 3)
label_mapping = {label: idx for idx, label in enumerate(set(labels))}
numeric_labels = [label_mapping[label] for label in labels]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, numeric_labels, test_size=0.2, random_state=42,stratify=numeric_labels)

# Create a CountVectorizer to convert text data into a bag-of-words representation
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [19]:
label_mapping.keys()

dict_keys(['job_functions', 'about_company', 'equal_opportunity', 'required_qualifications', 'benefits'])

In [25]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))

# Tokenize and encode the training and testing data
X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, return_tensors="pt")
X_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, return_tensors="pt")

# Convert labels to PyTorch tensors
y_train_tensor = torch.tensor(y_train)
y_test_tensor = torch.tensor(y_test)

# Create DataLoader for training and testing data
train_dataset = TensorDataset(X_train_tokens.input_ids, X_train_tokens.attention_mask, y_train_tensor)
test_dataset = TensorDataset(X_test_tokens.input_ids, X_test_tokens.attention_mask, y_test_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Set up GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Set up optimizer and training parameters
optimizer = AdamW(model.parameters(), lr=2e-5)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [46]:
device

device(type='cuda')

In [47]:
# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [21]:
# Evaluation function
device='cpu'
def evaluate_model(model, dataloader):
    model.eval()  # Set model to evaluation mode
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Move logits and labels to CPU if necessary
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()

            predictions.extend(np.argmax(logits, axis=1).flatten())
            true_labels.extend(label_ids.flatten())

    return predictions, true_labels


In [22]:

# Perform prediction on the test dataset
test_predictions, test_true_labels = evaluate_model(model, test_dataloader)

# Calculate accuracy
accuracy = accuracy_score(test_true_labels, test_predictions)
print("Accuracy:", accuracy)


print("Classification Report:")
print(classification_report(test_true_labels, test_predictions, target_names=label_mapping.keys()))

Accuracy: 0.2
Classification Report:
                         precision    recall  f1-score   support

          job_functions       0.00      0.00      0.00        12
          about_company       0.00      0.00      0.00         1
      equal_opportunity       0.00      0.00      0.00         1
required_qualifications       0.20      1.00      0.33         4
               benefits       0.00      0.00      0.00         2

               accuracy                           0.20        20
              macro avg       0.04      0.20      0.07        20
           weighted avg       0.04      0.20      0.07        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
import json
import nltk
from nltk.tokenize import sent_tokenize
import torch

def predict(paragraph, model, tokenizer, label_mapping, confidence_threshold=0.3, extra_label='extra'):
    sentences = sent_tokenize(paragraph)
    result_dict = {label: [] for label in label_mapping.keys()}
    result_dict[extra_label] = []

    for sentence in sentences:
        encoded_sentence = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")
        input_ids = encoded_sentence.input_ids.to(device)
        attention_mask = encoded_sentence.attention_mask.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_label_index = torch.argmax(probabilities, dim=1).item()
            predicted_label = list(label_mapping.keys())[predicted_label_index]
            confidence_score = probabilities[0][predicted_label_index].item()

            if confidence_score >= confidence_threshold:
                result_dict[predicted_label].append(sentence)
            else:
                result_dict[extra_label].append(sentence)

    result_json = json.dumps(result_dict, indent=2)
    return result_json


In [26]:
# Save the model
# torch.save(model.state_dict(), 'bert_model.pth')

device='cpu'
# Load the model
loaded_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))
loaded_model.load_state_dict(torch.load('bert_model.pth'))
loaded_model.eval()  # Make sure to set the model to evaluation mode after loading
loaded_model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [27]:

# Perform prediction on the test dataset
test_predictions, test_true_labels = evaluate_model(loaded_model, test_dataloader)

# Calculate accuracy
accuracy = accuracy_score(test_true_labels, test_predictions)
print("Accuracy:", accuracy)


print("Classification Report:")
print(classification_report(test_true_labels, test_predictions, target_names=label_mapping.keys()))

Accuracy: 0.35
Classification Report:
                         precision    recall  f1-score   support

          job_functions       0.56      0.42      0.48        12
          about_company       0.00      0.00      0.00         1
      equal_opportunity       0.00      0.00      0.00         1
required_qualifications       0.67      0.50      0.57         4
               benefits       0.00      0.00      0.00         2

               accuracy                           0.35        20
              macro avg       0.24      0.18      0.21        20
           weighted avg       0.47      0.35      0.40        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
from time import time
start = time()
paragraph = "Primary Duties and Responsibilities: The Part-Time Associate reports to the Store Manager and performs management-assigned duties, which may include but may not be limited to receiving inventory, stocking and maintaining a store section, operating a cash register, carry-out services and store cleaning. Specific duties include, but are not limited to: Maintain a store section as assigned and may order, receive and stock merchandise using proper equipment, assures proper pricing on merchandise, checks product date codes to validate proper rotation, stocks shelves, end displays, floor stacks, displays and refrigerated/frozen cases. May prepare perishable products for sale. Sets up advertising/promotional displays. Reviews invoices/bills for accuracy. Cleans and maintains sanitation standards in all interior and exterior areas of store and parking lot as directed by store management. Operates a cash register, receives payment from customers, and makes change or processes check/charge/debit transactions. Processes cash and/or credit refunds, when approved by management. Verifies customer eligibility when selling alcoholic beverages. Offers friendly, knowledgeable, efficient and courteous assistance to customers by providing them with current store and product information. When requested, loads customer purchases by assisting customers to their vehicles. Performs basic bookkeeping duties. Records lost/damaged goods and store supplies using appropriate tools. Candidates must possess the following skills: Ability to read and write English, interact with general public and co-workers. Ability to read and comprehend simple instructions, product labels, product pricing codes, shelf tags, short correspondence, and memos. Ability to write simple correspondence. Ability to effectively present information in one-on-one and small group situations to customers, clients, and other employees of the organization. Ability to apply common sense understanding to carry out detailed but uninvolved written or oral instructions. The employee must regularly lift and/or move up to 25 pounds, frequently lift and/or move up to 45 pounds, and occasionally lift and/or move up to 60 pounds. Basic PC/Outlook skills preferred. Required Qualifications: Retail Management Certificate is desirable. Our company provides equal employment opportunities (EEO) to all employees and applicants for employment without regard to race, color, religion, sex, national origin, age, disability or genetics."
result = predict(paragraph, loaded_model, tokenizer, label_mapping)
end = time()
print(end-start)
print(result)



1.791938304901123
{
  "job_functions": [
    "Primary Duties and Responsibilities: The Part-Time Associate reports to the Store Manager and performs management-assigned duties, which may include but may not be limited to receiving inventory, stocking and maintaining a store section, operating a cash register, carry-out services and store cleaning.",
    "Specific duties include, but are not limited to: Maintain a store section as assigned and may order, receive and stock merchandise using proper equipment, assures proper pricing on merchandise, checks product date codes to validate proper rotation, stocks shelves, end displays, floor stacks, displays and refrigerated/frozen cases.",
    "Operates a cash register, receives payment from customers, and makes change or processes check/charge/debit transactions.",
    "Offers friendly, knowledgeable, efficient and courteous assistance to customers by providing them with current store and product information.",
    "The employee must regular