In [1]:
VAR = {
    'max_len': 512,
    'batch_size': 16
}

In [2]:
import numpy
import pandas
import torch
import torch.nn as nn
from transformers import BertTokenizer, AdamW, BertForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas




# Classification Task: Initiating Data

In [3]:
from preprocessing_NLP import pipeline

resume_df = pipeline('UpdatedResumeDataSet_T1_7.csv', feature_name='Resume')
resume_df = resume_df.reset_index(drop=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def split_into_sub_length(resume: str, max_len=128):
    # Ensures all words are tokenized and analyzed;  Due to keras pad_sequences()
    resume_list = resume.split()
    
    resume_length = len(resume_list)
    
    splits = resume_length // max_len
    remaining_splits = resume_length % max_len
    
    resume_at_max_len = []

    for i in range(splits):
        resume_section = ' '.join(resume_list[i*max_len: (i+1)*max_len])
        # print(resume_list[i*max_len: (i+1)*max_len])
        resume_at_max_len.append(resume_section)
        
    final_section = ' '.join(resume_list[-1-remaining_splits:-1])
    resume_at_max_len.append(final_section)
    
    return resume_at_max_len

In [5]:
resume_df['Resume'] = resume_df['Resume'].apply(split_into_sub_length)
resume_df['Resume']

0      [qwtnrvduof education detail may 2013 may 2017...
1      [qwtnrvduof area interest deep learn control s...
2      [skill r python sap hana tableau sap hana sql ...
3      [education detail mca ymcaust faridabad haryan...
4      [skill c basic iot python matlab data science ...
                             ...                        
183    [skill set o window xp 7 8 8bntgbqlmkk1 10 dat...
184    [good logical analytical skill positive attitu...
185    [personal skill quick learner eagerness learn ...
186    [core skill project program management agile s...
187    [education detail february 2006 february 2006 ...
Name: Resume, Length: 188, dtype: object

In [6]:
resume_df = resume_df.explode('Resume')
resume_df

Unnamed: 0,Category,Resume
0,Data Science,qwtnrvduof education detail may 2013 may 2017 ...
0,Data Science,mixed attribute company matelabs
1,Data Science,qwtnrvduof area interest deep learn control sy...
1,Data Science,year month mathematics exprience less 1 year m...
2,Data Science,skill r python sap hana tableau sap hana sql s...
...,...,...
186,DevOps Engineer,various type test like system regression sanit...
186,DevOps Engineer,transition drill session forward reverse shado...
186,DevOps Engineer,action plan team prepare implement business co...
187,Business Analyst,education detail february 2006 february 2006 t...


In [7]:
type(resume_df['Category'].iloc[0])

str

In [8]:
encoder = LabelEncoder()

In [9]:
resume_df['Category'] = encoder.fit_transform(resume_df['Category'])
resume_df['Category'] = resume_df['Category'].astype(numpy.int64)

In [10]:
type(resume_df['Category'].iloc[0])

numpy.int64

In [11]:
resume_df['Category'].value_counts().max()

41

In [12]:
import random

# minimum = resume_df['Category'].value_counts().max()
minimum = 0
current_lowest = resume_df['Category'].value_counts().min()

#Check
count = resume_df['Category'].value_counts()
remaining = 7 - count[count<minimum]

while len(remaining != 0):
    count = resume_df['Category'].value_counts()
    remaining = 7 - count[count<minimum]

    for category in remaining.index:
        someInt = random.randint(0, current_lowest-1)
        value_to_append = resume_df[
            resume_df['Category']==category
            ]['Resume'].values[someInt]


        df_to_concat = pandas.DataFrame({
            'Category': [category],
            'Resume': [value_to_append]
        })

        resume_df = pandas.concat([resume_df, df_to_concat], axis=0)

In [13]:
resume_df['Category'].value_counts()

Category
7     41
18    39
6     35
8     31
15    27
4     27
13    25
2     25
21    23
9     22
17    21
5     18
10    17
12    16
19    16
1     15
11    15
16    15
0     14
23    14
14    14
20    14
3     13
24    12
22    11
Name: count, dtype: int64

In [14]:
resumes = resume_df.Resume.values
resumes = ["[CLS] " + resume + " [SEP]" for resume in resumes]
labels = resume_df.Category.values

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.tokenize(resume) for resume in resumes]

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=VAR['max_len'], dtype="long", truncating="post", padding="post")

attention_masks = []
for sequence in input_ids:
    sequence_mask = [float(id > 0) for id in sequence]
    attention_masks.append(sequence_mask)

In [15]:
training_inputs, validation_test_inputs, training_labels, validation_test_labels, training_masks, validation_test_masks = train_test_split(
    input_ids, labels, attention_masks,
    random_state=42, test_size=0.3
)

validation_inputs, testing_inputs, validation_labels, testing_labels, validation_masks, testing_masks = train_test_split(
    validation_test_inputs, validation_test_labels, validation_test_masks, random_state=42, test_size=0.3
)

training_data = TensorDataset(torch.tensor(training_inputs), torch.tensor(training_masks), torch.tensor(training_labels))
training_sampler = RandomSampler(training_data)
training_dataloader = DataLoader(training_data, sampler=training_sampler, batch_size=VAR['batch_size'])

validation_data = TensorDataset(torch.tensor(validation_inputs), torch.tensor(validation_masks), torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=VAR['batch_size'])


In [16]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(encoder.classes_))
model = nn.DataParallel(model)
model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def flat_accuracy(predicted_labels, labels):
    predicted_labels = numpy.argmax(predicted_labels.to('cpu').numpy(), axis=1).flatten()
    labels = labels.to('cpu').numpy().flatten()
    return numpy.sum(predicted_labels == labels) / len(labels)


In [18]:
from tqdm import trange
epochs = 30
random.seed(42)
torch.manual_seed(42)
training_losses = []

for epoch in trange(epochs, desc="Epoch"):
    model.train()
    training_loss = 0
    training_steps = 0

    for step, batch in enumerate(training_dataloader):
        inputs = batch[0].to(device)
        attention_masks = batch[1].to(device)
        # labels = labels.type(torch.LongTensor)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_masks, labels=labels)
        # print("TRAIN", outputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        training_loss += loss.item()
        training_steps += 1

        training_losses.append(loss.item())

    average_training_loss = training_loss/training_steps
    print("Epoch {}: Average Training Loss: {}".format(epoch+1, average_training_loss))

    model.eval()
    validation_accuracy = 0
    validation_steps = 0

    for batch in validation_dataloader:
        inputs = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(inputs, attention_mask=attention_masks, labels=labels)
            # print("VAL", outputs)

        logits = outputs.logits
        temp_validation_accuracy = flat_accuracy(logits, labels)
        validation_accuracy += temp_validation_accuracy
        validation_steps += 1

    average_validation_accuracy = validation_accuracy/validation_steps
    print("Epoch {}: Validation Accuracy: {}".format(epoch+1, average_validation_accuracy))

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1: Average Training Loss: 3.2221393377884575


Epoch:   3%|▎         | 1/30 [00:17<08:24, 17.38s/it]

Epoch 1: Validation Accuracy: 0.03571428571428571
Epoch 2: Average Training Loss: 3.1139644643534785


Epoch:   7%|▋         | 2/30 [00:32<07:29, 16.05s/it]

Epoch 2: Validation Accuracy: 0.10920329670329669
Epoch 3: Average Training Loss: 2.8511427900065547


Epoch:  10%|█         | 3/30 [00:47<07:05, 15.74s/it]

Epoch 3: Validation Accuracy: 0.08035714285714286
Epoch 4: Average Training Loss: 2.5134009174678638


Epoch:  13%|█▎        | 4/30 [01:02<06:39, 15.36s/it]

Epoch 4: Validation Accuracy: 0.25206043956043955
Epoch 5: Average Training Loss: 1.9874599757401838


Epoch:  17%|█▋        | 5/30 [01:17<06:22, 15.30s/it]

Epoch 5: Validation Accuracy: 0.4168956043956044
Epoch 6: Average Training Loss: 1.4805627154267353


Epoch:  20%|██        | 6/30 [01:33<06:13, 15.58s/it]

Epoch 6: Validation Accuracy: 0.45467032967032966
Epoch 7: Average Training Loss: 1.0492682120074397


Epoch:  23%|██▎       | 7/30 [01:49<05:57, 15.53s/it]

Epoch 7: Validation Accuracy: 0.5906593406593407
Epoch 8: Average Training Loss: 0.6988247607065283


Epoch:  27%|██▋       | 8/30 [02:04<05:40, 15.48s/it]

Epoch 8: Validation Accuracy: 0.6682692307692307
Epoch 9: Average Training Loss: 0.4524361219095147


Epoch:  30%|███       | 9/30 [02:20<05:29, 15.69s/it]

Epoch 9: Validation Accuracy: 0.6792582417582418
Epoch 10: Average Training Loss: 0.30694415193537006


Epoch:  33%|███▎      | 10/30 [02:36<05:12, 15.64s/it]

Epoch 10: Validation Accuracy: 0.6792582417582418
Epoch 11: Average Training Loss: 0.21349444661451422


Epoch:  37%|███▋      | 11/30 [02:53<05:03, 15.99s/it]

Epoch 11: Validation Accuracy: 0.6902472527472527
Epoch 12: Average Training Loss: 0.15424726741469424


Epoch:  40%|████      | 12/30 [03:08<04:44, 15.79s/it]

Epoch 12: Validation Accuracy: 0.6881868131868132
Epoch 13: Average Training Loss: 0.11895340972620508


Epoch:  43%|████▎     | 13/30 [03:23<04:26, 15.66s/it]

Epoch 13: Validation Accuracy: 0.6881868131868132
Epoch 14: Average Training Loss: 0.10016696576191031


Epoch:  47%|████▋     | 14/30 [03:40<04:12, 15.80s/it]

Epoch 14: Validation Accuracy: 0.6971153846153847
Epoch 15: Average Training Loss: 0.08488919893684595


Epoch:  50%|█████     | 15/30 [03:56<03:59, 15.96s/it]

Epoch 15: Validation Accuracy: 0.6971153846153847
Epoch 16: Average Training Loss: 0.07322068528636642


Epoch:  53%|█████▎    | 16/30 [04:10<03:34, 15.32s/it]

Epoch 16: Validation Accuracy: 0.6971153846153847
Epoch 17: Average Training Loss: 0.06544263353166373


Epoch:  57%|█████▋    | 17/30 [04:24<03:16, 15.12s/it]

Epoch 17: Validation Accuracy: 0.6971153846153847
Epoch 18: Average Training Loss: 0.05834278464317322


Epoch:  60%|██████    | 18/30 [04:38<02:57, 14.81s/it]

Epoch 18: Validation Accuracy: 0.7081043956043956
Epoch 19: Average Training Loss: 0.05840563531155172


Epoch:  63%|██████▎   | 19/30 [04:53<02:42, 14.76s/it]

Epoch 19: Validation Accuracy: 0.6971153846153847
Epoch 20: Average Training Loss: 0.050425378848677094


Epoch:  67%|██████▋   | 20/30 [05:09<02:31, 15.12s/it]

Epoch 20: Validation Accuracy: 0.7081043956043956
Epoch 21: Average Training Loss: 0.04635015340602916


Epoch:  70%|███████   | 21/30 [05:25<02:18, 15.41s/it]

Epoch 21: Validation Accuracy: 0.717032967032967
Epoch 22: Average Training Loss: 0.04169160467774972


Epoch:  73%|███████▎  | 22/30 [05:40<02:02, 15.26s/it]

Epoch 22: Validation Accuracy: 0.717032967032967
Epoch 23: Average Training Loss: 0.03925108723342419


Epoch:  77%|███████▋  | 23/30 [05:55<01:46, 15.16s/it]

Epoch 23: Validation Accuracy: 0.717032967032967
Epoch 24: Average Training Loss: 0.036814695224165916


Epoch:  80%|████████  | 24/30 [06:10<01:31, 15.22s/it]

Epoch 24: Validation Accuracy: 0.717032967032967
Epoch 25: Average Training Loss: 0.03443587644268637


Epoch:  83%|████████▎ | 25/30 [06:25<01:15, 15.11s/it]

Epoch 25: Validation Accuracy: 0.717032967032967
Epoch 26: Average Training Loss: 0.03253409922446893


Epoch:  87%|████████▋ | 26/30 [06:40<01:00, 15.11s/it]

Epoch 26: Validation Accuracy: 0.717032967032967
Epoch 27: Average Training Loss: 0.03039335791507493


Epoch:  90%|█████████ | 27/30 [06:55<00:45, 15.07s/it]

Epoch 27: Validation Accuracy: 0.717032967032967
Epoch 28: Average Training Loss: 0.02922360258905784


Epoch:  93%|█████████▎| 28/30 [07:10<00:30, 15.04s/it]

Epoch 28: Validation Accuracy: 0.717032967032967
Epoch 29: Average Training Loss: 0.02767333048193351


Epoch:  97%|█████████▋| 29/30 [07:25<00:14, 15.00s/it]

Epoch 29: Validation Accuracy: 0.717032967032967
Epoch 30: Average Training Loss: 0.026497313266862995


Epoch: 100%|██████████| 30/30 [07:41<00:00, 15.39s/it]

Epoch 30: Validation Accuracy: 0.717032967032967





In [19]:
input_ids = torch.tensor(testing_inputs)
attention_masks = torch.tensor(testing_masks)
labels = torch.tensor(testing_labels)

prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_dataloader = DataLoader(prediction_data, batch_size=VAR['batch_size'])

model.eval()
logits_set = []
labels_set = []

for batch in prediction_dataloader:
    batch_input_ids, batch_attention_masks, batch_labels = batch
    batch_input_ids, batch_attention_masks, batch_labels = batch_input_ids.to(device), batch_attention_masks.to(device), batch_labels.to(device)

    with torch.no_grad():
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        logits = outputs.logits

    logits_set.append(logits.cpu().numpy())
    labels_set.append(batch_labels.cpu().numpy())

In [20]:
from sklearn.metrics import matthews_corrcoef
matthews_set = []

# Calculate Matthews correlation coefficient for each batch
for i in range(len(labels_set)):
    mcc = matthews_corrcoef(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten())
    matthews_set.append(mcc)

for i, mcc in enumerate(matthews_set):
    print(f"Batch {i + 1}: MCC = {mcc}")

# Calculate the overall Matthews correlation coefficient
overall_mcc = numpy.mean(matthews_set)
print(f"\nOverall MCC: {overall_mcc}")

Batch 1: MCC = 0.7982763270115342
Batch 2: MCC = 0.9345067267136572
Batch 3: MCC = 0.58

Overall MCC: 0.7709276845750638


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []


for i in range(len(labels_set)):
    acc = accuracy_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten())
    accuracy_scores.append(acc)
    
    precision = precision_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten(), average='weighted', zero_division=0)
    precision_scores.append(precision)
    
    recall = recall_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten(), average='weighted', zero_division=0)
    recall_scores.append(recall)
    
    f1_scoring = f1_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten(), average='weighted', zero_division=0)
    f1_scores.append(f1_scoring)
    
mean_accuracy = numpy.mean(accuracy_scores)
mean_precision = numpy.mean(precision_scores)
mean_recall = numpy.mean(recall_scores)
mean_f1 = numpy.mean(f1_scores)

print(f'{mean_accuracy=}, {mean_precision=}, {mean_recall=}, {mean_f1=}')

mean_accuracy=0.7833333333333333, mean_precision=0.7625000000000001, mean_recall=0.7833333333333333, mean_f1=0.762037037037037


In [22]:
%timeit model(batch_input_ids, attention_mask=batch_attention_masks)

217 ms ± 80.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
batch_input_ids.shape

torch.Size([15, 512])