In [1]:
VAR = {
    'max_len': 128,
    'batch_size': 64
}

In [2]:
import numpy
import pandas
import torch
import torch.nn as nn
from transformers import BertTokenizer, AdamW, BertForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas




# Classification Task: Initiating Data

In [3]:
from preprocessing_NLP import pipeline

resume_df = pipeline('UpdatedResumeDataSet_T1_7.csv', feature_name='Resume')
resume_df = resume_df.reset_index(drop=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def split_into_sub_length(resume: str, max_len=128):
    # Ensures all words are tokenized and analyzed;  Due to keras pad_sequences()
    resume_list = resume.split()
    
    resume_length = len(resume_list)
    
    splits = resume_length // max_len
    remaining_splits = resume_length % max_len
    
    resume_at_max_len = []

    for i in range(splits):
        resume_section = ' '.join(resume_list[i*max_len: (i+1)*max_len])
        # print(resume_list[i*max_len: (i+1)*max_len])
        resume_at_max_len.append(resume_section)
        
    final_section = ' '.join(resume_list[-1-remaining_splits:-1])
    resume_at_max_len.append(final_section)
    
    return resume_at_max_len

In [5]:
resume_df['Resume'] = resume_df['Resume'].apply(split_into_sub_length)
resume_df['Resume']

0      [qwtnrvduof education detail may 2013 may 2017...
1      [qwtnrvduof area interest deep learn control s...
2      [skill r python sap hana tableau sap hana sql ...
3      [education detail mca ymcaust faridabad haryan...
4      [skill c basic iot python matlab data science ...
                             ...                        
183    [skill set o window xp 7 8 8bntgbqlmkk1 10 dat...
184    [good logical analytical skill positive attitu...
185    [personal skill quick learner eagerness learn ...
186    [core skill project program management agile s...
187    [education detail february 2006 february 2006 ...
Name: Resume, Length: 188, dtype: object

In [6]:
resume_df = resume_df.explode('Resume')
resume_df

Unnamed: 0,Category,Resume
0,Data Science,qwtnrvduof education detail may 2013 may 2017 ...
0,Data Science,mixed attribute company matelabs
1,Data Science,qwtnrvduof area interest deep learn control sy...
1,Data Science,year month mathematics exprience less 1 year m...
2,Data Science,skill r python sap hana tableau sap hana sql s...
...,...,...
186,DevOps Engineer,various type test like system regression sanit...
186,DevOps Engineer,transition drill session forward reverse shado...
186,DevOps Engineer,action plan team prepare implement business co...
187,Business Analyst,education detail february 2006 february 2006 t...


In [7]:
type(resume_df['Category'].iloc[0])

str

In [8]:
encoder = LabelEncoder()

In [9]:
resume_df['Category'] = encoder.fit_transform(resume_df['Category'])
resume_df['Category'] = resume_df['Category'].astype(numpy.int64)

In [10]:
type(resume_df['Category'].iloc[0])

numpy.int64

In [11]:
resume_df['Category'].value_counts().max()

41

In [12]:
import random

# minimum = resume_df['Category'].value_counts().max()
minimum = 0
current_lowest = resume_df['Category'].value_counts().min()

#Check
count = resume_df['Category'].value_counts()
remaining = 7 - count[count<minimum]

while len(remaining != 0):
    count = resume_df['Category'].value_counts()
    remaining = 7 - count[count<minimum]

    for category in remaining.index:
        someInt = random.randint(0, current_lowest-1)
        value_to_append = resume_df[
            resume_df['Category']==category
            ]['Resume'].values[someInt]


        df_to_concat = pandas.DataFrame({
            'Category': [category],
            'Resume': [value_to_append]
        })

        resume_df = pandas.concat([resume_df, df_to_concat], axis=0)

In [13]:
resume_df['Category'].value_counts()

Category
7     41
18    39
6     35
8     31
15    27
4     27
13    25
2     25
21    23
9     22
17    21
5     18
10    17
12    16
19    16
1     15
11    15
16    15
0     14
23    14
14    14
20    14
3     13
24    12
22    11
Name: count, dtype: int64

In [14]:
resume_df = resume_df.reset_index(drop=True)
test_df = resume_df.sample(100, random_state=42)
resume_df = resume_df.drop(test_df.index)

test_df = test_df.reset_index(drop=True)
resume_df = resume_df.reset_index(drop=True)

### Augmentation

In [15]:
import nlpaug.flow as naf
import nlpaug.augmenter.word as naw
from nlpaug.util.file.download import DownloadUtil
# DownloadUtil.download_word2vec(dest_dir='.')

In [16]:
flow = naf.Sequential([
    # naw.WordEmbsAug(model_type='word2vec', model_path='GoogleNews-vectors-negative300.bin',
    #                 action="substitute", aug_p=0.2),
    # naw.WordEmbsAug(model_type='word2vec', model_path='GoogleNews-vectors-negative300.bin',
    #                 action="insert", aug_p=0.1),
    # naw.RandomWordAug(action="swap", aug_p=0.5),
    # naw.RandomWordAug(action="delete", aug_p=0.1),
    
    naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
])

In [17]:
resume_df['Resume'] = resume_df['Resume'].apply(lambda x: flow.augment(x, n=3))

In [18]:
resume_df = resume_df.explode('Resume')

In [19]:
resume_df.dropna(inplace=True)

resume_df

Unnamed: 0,Category,Resume
0,6,mixed dimension company matelabs
0,6,miscellaneous attribute company matelabs
0,6,miscellaneous attribute caller matelabs
1,6,year month mathematics exprience le 1 twelvemo...
1,6,year month mathematics exprience less one year...
...,...,...
418,8,action program team prepare implement business...
418,8,activity plan squad prepare implement business...
419,4,education detail february 2006 february 2006 t...
419,4,education detail february 2006 february 2006 t...


In [20]:
resumes = test_df.Resume.values
resumes = ["[CLS] " + resume + " [SEP]" for resume in resumes]
testing_labels = test_df.Category.values

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.tokenize(resume) for resume in resumes]

testing_inputs = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
testing_inputs = pad_sequences(testing_inputs, maxlen=VAR['max_len'], dtype="long", truncating="post", padding="post")

testing_masks = []
for sequence in testing_inputs:
    sequence_mask = [float(id > 0) for id in sequence]
    testing_masks.append(sequence_mask)

In [21]:
resumes = resume_df.Resume.values
resumes = ["[CLS] " + resume + " [SEP]" for resume in resumes]
labels = resume_df.Category.values

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.tokenize(resume) for resume in resumes]

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=VAR['max_len'], dtype="long", truncating="post", padding="post")

attention_masks = []
for sequence in input_ids:
    sequence_mask = [float(id > 0) for id in sequence]
    attention_masks.append(sequence_mask)

In [22]:
training_inputs, validation_inputs, training_labels, validation_labels, training_masks, validation_masks = train_test_split(
    input_ids, labels, attention_masks,
    random_state=42, test_size=0.3
)

training_data = TensorDataset(torch.tensor(training_inputs), torch.tensor(training_masks), torch.tensor(training_labels))
training_sampler = RandomSampler(training_data)
training_dataloader = DataLoader(training_data, sampler=training_sampler, batch_size=VAR['batch_size'])

validation_data = TensorDataset(torch.tensor(validation_inputs), torch.tensor(validation_masks), torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=VAR['batch_size'])


In [23]:
from transformers import BertConfig, BertModel
configuration = BertConfig()
model = BertModel(configuration)
configuration = model.config

from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(encoder.classes_))
model = nn.DataParallel(model)
model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
def flat_accuracy(predicted_labels, labels):
    predicted_labels = numpy.argmax(predicted_labels.to('cpu').numpy(), axis=1).flatten()
    labels = labels.to('cpu').numpy().flatten()
    return numpy.sum(predicted_labels == labels) / len(labels)


In [25]:
from tqdm import trange
epochs = 10
random.seed(42)
torch.manual_seed(42)
training_losses = []

for epoch in trange(epochs, desc="Epoch"):
    model.train()
    training_loss = 0
    training_steps = 0

    for step, batch in enumerate(training_dataloader):
        inputs = batch[0].to(device)
        attention_masks = batch[1].to(device)
        # labels = labels.type(torch.LongTensor)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_masks, labels=labels)
        # print("TRAIN", outputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        training_loss += loss.item()
        training_steps += 1

        training_losses.append(loss.item())

    average_training_loss = training_loss/training_steps
    print("Epoch {}: Average Training Loss: {}".format(epoch+1, average_training_loss))

    model.eval()
    validation_accuracy = 0
    validation_steps = 0

    for batch in validation_dataloader:
        inputs = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(inputs, attention_mask=attention_masks, labels=labels)
            # print("VAL", outputs)

        logits = outputs.logits
        temp_validation_accuracy = flat_accuracy(logits, labels)
        validation_accuracy += temp_validation_accuracy
        validation_steps += 1

    average_validation_accuracy = validation_accuracy/validation_steps
    print("Epoch {}: Validation Accuracy: {}".format(epoch+1, average_validation_accuracy))

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1: Average Training Loss: 3.1998044763292586


Epoch:  10%|█         | 1/10 [00:05<00:51,  5.70s/it]

Epoch 1: Validation Accuracy: 0.09827302631578948
Epoch 2: Average Training Loss: 2.985216804913112


Epoch:  20%|██        | 2/10 [00:11<00:45,  5.63s/it]

Epoch 2: Validation Accuracy: 0.1695449561403509
Epoch 3: Average Training Loss: 2.6669307436261858


Epoch:  30%|███       | 3/10 [00:16<00:39,  5.64s/it]

Epoch 3: Validation Accuracy: 0.23789290935672514
Epoch 4: Average Training Loss: 2.3292363711765836


Epoch:  40%|████      | 4/10 [00:22<00:33,  5.63s/it]

Epoch 4: Validation Accuracy: 0.35667945906432746
Epoch 5: Average Training Loss: 1.9099662474223547


Epoch:  50%|█████     | 5/10 [00:28<00:28,  5.62s/it]

Epoch 5: Validation Accuracy: 0.4822733918128655
Epoch 6: Average Training Loss: 1.546489179134369


Epoch:  60%|██████    | 6/10 [00:33<00:22,  5.60s/it]

Epoch 6: Validation Accuracy: 0.6355537280701754
Epoch 7: Average Training Loss: 1.205161018030984


Epoch:  70%|███████   | 7/10 [00:39<00:16,  5.59s/it]

Epoch 7: Validation Accuracy: 0.7562591374269005
Epoch 8: Average Training Loss: 0.8848496462617602


Epoch:  80%|████████  | 8/10 [00:44<00:11,  5.58s/it]

Epoch 8: Validation Accuracy: 0.8529331140350878
Epoch 9: Average Training Loss: 0.6292311506611961


Epoch:  90%|█████████ | 9/10 [00:50<00:05,  5.60s/it]

Epoch 9: Validation Accuracy: 0.9280884502923977
Epoch 10: Average Training Loss: 0.3987915962934494


Epoch: 100%|██████████| 10/10 [00:56<00:00,  5.61s/it]

Epoch 10: Validation Accuracy: 0.9352613304093568





In [26]:
input_ids = torch.tensor(testing_inputs)
attention_masks = torch.tensor(testing_masks)
labels = torch.tensor(testing_labels)

prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_dataloader = DataLoader(prediction_data, batch_size=VAR['batch_size'])

model.eval()
logits_set = []
labels_set = []

for batch in prediction_dataloader:
    batch_input_ids, batch_attention_masks, batch_labels = batch
    batch_input_ids, batch_attention_masks, batch_labels = batch_input_ids.to(device), batch_attention_masks.to(device), batch_labels.to(device)

    with torch.no_grad():
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        logits = outputs.logits

    logits_set.append(logits.cpu().numpy())
    labels_set.append(batch_labels.cpu().numpy())

In [27]:
from sklearn.metrics import matthews_corrcoef
matthews_set = []

# Calculate Matthews correlation coefficient for each batch
for i in range(len(labels_set)):
    mcc = matthews_corrcoef(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten())
    matthews_set.append(mcc)

for i, mcc in enumerate(matthews_set):
    print(f"Batch {i + 1}: MCC = {mcc}")

# Calculate the overall Matthews correlation coefficient
overall_mcc = numpy.mean(matthews_set)
print(f"\nOverall MCC: {overall_mcc}")

Batch 1: MCC = 0.6412299198109214
Batch 2: MCC = 0.7435707691490477

Overall MCC: 0.6924003444799846


In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []


for i in range(len(labels_set)):
    acc = accuracy_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten())
    accuracy_scores.append(acc)
    
    precision = precision_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten(), average='weighted', zero_division=0)
    precision_scores.append(precision)
    
    recall = recall_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten(), average='weighted', zero_division=0)
    recall_scores.append(recall)
    
    f1_scoring = f1_score(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten(), average='weighted', zero_division=0)
    f1_scores.append(f1_scoring)
    
mean_accuracy = numpy.mean(accuracy_scores)
mean_precision = numpy.mean(precision_scores)
mean_recall = numpy.mean(recall_scores)
mean_f1 = numpy.mean(f1_scores)

print(f'{mean_accuracy=}, {mean_precision=}, {mean_recall=}, {mean_f1=}')

mean_accuracy=0.703125, mean_precision=0.7702763310185186, mean_recall=0.703125, mean_f1=0.6946201849847684
