# Classification Task: Initiating Data

In [1]:
import numpy
import pandas
import torch

import torch.nn as nn
import transformers
def generate_class_labels(labels):

    unique_labels = list(set(labels))

    unique_labels_map = {}

    for index, label in enumerate(unique_labels, start=0):
        unique_labels_map[label] = index

    return unique_labels_map

data_frame = pandas.read_csv("cleanedResumes.csv")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device('cpu')

data_frame.columns = 'label', 'sentence'

labels = data_frame['label']
label_map = generate_class_labels(labels)
data_frame['label'] = data_frame['label'].apply(lambda label_name: label_map[label_name])

data_frame.sample(10)

# Prepare sentences and labels
sentences = data_frame.sentence.values
# sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = data_frame.label.values

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas


In [2]:
data_frame

Unnamed: 0,label,sentence
0,4,qwtnrvduof education detail may 2013 to may 20...
1,4,qwtnrvduof area of interest deep learn control...
2,4,skill r python sap hana tableau sap hana sql s...
3,4,education detail mca ymcaust faridabad haryana...
4,4,skill c basic iot python matlab data science m...
...,...,...
183,23,skill set o window xp 7 8 8bntgbqlmkk1 10 data...
184,23,good logical and analytical skill positive att...
185,23,personal skill quick learner eagerness to lear...
186,13,core skill project program management agile sc...


In [3]:
# Tokenize sentences using BERT tokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

# Pad sequences and create attention masks
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for sequence in input_ids:
    sequence_mask = [float(id > 0) for id in sequence]
    attention_masks.append(sequence_mask)

# Split data into training and validation sets
from sklearn.model_selection import train_test_split
training_inputs, validation_inputs, training_labels, validation_labels, training_masks, validation_masks = train_test_split(
    input_ids, labels, attention_masks,
    random_state=2018, test_size=0.1
)

# Create DataLoader for training set
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32
training_data = TensorDataset(torch.tensor(training_inputs), torch.tensor(training_masks), torch.tensor(training_labels))
training_sampler = RandomSampler(training_data)
training_dataloader = DataLoader(training_data, sampler=training_sampler, batch_size=batch_size)

# Create DataLoader for validation set
validation_data = TensorDataset(torch.tensor(validation_inputs), torch.tensor(validation_masks), torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Configure BERT model for sequence classification
from transformers import BertConfig, BertModel
configuration = BertConfig()
model = BertModel(configuration)
configuration = model.config

from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                      num_labels=len(label_map.keys()))
model = nn.DataParallel(model)
model.to(device)

from transformers import AdamW
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)

def flat_accuracy(predicted_labels, labels):
    predicted_labels = numpy.argmax(predicted_labels.to('cpu').numpy(), axis=1).flatten()
    labels = labels.to('cpu').numpy().flatten()
    return numpy.sum(predicted_labels == labels) / len(labels)

# Train the BERT model
from tqdm import trange
epochs = 1000
training_losses = []

for epoch in trange(epochs, desc="Epoch"):
    model.train()
    training_loss = 0
    training_steps = 0

    for step, batch in enumerate(training_dataloader):
        inputs = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        training_loss += loss.item()
        training_steps += 1

        training_losses.append(loss.item())

    average_training_loss = training_loss/training_steps
    print("Epoch {}: Average Training Loss: {}".format(epoch+1, average_training_loss))

    model.eval()
    validation_accuracy = 0
    validation_steps = 0

    for batch in validation_dataloader:
        inputs = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(inputs, attention_mask=attention_masks, labels=labels)

        logits = outputs.logits
        temp_validation_accuracy = flat_accuracy(logits, labels)
        validation_accuracy += temp_validation_accuracy
        validation_steps += 1

    average_validation_accuracy = validation_accuracy/validation_steps
    print("Epoch {}: Validation Accuracy: {}".format(epoch+1, average_validation_accuracy))

data_frame = pandas.read_csv("out_of_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
sentences = data_frame.sentence.values
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = data_frame.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size)

# Evaluate the BERT model on the out-of-domain dataset
model.eval()
logits_set = []
labels_set = []

for batch in prediction_dataloader:
    batch_input_ids, batch_attention_masks, batch_labels = batch
    batch_input_ids, batch_attention_masks, batch_labels = batch_input_ids.to(device), batch_attention_masks.to(device), batch_labels.to(device)

    with torch.no_grad():
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        logits = outputs.logits

    logits_set.append(logits.cpu().numpy())
    labels_set.append(batch_labels.cpu().numpy())

from sklearn.metrics import matthews_corrcoef
matthews_set = []

# Calculate Matthews correlation coefficient for each batch
for i in range(len(labels_set)):
    mcc = matthews_corrcoef(labels_set[i], numpy.argmax(logits_set[i], axis=1).flatten())
    matthews_set.append(mcc)

for i, mcc in enumerate(matthews_set):
    print(f"Batch {i + 1}: MCC = {mcc}")

# Calculate the overall Matthews correlation coefficient
overall_mcc = numpy.mean(matthews_set)
print(f"\nOverall MCC: {overall_mcc}")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 1/1000 [00:03<1:04:05,  3.85s/it]

Epoch 1: Average Training Loss: 3.307697375615438
Epoch 1: Validation Accuracy: 0.10526315789473684


Epoch:   0%|          | 2/1000 [00:05<45:14,  2.72s/it]  

Epoch 2: Average Training Loss: 3.208211143811544
Epoch 2: Validation Accuracy: 0.05263157894736842


Epoch:   0%|          | 3/1000 [00:07<39:12,  2.36s/it]

Epoch 3: Average Training Loss: 3.1169822216033936
Epoch 3: Validation Accuracy: 0.10526315789473684


Epoch:   0%|          | 4/1000 [00:09<36:31,  2.20s/it]

Epoch 4: Average Training Loss: 3.1132096449534097
Epoch 4: Validation Accuracy: 0.05263157894736842


Epoch:   0%|          | 5/1000 [00:11<34:59,  2.11s/it]

Epoch 5: Average Training Loss: 2.9384714365005493
Epoch 5: Validation Accuracy: 0.05263157894736842


Epoch:   1%|          | 6/1000 [00:13<34:02,  2.05s/it]

Epoch 6: Average Training Loss: 2.8603967825571694
Epoch 6: Validation Accuracy: 0.05263157894736842


Epoch:   1%|          | 7/1000 [00:15<33:23,  2.02s/it]

Epoch 7: Average Training Loss: 2.8279240926106772
Epoch 7: Validation Accuracy: 0.05263157894736842


Epoch:   1%|          | 8/1000 [00:17<33:01,  2.00s/it]

Epoch 8: Average Training Loss: 2.714881261189779
Epoch 8: Validation Accuracy: 0.10526315789473684


Epoch:   1%|          | 9/1000 [00:19<32:45,  1.98s/it]

Epoch 9: Average Training Loss: 2.6096741755803428
Epoch 9: Validation Accuracy: 0.05263157894736842


Epoch:   1%|          | 10/1000 [00:21<32:36,  1.98s/it]

Epoch 10: Average Training Loss: 2.476803501447042
Epoch 10: Validation Accuracy: 0.15789473684210525


Epoch:   1%|          | 11/1000 [00:23<32:31,  1.97s/it]

Epoch 11: Average Training Loss: 2.4712359507878623
Epoch 11: Validation Accuracy: 0.15789473684210525


Epoch:   1%|          | 12/1000 [00:25<32:26,  1.97s/it]

Epoch 12: Average Training Loss: 2.3702671925226846
Epoch 12: Validation Accuracy: 0.21052631578947367


Epoch:   1%|▏         | 13/1000 [00:27<32:22,  1.97s/it]

Epoch 13: Average Training Loss: 2.2750934759775796
Epoch 13: Validation Accuracy: 0.3157894736842105


Epoch:   1%|▏         | 14/1000 [00:29<32:20,  1.97s/it]

Epoch 14: Average Training Loss: 2.2302366495132446
Epoch 14: Validation Accuracy: 0.10526315789473684


Epoch:   2%|▏         | 15/1000 [00:31<32:19,  1.97s/it]

Epoch 15: Average Training Loss: 2.178211828072866
Epoch 15: Validation Accuracy: 0.15789473684210525


Epoch:   2%|▏         | 16/1000 [00:33<32:18,  1.97s/it]

Epoch 16: Average Training Loss: 2.0813910563786826
Epoch 16: Validation Accuracy: 0.3157894736842105


Epoch:   2%|▏         | 17/1000 [00:35<32:22,  1.98s/it]

Epoch 17: Average Training Loss: 1.978749672571818
Epoch 17: Validation Accuracy: 0.2631578947368421


Epoch:   2%|▏         | 18/1000 [00:37<32:22,  1.98s/it]

Epoch 18: Average Training Loss: 1.8657977382342021
Epoch 18: Validation Accuracy: 0.2631578947368421


Epoch:   2%|▏         | 19/1000 [00:39<32:18,  1.98s/it]

Epoch 19: Average Training Loss: 1.6414345701535542
Epoch 19: Validation Accuracy: 0.3157894736842105


Epoch:   2%|▏         | 20/1000 [00:41<32:19,  1.98s/it]

Epoch 20: Average Training Loss: 1.5263442893822987
Epoch 20: Validation Accuracy: 0.3157894736842105


Epoch:   2%|▏         | 21/1000 [00:43<32:17,  1.98s/it]

Epoch 21: Average Training Loss: 1.4431722263495128
Epoch 21: Validation Accuracy: 0.3157894736842105


Epoch:   2%|▏         | 22/1000 [00:45<32:18,  1.98s/it]

Epoch 22: Average Training Loss: 1.4663065473238628
Epoch 22: Validation Accuracy: 0.2631578947368421


Epoch:   2%|▏         | 23/1000 [00:47<32:21,  1.99s/it]

Epoch 23: Average Training Loss: 1.2944801648457844
Epoch 23: Validation Accuracy: 0.3157894736842105


Epoch:   2%|▏         | 24/1000 [00:49<32:20,  1.99s/it]

Epoch 24: Average Training Loss: 1.2125499447186787
Epoch 24: Validation Accuracy: 0.3684210526315789


Epoch:   2%|▎         | 25/1000 [00:51<32:19,  1.99s/it]

Epoch 25: Average Training Loss: 1.181723674138387
Epoch 25: Validation Accuracy: 0.3684210526315789


Epoch:   3%|▎         | 26/1000 [00:53<32:18,  1.99s/it]

Epoch 26: Average Training Loss: 1.0745592415332794
Epoch 26: Validation Accuracy: 0.42105263157894735


Epoch:   3%|▎         | 27/1000 [00:55<32:16,  1.99s/it]

Epoch 27: Average Training Loss: 1.0695224304993947
Epoch 27: Validation Accuracy: 0.47368421052631576


Epoch:   3%|▎         | 28/1000 [00:57<32:18,  1.99s/it]

Epoch 28: Average Training Loss: 0.9554974337418874
Epoch 28: Validation Accuracy: 0.5263157894736842


Epoch:   3%|▎         | 29/1000 [00:59<32:18,  2.00s/it]

Epoch 29: Average Training Loss: 0.9707877536614736
Epoch 29: Validation Accuracy: 0.47368421052631576


Epoch:   3%|▎         | 30/1000 [01:01<32:21,  2.00s/it]

Epoch 30: Average Training Loss: 0.8765203058719635
Epoch 30: Validation Accuracy: 0.5263157894736842


Epoch:   3%|▎         | 31/1000 [01:03<32:22,  2.01s/it]

Epoch 31: Average Training Loss: 0.8320548931757609
Epoch 31: Validation Accuracy: 0.5263157894736842


Epoch:   3%|▎         | 32/1000 [01:05<32:27,  2.01s/it]

Epoch 32: Average Training Loss: 0.8874687453111013
Epoch 32: Validation Accuracy: 0.5263157894736842


Epoch:   3%|▎         | 33/1000 [01:07<32:28,  2.02s/it]

Epoch 33: Average Training Loss: 0.760360966126124
Epoch 33: Validation Accuracy: 0.5789473684210527


Epoch:   3%|▎         | 34/1000 [01:09<32:27,  2.02s/it]

Epoch 34: Average Training Loss: 0.7564536233743032
Epoch 34: Validation Accuracy: 0.47368421052631576


Epoch:   4%|▎         | 35/1000 [01:11<32:29,  2.02s/it]

Epoch 35: Average Training Loss: 0.6813815186421076
Epoch 35: Validation Accuracy: 0.42105263157894735


Epoch:   4%|▎         | 36/1000 [01:13<32:27,  2.02s/it]

Epoch 36: Average Training Loss: 0.6270691603422165
Epoch 36: Validation Accuracy: 0.42105263157894735


Epoch:   4%|▎         | 36/1000 [01:14<33:08,  2.06s/it]


KeyboardInterrupt: 

# OUT OF CODE

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
wn.synsets('prophetiqmadshiyn')

In [None]:
sample_lemmas

In [None]:
lemmatizer.lemmatize('deployed', 'v')

In [None]:
sample_tag

In [None]:
# Watch Cell
# print(clean_raw_text(sample_res))
# clean_raw_text(sample_res)
# nltk.corpus.words.raw().split('\n')
# np.isin(['detail'], nltk.corpus.words.raw().split('\n'))]
def x():
    x_dict = nltk.corpus.words.raw().split('\n')
    x_list = [levenshteinDistance('sklearn', word) for word in x_dict]
    
    id = x_list.index(min(x_list))
    print(id)
    print(min(x_list))
    print(nltk.corpus.words.raw().split('\n')[id])
    
x()

# lemmatizer.lemmatize('extracting')

In [None]:
from nltk.metrics.distance import edit_distance as test
test('aws', 'reductionqunsobcudt', substitution_cost=1)