In [42]:
## import the libs ##
import pandas as pd
import numpy as np

import torch

from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset
from torch import cuda
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from tqdm import tqdm

!pip install accelerate -U
!pip install sentencepiece



In [23]:
## pick the GPU when torch can see one, otherwise fall back to the CPU ##
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [24]:
## Fine-tune the pre-trained bert-base-uncased checkpoint on our custom data.
## The checkpoint is already pre-trained on a large corpus, so the expectation is
## that a comparatively small labelled set is enough to get good performance.

from transformers import BertTokenizer, BertForSequenceClassification

# tokenizer that matches the checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# checkpoint with a 2-label classification head, placed on the selected device
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
## read the train and holdout CSVs, discarding every row that has a missing value ##
train_df = pd.read_csv(r"/content/train_df_clean.csv").dropna()
test_df = pd.read_csv(r"/content/test_df_clean.csv").dropna()

# distinct target values present in the training data
labels = train_df['target'].unique().tolist()
print(labels)

# name <-> id mappings (the id -> name direction is derived from the name -> id one)
label2id = {"Yes": 1, "No" : 0}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

[1, 0]


In [26]:
## The training set is small (~800 rows), so no validation split is carved out of it:
## the model is trained on all of it and benchmarked on the holdout data at the end.

# each pair is (cleaned input texts, target values) for one split
train_texts, train_labels = (
    list(train_df[column].values) for column in ('input_text_clean', 'target')
)
test_texts, test_labels = (
    list(test_df[column].values) for column in ('input_text_clean', 'target')
)


In [27]:
# Tokenize each split separately with the same settings: pad within the split and
# truncate anything longer than 512 tokens.
train_tokenized, test_tokenized = (
    tokenizer(split_texts, padding=True, truncation=True, max_length=512)
    for split_texts in (train_texts, test_texts)
)

In [28]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. It must contain the key "input_ids", which defines the
            dataset length.
        labels: Optional indexable collection of per-example labels (list,
            numpy array, pandas Series, ...). When omitted (or empty), items
            carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and label, if available) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Check for presence explicitly: plain truthiness (`if self.labels:`) raises
        # ValueError for numpy arrays / pandas Series with more than one element.
        # The length check keeps the old behaviour of skipping an empty collection.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" encoding."""
        return len(self.encodings["input_ids"])

In [29]:
## wrap the tokenized splits and their labels in torch datasets so the Trainer can consume them

train_dataset = Dataset(encodings=train_tokenized, labels=train_labels)
test_dataset = Dataset(encodings=test_tokenized, labels=test_labels)

In [30]:
# (number of training examples, number of holdout examples)
tuple(len(split) for split in (train_dataset, test_dataset))

(841, 80)

In [31]:
## Inspect one training example (index 5) to check the item format the dataset
## produces: a dict of tensors, one per tokenizer output key, plus "labels".

train_dataset[5]

{'input_ids': tensor([  101,  2951,  7123,  4013,  7661,  2186, 29215,  2099,  2081,  2605,
          2152,  2836,  2813, 29215,  2099,  5988,  5882, 10959,  2369,  6490,
          3973, 13338,  3898,  4359,  5995,  2625,  4642,  2152,  2373,  8304,
          2659,  3622,  7730,  2659, 20870,  3145,  2685,  9542,  5647,  2188,
          5988,  3378,  3688,  6177,  4013,  7661,  2186,  2846,  7218,  4734,
          3027,  5746,  5097,  2081,  2605,  2086, 10943,  2100,  7492,  1010,
          2086, 10943,  2100, 23713,  3669,  9413,  2015,  3033,  2326,  5452,
          3343,  7142,  7620,  3914,  2157, 27950,  5617, 28699,  2072,  4937,
          8496,  2302,  3188,  5060, 14297,  5746, 23564,  4649, 10975,  9648,
          5134,  2482,  2278,  2605,  3042,  9167, 14297, 19513,  3695,  9006,
          6853,  2764,  7609,  2012, 11631,  2213,  2605,  2373,  2302, 12528,
          4691,  2224, 23713,  3669,  9413,  3458,  6537, 27895,  2015, 12528,
          4691,  4742,  2453,  2599,  6

In [32]:
## metric callback handed to the Trainer

def compute_metrics(p):
    """Score model predictions against the reference labels.

    Args:
        p: A pair ``(scores, labels)``; ``scores`` holds one row of per-class
            scores per example and ``labels`` the reference class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    # predicted class = index of the highest score in each row
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        metric_name: scorer(y_true=labels, y_pred=predicted)
        for metric_name, scorer in scorers.items()
    }

In [33]:
# Training configuration: 3 epochs with 8 examples per device per step;
# checkpoints and logs go to the "bert-based-parspec" directory.
args = TrainingArguments(
    output_dir="bert-based-parspec",
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

# Trainer wiring: train on the full training set, evaluate on the holdout set,
# and report the metrics produced by compute_metrics.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=args,
    model=model,
)

In [34]:
## train the model
## (fine-tunes `model` on `train_dataset` with the arguments given to the Trainer above)

trainer.train()

Step,Training Loss


TrainOutput(global_step=318, training_loss=0.1676722472568728, metrics={'train_runtime': 234.7431, 'train_samples_per_second': 10.748, 'train_steps_per_second': 1.355, 'total_flos': 663829192673280.0, 'train_loss': 0.1676722472568728, 'epoch': 3.0})

In [35]:
## evaluate the model
## (runs on the Trainer's eval_dataset, i.e. the holdout set, and reports the
##  compute_metrics values with an "eval_" prefix, as seen in the output below)

trainer.evaluate()

{'eval_loss': 1.3154149055480957,
 'eval_accuracy': 0.75,
 'eval_precision': 0.5,
 'eval_recall': 1.0,
 'eval_f1': 0.6666666666666666,
 'eval_runtime': 2.538,
 'eval_samples_per_second': 31.521,
 'eval_steps_per_second': 3.94,
 'epoch': 3.0}

In [36]:
# numpy is already imported in the first cell; the repeated import is harmless.
import numpy as np
# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)


In [41]:
## A simple sanity check on one entry (index 3) of the holdout set

text = test_texts[3]

# Send the inputs to the device selected earlier (`device`) instead of a
# hard-coded 'cuda', so the cell also runs on CPU-only machines.
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)

# Inference only: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
print("Raw output -> ", outputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print("Class output -> ", predictions.argmax().item())
# actual output
print(test_labels[3])
print("0 -> No lighting || 1 -> Lighting")



Raw output ->  SequenceClassifierOutput(loss=None, logits=tensor([[-2.9409,  3.8667]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Class output ->  1
1
0 -> No lighting || 1 -> Lighting


In [43]:
## Score the whole holdout set (accuracy + confusion matrix) and save the
## ground-truth / predicted labels side by side as a dataframe.

pred_labels = []
print("Predicting info....")

# Inference only: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    for text in tqdm(test_texts):
        # Use the device selected earlier (`device`) rather than a hard-coded
        # 'cuda', so the loop also runs on CPU-only machines.
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        output_response = predictions.argmax().item()
        pred_labels.append(output_response)   # get the class

assert len(test_labels) == len(pred_labels), "Mismatch in labels size"

# gt = ground-truth label, pred = predicted class id
pred_df = {
    'gt' : test_labels,
    'pred' : pred_labels
}

df = pd.DataFrame(pred_df)

print(df)
## save the df
df.to_csv("Holdout score - parspec.csv", index=False)

print("----")
print("Acc score ---> ", accuracy_score(test_labels, pred_labels))
print(confusion_matrix(test_labels, pred_labels))

Predicting info....


100%|██████████| 80/80 [00:05<00:00, 14.76it/s]

    gt  pred
0    0     0
1    0     0
2    1     1
3    1     1
4    1     1
..  ..   ...
75   0     0
76   0     0
77   0     0
78   1     1
79   1     1

[80 rows x 2 columns]
----
Acc score --->  0.75
[[40 20]
 [ 0 20]]





In [44]:
## Save the fine-tuned model locally, load it back from disk and check that the
## reloaded copy predicts a holdout example (index 2).

trainer.save_model('bert-base-uncased-finetuned-clean-data')
model_1 = BertForSequenceClassification.from_pretrained("bert-base-uncased-finetuned-clean-data")
# Use the device selected earlier (`device`) rather than a hard-coded 'cuda',
# so the cell also runs on CPU-only machines.
model_1.to(device)
model_1.eval()  # inference only: make sure dropout is off


text = test_texts[2]
actual_label = test_labels[2]

inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
# No gradients are needed for a prediction; skip building the autograd graph.
with torch.no_grad():
    outputs = model_1(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
output_class = predictions.argmax().item()


print("Pred class from model -> ", output_class)
print("GT class from data -> ", actual_label)


Pred class from model ->  1
GT class from data ->  1
