# **Model Training**

In [None]:
#Installing the necessary libraries
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 41.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 57.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [None]:
# Mount Google Drive at /content/drive so that the dataset and the model
# checkpoints stored under MyDrive can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Importing necessary libraries
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
import pandas as pd

In [None]:
# Load the synonym dataset (a CSV file on the mounted Drive) into a pandas DataFrame.
df = pd.read_csv('/content/drive/MyDrive/synonym/synonym_data.csv')

In [None]:
# Show the column names of the DataFrame.
df.columns # two columns: label and nvalue

Index(['label', 'nvalue'], dtype='object')

In [None]:
# Preview the first 5 rows of the DataFrame.
df.head()

Unnamed: 0,label,nvalue
0,name,name
1,name,given name
2,name,last name and first name
3,name,full name
4,name,names


In [None]:
# Count how many rows carry each distinct value of the label column.
# There are 5 labels: other, name, address, dob and gender.
df['label'].value_counts()


other      71
name       67
address    67
dob        45
gender     18
Name: label, dtype: int64

In [None]:
# Count how often each distinct string occurs in the nvalue column.
df['nvalue'].value_counts()

dob ddmmyyyy              3
date of birth ddmmyyyy    3
birthdate ddmmyyyy        3
date of birth ddmmyy      2
birthdate daymonthyear    2
                         ..
gender femalemale         1
gender m or f             1
gender male or female     1
date of birth             1
former name               1
Name: nvalue, Length: 249, dtype: int64

In [None]:
# Build the label -> integer id mapping, numbering the labels in order of first appearance.
possible_labels = df.label.unique()

label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
label_dict

{'name': 0, 'gender': 1, 'dob': 2, 'address': 3, 'other': 4}

In [None]:
# Rename the column "label" to "target", freeing the name "label" for the integer-encoded class id.
# Note: the rename mutates df in place, so this cell is not idempotent — running it again
# after the encoded "label" column has been added would rename that column to "target" too.
df.rename(columns={'label':'target'},inplace=True)

In [None]:
# Add a new "label" column holding the integer id of each row's target string, using the
# label_dict mapping (for this dataset: {'name': 0, 'gender': 1, 'dob': 2, 'address': 3, 'other': 4}).
# The original strings stay available in the "target" column.
df['label'] = df.target.replace(label_dict)

In [None]:
# Preview the first 5 rows again to check the newly added integer-encoded label column.
df.head()

Unnamed: 0,target,nvalue,label
0,name,name,0
1,name,given name,0
2,name,last name and first name,0
3,name,full name,0
4,name,names,0


In [None]:
# Shape of the DataFrame
df.shape # 268 rows and 3 columns (target, nvalue, label)

(268, 3)

In [None]:
from sklearn.model_selection import train_test_split
# Split the rows into a training and a validation set (15% of the rows go to validation).
# X is the row index (df.index.values) and y is the integer class id (df.label.values);
# stratify is given the same labels, and random_state fixes the shuffling.
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.label.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=df.label.values)

In [None]:
# Tag every row with the split it belongs to: start from a placeholder value,
# then overwrite it for the row indices selected by train_test_split.
df['data_type'] = 'not_set'

for split_name, split_index in (('train', X_train), ('val', X_val)):
    df.loc[split_index, 'data_type'] = split_name

In [None]:
# Row counts per (target, label, data_type): shows how each class is divided between train and val.
df.groupby(['target', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,nvalue
target,label,data_type,Unnamed: 3_level_1
address,3,train,57
address,3,val,10
dob,2,train,38
dob,2,val,7
gender,1,train,15
gender,1,val,3
name,0,train,57
name,0,val,10
other,4,train,60
other,4,val,11


In [None]:
# Load the pretrained tokenizer that belongs to bert-base-uncased (lower-casing enabled).
# It is used below to turn the text values into token ids and attention masks.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Convert the train and validation texts into fixed-length BERT inputs.
# Every sequence is padded (or truncated) to MAX_SEQ_LENGTH tokens.
MAX_SEQ_LENGTH = 256


def _encode_texts(texts):
    """Tokenize a list of strings into PyTorch tensors of equal length.

    Padding and truncation are requested explicitly (`padding='max_length'`,
    `truncation=True`) instead of the deprecated `pad_to_max_length=True`, which
    also left truncation implicit and made the tokenizer emit a warning.

    Args:
        texts: list of strings to encode.

    Returns:
        The tokenizer's batch encoding; 'input_ids' and 'attention_mask' are read from it below.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors='pt'
    )


train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

encoded_data_train = _encode_texts(train_rows.nvalue.tolist())
encoded_data_val = _encode_texts(val_rows.nvalue.tolist())

# Two tensors are passed to BERT's forward function later: input_ids and attention_mask.
# The labels are taken from the integer-encoded label column of the same rows.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Bundle the encoded tensors into datasets: each item is an
# (input_ids, attention_mask, label) triple for one text value.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Number of samples in the training and validation datasets.
len(dataset_train), len(dataset_val)

(227, 41)

In [None]:
# Pretrained BERT with a sequence-classification head.
# Each nvalue is treated as one sequence and classified into one of the five labels
# (address, dob, gender, name, other).
# num_labels sets the number of output classes; it is taken from label_dict so it always
# matches the label encoding. Attention weights and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# DataLoaders combine a dataset with a sampler and yield mini-batches.
# Training batches are drawn in random order (RandomSampler); validation batches are
# read in dataset order (SequentialSampler).
batch_size = 3

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
# Optimizer
# Uses PyTorch's AdamW: the AdamW class shipped with transformers is deprecated in favour
# of torch.optim.AdamW. weight_decay is set to 0.0 explicitly because that was the default
# of the transformers implementation, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)



In [None]:
# Learning-rate scheduler: linear schedule with no warmup steps.
# The total number of training steps is (batches per epoch) * (number of epochs),
# because the scheduler is stepped once per batch in the training loop.
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [None]:
from sklearn.metrics import f1_score
import numpy as np

#Performance metrics : f1 score and accuracy per class
def f1_score_func(preds, labels):
    """Weighted F1 score of the predicted classes against the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the predicted
            class of a row is the index of its largest score.
        labels: array of true class ids.

    Returns:
        The F1 score computed with average='weighted'.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(y_true=true_classes, y_pred=predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, how many of its samples were predicted correctly.

    Class names are looked up through the notebook-level ``label_dict`` (name -> id).

    Args:
        preds: 2-D array of per-class scores, one row per sample; the predicted
            class of a row is the index of its largest score.
        labels: array of true class ids.
    """
    id_to_name = dict((class_id, class_name) for class_name, class_id in label_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Restrict both arrays to the samples whose true class is class_id.
        in_class = actual == class_id
        class_predictions = predicted[in_class]
        n_correct = len(class_predictions[class_predictions == class_id])
        n_total = len(actual[in_class])
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')
In [None]:
import random

# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) random number generators.
# NOTE(review): the model and the training DataLoader are created in earlier cells, before
# these seeds are set, so anything randomly initialised there is not covered by this seed —
# consider moving this cell ahead of the model creation.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# and move the model's parameters to that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Show which device was selected.
print(device)

cpu


In [None]:
# Evaluation helper (called by the training loop after every epoch)
def evaluate(dataloader_val):
    """Run the model over every batch of ``dataloader_val`` without tracking gradients.

    Relies on the notebook-level ``model`` and ``device``.

    Args:
        dataloader_val: iterable yielding batches whose first three tensors are
            input_ids, attention_mask and labels.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the loss averaged over
        the batches, the logits of all samples concatenated into one numpy array,
        and the matching true label ids as a numpy array.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in batch)
        labels = on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=labels)

        # The first output is read as the loss and the second as the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        # Move the results to host memory so they can be concatenated with numpy.
        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

In [None]:
# Fine-tuning loop: for every epoch, train on all batches of dataloader_train,
# save a checkpoint, then report the training loss and the validation metrics.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear the gradients accumulated by the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # The first output is the loss (labels were passed in).
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. It must not be divided by
        # len(batch): `batch` is the tuple (input_ids, attention_mask, labels), so its
        # length is always 3 regardless of the batch size, and the loss is already
        # a per-batch mean.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    # Checkpoint the weights after every epoch.
    torch.save(model.state_dict(), f'/content/drive/MyDrive/synonym/finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    # Mean of the per-batch losses over the epoch.
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/76 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.2677208185195923
Validation loss: 0.989491377558027
F1 Score (Weighted): 0.6952715720434516


Epoch 2:   0%|          | 0/76 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.7251580200697246
Validation loss: 0.7441217878035137
F1 Score (Weighted): 0.796622889305816


Epoch 3:   0%|          | 0/76 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.47854465540302427
Validation loss: 0.6363178651247706
F1 Score (Weighted): 0.7770285721505233


Epoch 4:   0%|          | 0/76 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.35436360616432994
Validation loss: 0.5242625448320594
F1 Score (Weighted): 0.8068564678320775


Epoch 5:   0%|          | 0/76 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.2849992583260724
Validation loss: 0.5303478703967163
F1 Score (Weighted): 0.8068564678320775


In [None]:
# Loading and evaluating the model.
# Re-create the model from the pretrained bert-base-uncased checkpoint with the same
# configuration as before; the fine-tuned weights are loaded into it in a later cell.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Move the freshly created model to the selected device.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Load the weights saved by the training loop after epoch 5 into the model.
# map_location makes torch.load place the stored tensors on the CPU.
model.load_state_dict(torch.load('/content/drive/MyDrive/synonym/finetuned_BERT_epoch_5.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
# Run the reloaded model on the validation set; the average loss is discarded.
_, predictions, true_vals = evaluate(dataloader_validation)

In [None]:
# Print the number of correctly classified validation samples for each class.
accuracy_per_class(predictions, true_vals)

Class: name
Accuracy: 9/10

Class: gender
Accuracy: 2/3

Class: dob
Accuracy: 6/7

Class: address
Accuracy: 6/10

Class: other
Accuracy: 10/11



In [None]:
# Save the fine-tuned model with save_pretrained so it can be reloaded with from_pretrained.
# Note: despite the directory name "bert_tokenizer", only the model is saved here;
# the tokenizer is not written to this directory.
model.save_pretrained('/content/drive/MyDrive/synonym/bert_tokenizer/')

In [None]:
# Directory the fine-tuned model was saved to.
PATH='/content/drive/MyDrive/synonym/bert_tokenizer/'

# **Model Evaluation**

In [None]:
# Directory holding the saved fine-tuned model (same value as in the training section).
PATH='/content/drive/MyDrive/synonym/bert_tokenizer/'

In [None]:
# Load the stock bert-base-uncased tokenizer and the fine-tuned classifier saved under PATH.
tokenizer= BertTokenizer.from_pretrained("bert-base-uncased")
model1 = BertForSequenceClassification.from_pretrained(PATH)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import TextClassificationPipeline
# Wrap the fine-tuned model and the tokenizer in a text-classification pipeline that
# returns a score for every label, with softmax applied to the model outputs.
# The keyword is `function_to_apply` (all lower-case); the capitalised spelling
# `Function_to_apply` is not recognized and is rejected with a warning.
pipe = TextClassificationPipeline(model=model1, tokenizer=tokenizer, return_all_scores=True, function_to_apply="softmax")
pipe("NAME")

Keyword arguments {'Function_to_apply': 'softmax'} not recognized.


[[{'label': 'LABEL_0', 'score': 0.38676726818084717},
  {'label': 'LABEL_1', 'score': 0.046474285423755646},
  {'label': 'LABEL_2', 'score': 0.023004844784736633},
  {'label': 'LABEL_3', 'score': 0.026176761835813522},
  {'label': 'LABEL_4', 'score': 0.5175768136978149}]]

In [None]:
from transformers import TextClassificationPipeline
def model_test(input_string):
    """Classify ``input_string`` with the fine-tuned model and return the scores of all labels.

    A text-classification pipeline is built from the notebook-level ``model1`` and
    ``tokenizer`` on every call.

    Args:
        input_string: the text to classify.

    Returns:
        The pipeline output for ``input_string``, with a score for every label.
    """
    classifier = TextClassificationPipeline(model=model1, tokenizer=tokenizer, return_all_scores=True)
    return classifier(input_string)

In [None]:
# Example: score the string "Name" against every label and print the result.
output_synonym=model_test("Name")
print(output_synonym)

In [None]:
# import numpy as np
# k=np.argmax(model1.predict,axis=1)

In [None]:
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
# outputs = model1(**inputs, labels=labels)

In [None]:
# outputs.keys()

In [None]:
# model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
# import pandas as pd
# import io
# import pandas as pd

# def check_similarity(s):

#     # sentence_pairs = pd.read_csv(io.StringIO(s),header=None)

#     # sentence_pairs = pd.read_csv('test.txt',header=None)


#     # test_data = BertSemanticDataGenerator(
#     #     sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
#     # )
#     test_data=tokenizer.batch_encode_plus(
#     s,
#     add_special_tokens=True,
#     return_attention_mask=True,
#     pad_to_max_length=True,
#     max_length=256,
#     return_tensors='pt'
# )

#     proba = model1.predict(test_data[0])[0]
#     print(proba)
#     idx = np.argmax(proba)
#     proba = f"{proba[idx]: .2f}%"
#     pred = labels[idx]
#     return pred, proba

In [None]:
# t,k=check_similarity("general name")

In [None]:
# model1.eval("general_name")

In [None]:
#load the model globally
#function(input_string="for eg: general name")
#output confidence and class

In [None]:
# import pickle

# filename = '/content/drive/MyDrive/synonym/synonym.pkl'

# #Save the trained model as a pickle string.
# with open(filename, 'wb') as handle:
#     pickle.dump(model, handle)


In [None]:
# Load the pickled model and using "with" statement which closes the file when the statement ends, even if an exception occurs
# with open(filename, 'rb') as handle:
#     model_from_pickle = pickle.load(handle)

In [None]:
# def get_result(input_string):
#   #Converting res to dataframe
#   # df = pd.DataFrame(input_string)
#   #Getting the predictions
#   predictions=model_from_pickle.predict(df)
#   predictions1=model_from_pickle.predict_proba(df)
#   print(predictions1)
#   print(predictions)


In [None]:
# pred_output=get_result("General name")
# print(pred_output)

In [None]:
# def prediction(review_text):

#   encoded_review = tokenizer.encode_plus(
#   review_text,
#   max_length=256,
#   add_special_tokens=True,
#   return_token_type_ids=False,
#   pad_to_max_length=True,
#   return_attention_mask=True,
#   return_tensors='pt',
#   )
#   input_ids = encoded_review['input_ids'].to(device)
#   attention_mask = encoded_review['attention_mask'].to(device)
#   output = model(input_ids, attention_mask)
#   _, prediction = torch.max(output, dim=1)

#   return(review_text,prediction)


In [None]:
# review_text="general name"
# text,pred=prediction(review_text)

In [None]:
# texts ="General name"
# x_val = tokenizer(
#     text=texts,
#     add_special_tokens=True,
#     max_length=256,
#     truncation=True,
#     padding='max_length',
#     return_tensors='tf',
#     return_token_type_ids = False,
#     return_attention_mask = True,
#     verbose = True)
# validation = model.predict({'input_ids':x_val['input_ids'],'attention_mask':x_val['attention_mask']})*100
# for key , value in zip(label_dict.keys(),validation[0]):
#     print(key,value)