In [3]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
logging.basicConfig(level=logging.ERROR)

In [62]:
!pip install -q transformers[torch]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [6]:
data = pd.read_excel(r'/kaggle/input/sct-for-original-model-trail/raghuvamsha_sarga_03.xlsx')

In [8]:
new_df = pd.DataFrame()
new_df['text'] = data['Sanskrit Text']
new_df['labels'] = data.iloc[:, 1:].values.tolist()

In [9]:
new_df.head()

Unnamed: 0,text,labels
0,\n\nअथेप्सितं भर्तुरुपस्थितोदयं सखीजनोद्वीक्षण...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,\n\nशरीरसादादसमग्रभूषणा मुखेन सालक्ष्यत लोध्रप...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,\n\nतदाननं मृत्सुरभि क्षितीश्वरो रहस्युपाघ्राय...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,\n\nदिवं मरुत्वानिव भोक्ष्यते भुवं दिगन्तविश्र...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,\n\nन मे ह्रिया शंसति किंचिदीप्सितं स्पृहावती ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
len(new_df.labels[0])

45

In [36]:
import os
from transformers import AutoTokenizer,AutoModelForMaskedLM
from transformers import pipeline
import re

model = AutoModelForMaskedLM.from_pretrained('sampathlonka/San-BERT')

In [15]:
new_model_max_length = 512
# Instantiate the tokenizer
MAX_LEN = 128
tokenizer = AutoTokenizer.from_pretrained('sampathlonka/San-BERT',model_max_length = new_model_max_length,truncation=True)

In [41]:
#model
#tokenizer
#model.bert().last_hidden_state
model.d

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [47]:
# Defining some key variables that will be used later on in the training
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 20
LEARNING_RATE = 1e-05

In [18]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [19]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (70, 2)
TRAIN Dataset: (56, 2)
TEST Dataset: (14, 2)


In [70]:
train_data.to_csv("train.csv")
test_data.to_csv("test.csv")

In [77]:
!pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [82]:
!pip install pandas==2.0.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pandas==2.0.0
  Downloading pandas-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
beatrix-jupyterlab 2023.814.150030 requires jupyter-server~=1.16, but you have jupyter-server 2.12.3 which is incompatible.
beatrix-jupyterlab 2023.814.150030 requi

In [83]:
from datasets import load_dataset
train=load_dataset("csv",data_files="/kaggle/working/train.csv")
print(train)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e69d1f1a8cd5c965/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

TypeError: read_csv() got an unexpected keyword argument 'mangle_dupe_cols'

In [20]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [42]:
# Creating the customized model, by adding a drop out and a dense layer on top of bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = model
        self.pre_classifier = torch.nn.Linear(30522, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 45)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
          

In [48]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [49]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

<a id='section05'></a>
### Fine Tuning the Model

After all the effort of loading and preparing the data and datasets, creating the model and defining its loss and optimizer. This is probably the easier steps in the process. 

Here we define a training function that trains the model on the training dataset created above, specified number of times (EPOCH), An epoch defines how many times the complete data will be passed through the network. 

Following events happen in this function to fine tune the neural network:
- The dataloader passes data to the model based on the batch size. 
- Subsequent output from the model and the actual category are compared to calculate the loss. 
- Loss value is used to optimize the weights of the neurons in the network.
- After every 5000 steps the loss value is printed in the console.

As you can see just in 1 epoch by the final step the model was working with a miniscule loss of 0.05 i.e. the network output is extremely close to the actual output.

In [50]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()

In [51]:
for epoch in range(EPOCHS):
    train(epoch)

2it [00:00, 15.26it/s]

Epoch: 0, Loss:  0.2108510583639145


14it [00:01, 11.84it/s]
2it [00:00, 11.98it/s]

Epoch: 1, Loss:  0.17227540910243988


14it [00:01, 11.56it/s]
2it [00:00, 11.82it/s]

Epoch: 2, Loss:  0.19510629773139954


14it [00:01, 11.58it/s]
2it [00:00, 11.80it/s]

Epoch: 3, Loss:  0.1846008598804474


14it [00:01, 11.54it/s]
2it [00:00, 11.81it/s]

Epoch: 4, Loss:  0.14685969054698944


14it [00:01, 11.58it/s]
2it [00:00, 11.84it/s]

Epoch: 5, Loss:  0.13325797021389008


14it [00:01, 11.57it/s]
2it [00:00, 11.96it/s]

Epoch: 6, Loss:  0.15642908215522766


14it [00:01, 11.56it/s]
2it [00:00, 11.78it/s]

Epoch: 7, Loss:  0.1366712599992752


14it [00:01, 11.57it/s]
2it [00:00, 11.94it/s]

Epoch: 8, Loss:  0.13923116028308868


14it [00:01, 11.56it/s]
2it [00:00, 11.91it/s]

Epoch: 9, Loss:  0.12561668455600739


14it [00:01, 11.56it/s]
2it [00:00, 12.05it/s]

Epoch: 10, Loss:  0.1354372799396515


14it [00:01, 11.55it/s]
2it [00:00, 11.88it/s]

Epoch: 11, Loss:  0.13538818061351776


14it [00:01, 11.57it/s]
2it [00:00, 11.92it/s]

Epoch: 12, Loss:  0.23530392348766327


14it [00:01, 11.55it/s]
2it [00:00, 11.90it/s]

Epoch: 13, Loss:  0.10675220936536789


14it [00:01, 11.57it/s]
2it [00:00, 11.87it/s]

Epoch: 14, Loss:  0.09965745359659195


14it [00:01, 11.58it/s]
2it [00:00, 11.77it/s]

Epoch: 15, Loss:  0.13738174736499786


14it [00:01, 11.55it/s]
2it [00:00, 11.85it/s]

Epoch: 16, Loss:  0.14541319012641907


14it [00:01, 11.55it/s]
2it [00:00, 11.90it/s]

Epoch: 17, Loss:  0.25457170605659485


14it [00:01, 11.54it/s]
2it [00:00, 11.88it/s]

Epoch: 18, Loss:  0.13538774847984314


14it [00:01, 11.54it/s]
2it [00:00, 11.88it/s]

Epoch: 19, Loss:  0.13434121012687683


14it [00:01, 11.51it/s]


In [63]:
from transformers import TrainingArguments,Trainer

In [68]:
training_args = TrainingArguments(
    output_dir="san-bert-test-1",
    learning_rate=1e-3,
    num_train_epochs=30,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    #load_best_model_at_end=True,
    fp16=True,
    report_to='none'
    #push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_loader,
    eval_dataset=testing_loader,
    tokenizer=tokenizer
)

In [69]:
trainer.train()

TypeError: 'DataLoader' object is not subscriptable

<a id='section06'></a>
### Validating the Model

During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data. 

This unseen data is the 20% of `train.csv` which was seperated during the Dataset creation stage. 
During the validation stage the weights of the model are not updated. Only the final output is compared to the actual value. This comparison is then used to calcuate the accuracy of the model. 

As defined above to get a measure of our models performance we are using the following metrics. 
- Hamming Score
- Hamming Loss


In [52]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [53]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

4it [00:00, 42.25it/s]


In [60]:
final_outputs

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
    

In [54]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [55]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.0
Hamming Loss = 0.03968253968253968


<a id='section07'></a>
### Saving the Trained Model for inference

This is the final step in the process of fine tuning the model. 

The model and its vocabulary are saved locally. These files are then used in the future to make inference on new inputs of news headlines.

In [56]:
!mkdir models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [57]:
# Saving the files for inference

output_model_file = './models/test.bin'
output_vocab_file = './models/vocab_test.bin'

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')

Exception: No such file or directory (os error 2)