In [1]:
!pip install transformers



In [3]:
!nvidia-smi

Sun Feb 18 02:44:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import shutil
import sys
from transformers import BertTokenizer,BertModel
from sklearn.model_selection import train_test_split

In [2]:
dataset_path = "/content/text_dataonly.csv"

In [3]:
df = pd.read_csv(dataset_path)

In [4]:
df['label'].nunique()

12

In [5]:
encoded_labels = pd.get_dummies(df['label'], prefix='label')
df = pd.concat([df['text'], encoded_labels], axis=1)
df.columns = ['text'] + list(encoded_labels.columns)

In [6]:
df.shape

(2171, 13)

In [7]:
target = encoded_labels.columns

In [8]:
#hyperparams
max_len = 256
train_batch_size=32
val_batch_size=32
epochs=8
lr = 1e-05

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
class diseaseDataset(torch.utils.data.Dataset):
  #constructor
  def __init__(self, df, tokenizer, max_len):
    self.df=df
    self.tokenizer=tokenizer
    self.max_len=max_len
    self.title = self.df["text"]
    self.targets = self.df[target].values

  def __len__(self):
    return len(self.title)

  #so no garabge
  def __getitem__(self,index):
    title = str(self.title[index])
    title = " ".join(title.split())

    inputs = self.tokenizer.encode_plus(
        title,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        padding="max_length",
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return {
      'input_ids': inputs['input_ids'].flatten(),
      'attention_mask': inputs['attention_mask'].flatten(),
      'token_type_ids':inputs['token_type_ids'].flatten(),
      'targets':torch.FloatTensor(self.targets[index])
    }

In [12]:
train_df, test_df= train_test_split(df,test_size=0.1,random_state=2)
train_df=train_df.reset_index(drop=True)
test_df=test_df.reset_index(drop=True)

In [128]:
len(df)

2171

In [129]:
len(test_df)

218

In [130]:
len(train_df)

1953

In [13]:
train_df, val_df= train_test_split(train_df,test_size=0.2,random_state=2)

In [14]:
train_df=train_df.reset_index(drop=True)
val_df=val_df.reset_index(drop=True)

In [116]:
# train_size = 0.8
# train_df = train_df.sample(frac=train_size, random_state=2).reset_index(drop=True)
# # train_df = train_df.sample(n=1550, random_state=2).reset_index(drop=True)
# val_df = train_df.drop(train_df.index).reset_index(drop=True)

In [106]:
len(val_df)

391

In [15]:
train_dataset = diseaseDataset(train_df, tokenizer, max_len)
val_dataset = diseaseDataset(val_df, tokenizer, max_len)

In [122]:
len(val_dataset)

391

In [16]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=train_batch_size,
    num_workers=0
)
val_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=val_batch_size,
    num_workers=0
)

In [110]:
len(val_data_loader)

13

In [17]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [18]:
def load_checkpoint(checkpoint_path, model, optimizer):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_checkpoint(state, is_best, checkpoint_path, best_model_path):
    path = checkpoint_path
    torch.save(state, path)
    if is_best:
        best_path = best_model_path
        shutil.copyfile(path, best_path)

In [19]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True) #1st layer
        self.dropout = torch.nn.Dropout(0.1) #dropout layer
        self.linear = torch.nn.Linear(768, 12) #bert_op = 768 layers

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [20]:
def loss_func(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params= model.parameters(), lr=lr)

In [21]:
def train_model(n_epochs, train_loader, val_loader, model, opt, ckp_path, best_path):
  val_loss_min= np.Inf
  for epoch in range(1,epochs+1):
    train_loss=0
    val_loss=0
    model.train()
    for idx, batch in enumerate(train_loader):
      ip_ids=batch["input_ids"].to(device, dtype=torch.long)
      attention_mask=batch["attention_mask"].to(device, dtype=torch.long)
      token_type_ids=batch["token_type_ids"].to(device, dtype=torch.long)
      targets = batch["targets"].to(device, dtype=torch.float)
      op = model(ip_ids, attention_mask, token_type_ids)
      opt.zero_grad()
      loss = loss_func(op, targets)
      opt.zero_grad()
      loss.backward()
      opt.step()
      train_loss=train_loss+((1/(idx+1))*(loss.item()-train_loss))
    model.eval()
    with torch.no_grad():
      for batch_idx, data in enumerate(val_loader,0):
        ip_ids=batch["input_ids"].to(device, dtype=torch.long)
        attention_mask=batch["attention_mask"].to(device, dtype=torch.long)
        token_type_ids=batch["token_type_ids"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)
        op = model(ip_ids, attention_mask, token_type_ids)
        loss = loss_func(op, targets)
        val_loss=val_loss+((1/(idx+1))*(loss.item()-val_loss))

      train_loss = train_loss/len(train_loader)
      val_loss = val_loss/len(val_loader)
      # print training/validation statistics
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, train_loss,val_loss))
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': val_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }
      save_checkpoint(checkpoint, False, ckp_path, best_path)
  return model

In [22]:
trained_model = train_model(epochs, train_data_loader, val_data_loader,model,optimizer,"/current_ckpt","/best_ckpt")

Epoch: 1 	Avgerage Training Loss: 0.010363 	Average Validation Loss: 0.006883
Epoch: 2 	Avgerage Training Loss: 0.006880 	Average Validation Loss: 0.005307
Epoch: 3 	Avgerage Training Loss: 0.005841 	Average Validation Loss: 0.004683
Epoch: 4 	Avgerage Training Loss: 0.005198 	Average Validation Loss: 0.004123
Epoch: 5 	Avgerage Training Loss: 0.004554 	Average Validation Loss: 0.003413
Epoch: 6 	Avgerage Training Loss: 0.003914 	Average Validation Loss: 0.002947
Epoch: 7 	Avgerage Training Loss: 0.003350 	Average Validation Loss: 0.002551
Epoch: 8 	Avgerage Training Loss: 0.002886 	Average Validation Loss: 0.002186


In [24]:
example = test_df['text'][0]
encodings = tokenizer.encode_plus(
    example,
    None,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
model.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    print(train_df.columns[1:].to_list()[int(np.argmax(final_output, axis=1))])

label_Arthritis


  print(train_df.columns[1:].to_list()[int(np.argmax(final_output, axis=1))])


In [25]:
test_df["text"][0]

"My neck has been so tense, and I've been feeling like my muscles are incredibly weak. I have trouble moving since my joints have enlarged. To walk has been quite painful."