In [1]:
# Moun the google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# change directory
import os
os.chdir('/content/drive/My Drive/NLP_Coding_Practice')

In [3]:
!pwd

/content/drive/My Drive/NLP_Coding_Practice


In [4]:
import torch
torch.cuda.is_available()

True

In [24]:
# !pip install datasets

In [None]:
# !pip install peft

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BertModel
from datasets import load_dataset, load_metric
import torch
import torch.nn as nn
import numpy as np

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load & Process Dataset

In [16]:
class MyDataset(torch.utils.data.Dataset):
  def __init__(self, text_list, label_list, tokenizer):
    self.text_list = text_list
    self.label_list = label_list
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.label_list)

  def __getitem__(self, idx):
    tok = self.tokenizer(self.text_list[idx], padding= "max_length", max_length = 128, truncation=True)
    item = {key: torch.tensor(val).to(device) for key, val in tok.items()}
    item['labels'] = torch.tensor(self.label_list[idx], dtype=torch.long).to(device)
    return item

In [17]:
class CustomBERT(nn.Module):
  def __init__(self, num_of_classes =2):
    super(CustomBERT, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.classifier = nn.Linear(768, num_of_classes)
    self.dropout = nn.Dropout(0.1)

    for param in self.bert.parameters():
      param.requires_grad = False

  def forward(self,x):
    input_ids = x['input_ids']
    attention_mask = x['attention_mask']
    output = self.bert(input_ids, attention_mask)
    pooled_output = output.pooler_output
    output = self.dropout(pooled_output)
    output = self.classifier(output)
    return output

In [26]:
from peft import LoraConfig, get_peft_model
class CustomBERTWithLoRA(nn.Module):
    def __init__(self, num_of_classes=2):
        super(CustomBERTWithLoRA, self).__init__()
        # Load the BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Linear(768, num_of_classes)
        self.dropout = nn.Dropout(0.1)

        # Apply LoRA to the BERT model
        lora_config = LoraConfig(
            r=8,  # Rank of the low-rank matrix
            lora_alpha=16,  # Scaling factor for the LoRA layers
            target_modules=["query", "key"],  # Apply LoRA to attention layers
            lora_dropout=0.1,
            bias="none"
        )

        self.bert = get_peft_model(self.bert, lora_config)

    def forward(self, x):
        input_ids = x['input_ids']
        attention_mask = x['attention_mask']
        output = self.bert(input_ids, attention_mask)
        pooled_output = output.pooler_output
        output = self.dropout(pooled_output)
        output = self.classifier(output)
        return output


In [18]:
from sklearn.metrics import f1_score, accuracy_score

def evaluate(model,testloader):
  model.eval()
  pred = []
  truelabel = []

  with torch.no_grad():
    for batch in testloader:
      logits = model(batch)
      ypred = torch.argmax(logits, dim=-1)
      pred.extend(ypred.cpu().numpy())
      truelabel.extend(batch['labels'].cpu().numpy())

    model.train()
    f1Score = f1_score(truelabel, pred, average='macro')
    acc = accuracy_score(truelabel, pred)
  return f1Score, acc

In [19]:
# Training preparation
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# load and create dataset
df = pd.read_csv('/content/drive/MyDrive/NLP_Coding_Practice/dataset/smsspamcollection.tsv', sep='\t')
text = df['message'].tolist()
labels = df['label'].tolist()
labels = [1 if label == 'spam' else 0 for label in labels]

Xtrain, Xtest, ytrain, ytest = train_test_split(text, labels, test_size=0.2, random_state=42)

train_dataset = MyDataset(Xtrain, ytrain, tokenizer)
test_dataset = MyDataset(Xtest, ytest, tokenizer)

# create data loaders
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
testloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [14]:
ytrain[0]

1

In [28]:
# Training the model
maxf1 = 0
# model
# model = CustomBERT().to(device)
model = CustomBERTWithLoRA().to(device)

# optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(10):
  model.train()
  for batch in trainloader:
    optimizer.zero_grad()
    logits = model(batch)
    loss = criterion(logits, batch['labels'])
    loss.backward()
    optimizer.step()

  with torch.no_grad():
    model.eval()
    acc, ascore = evaluate(model,testloader)
    if ascore > maxf1:
      maxf1 = ascore
      torch.save(model.state_dict(), 'best_model.pth')

    print(f"Epoch: {epoch+1} --> F1 Score : {ascore} ---> Accuracy: {acc}")

Epoch: 1 --> F1 Score : 0.9802690582959641 ---> Accuracy: 0.9571501439391823
Epoch: 2 --> F1 Score : 0.989237668161435 ---> Accuracy: 0.9764926631809748
Epoch: 3 --> F1 Score : 0.9883408071748879 ---> Accuracy: 0.9748947380271542
Epoch: 4 --> F1 Score : 0.9919282511210762 ---> Accuracy: 0.9825204972644187


KeyboardInterrupt: 

In [1]:
from seqeval.metrics import classification_report

ModuleNotFoundError: No module named 'seqeval'