**LoRA Implementation**

In [35]:
#!pip install torch
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

from tqdm.auto import tqdm
from itertools import chain

In [37]:
df = pd.read_csv('./nsmc/ratings.txt', sep='\t', quoting=1)
df.dropna(inplace=True)
print(df)
print(repr(df[df.id==8963373]["document"].values[0]))
df = pd.concat([df[:10000],df[-10000:]])


              id                                           document  label
0        8112052                                어릴때보고 지금다시봐도 재밌어요ㅋㅋ      1
1        8132799  디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...      1
2        4655635               폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.      1
3        9251303  와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...      1
4       10067386                        안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.      1
...          ...                                                ...    ...
199995   8963373                                     포켓 몬스터 짜가 ㅡㅡ;;      0
199996   3302770                                              쓰.레.기      0
199997   5458175                  완전 사이코영화. 마지막은 더욱더 이 영화의질을 떨어트린다.      0
199998   6908648                왜난 재미없었지 ㅠㅠ 라따뚜이 보고나서 스머프 봐서 그런가 ㅋㅋ      0
199999   8548411                                    포풍저그가나가신다영차영차영차      0

[199992 rows x 3 columns]
'포켓 몬스터 짜가 ㅡㅡ;;'


In [38]:
train, test = train_test_split(df, test_size=0.3, random_state=42)
print(train)

              id                                           document  label
197217  10041439                        훨씬 더 잘만들 수 있는 영화였다 하정우의 원맨쇼      0
195187   7185964  뭐지이건ㅋㅋㅋㅋ 불필요한 컷도 많고 내용도 이해가 안가고 너무 급전개임 보는사람으로...      0
191294   9860198                  이번만큼은 박평식의 평점이 맞는듯... 벌목한 나무가 아까움      0
199772  10086153                          공산당이 싫어요 ㅠ 정말싫어요 일본만큼 비호감      0
193071   6064138                 내가 정말 웬만하면 재미없다고 안하는데... 이건 아니잖아~~      0
...          ...                                                ...    ...
191283   3789288                   게임같은 영상...재미도 별루...만화가 딱딱한 느낌...      0
191963  10176253            왜이렇게 촌스러운 망작인가 했더니 헉 오우삼감독이라니ㅠㅠ모든게이해되네요      0
5390     5506206                                          진짜 재미있어요.      1
860      8139145      설렘주의보&좀떨떠름? 음. . ㅋㅋㅋ나중에 여자 어떻게 됬는지 알면좋을텐데 ㅋㅋㅋ      1
195794  10011705                                      뭐 별반 다를게 없네요.      0

[14000 rows x 3 columns]


In [39]:
print(test)

             id                                           document  label
190649  5714103          이건 15세관람가가 아니야 막판에 장면을보면 청소년관람불가로 해야하지...      0
2041    4522207                            정말좋은영다 5월의가정 한가족이볼수있는영화      1
8668    8687561  극한 전쟁 상황 속이든 자신이 쌓은 커리어가 완전히 날아가는 상황에서 인간이 보여 ...      1
1114    5194435                                       제중원 너무 좋다~!!      1
193901  7645851                              감독 걍 광고나 만들면서 손가락빨고있어      0
...         ...                                                ...    ...
4464    7997993          아주 자세한 내용은 잘 이해가 안되지만 그냥 시간가는줄 모르고 본 영화다.      1
195655  1152004                                   나에겐 아직 너무 어려운듯..      0
199146  1590979                                            쩝~~! 별루      0
190566  4534131       시간이 넘치게 남아 주체할수 없을때 보세요. 음.. 아닙니다. 그냥 보지마세요.      0
9165    9060009                                     허니잼ㅋㅎㅋㅎ겁나매력터진당      1

[6000 rows x 3 columns]


In [40]:
class CustomDataset(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(value[idx]) for key, value in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)


In [41]:
batch_size = 8
num_epochs = 1
learning_rate = 2e-5

In [42]:
model_name = "google-bert/bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)

train_texts = train['document'].tolist()
test_texts = test['document'].tolist()

train_labels = train['label'].tolist()
test_labels = test['label'].tolist()

train_encodings = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings = bert_tokenizer(test_texts, truncation=True, padding=True)

In [43]:
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [44]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
bert_model.to(device)

optimizer = torch.optim.Adam(bert_model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
class Classifier(nn.Module):
  def __init__(self, hidden_dim, num_labels):
    super().__init__()
    self.a = nn.Linear(hidden_dim, hidden_dim//2)
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(0.1)
    self.b = nn.Linear(hidden_dim//2, num_labels)

  def forward(self, x):
    x = self.a(x)
    x = self.gelu(x)
    x = self.dropout(x)
    x = self.b(x)

    return x

In [46]:
bert_model = AutoModel.from_pretrained(model_name)
classifier = Classifier(768, 2)

In [48]:
"""
input_text = ""
input_encoding = bert_tokenizer.encode_plus(
    input_text,
    truncation=True,
    padding=True,
    return_tensors='pt'
)

input_ids = input_encoding['input_ids'].to(device)
attention_mask = input_encoding['attention_mask'].to(device)

bert_model.eval()
classifier.eval()

with torch.no_grad():
  outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
  outputs = classifier(outputs.pooler_output)
  _, predicted_labels = torch.max(outputs, dim=1)
predicted_labels = predicted_labels.item()
print(predicted_labels)
"""

'\ninput_text = ""\ninput_encoding = bert_tokenizer.encode_plus(\n    input_text,\n    truncation=True,\n    padding=True,\n    return_tensors=\'pt\'\n)\n\ninput_ids = input_encoding[\'input_ids\'].to(device)\nattention_mask = input_encoding[\'attention_mask\'].to(device)\n\nbert_model.eval()\nclassifier.eval()\n\nwith torch.no_grad():\n  outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)\n  outputs = classifier(outputs.pooler_output)\n  _, predicted_labels = torch.max(outputs, dim=1)\npredicted_labels = predicted_labels.item()\nprint(predicted_labels)\n'

In [50]:
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

for name, param in bert_model.bert.named_parameters():
  param.requires_grad = False

optimizer = torch.optim.Adam(bert_model.parameters(), lr=learning_rate)


bert_model.to(device)

bert_model.train()

for epoch in range(num_epochs):
  total_loss = 0

  for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    total_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  average_loss = total_loss / len(train_dataloader)
  print(f"Epoch : {epoch}, Avg Loss : {average_loss}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1750 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Epoch : 0, Avg Loss : 0.6971974873883383


In [51]:
bert_model.eval()

correct_predictions = 0
total_predictions = 0

with torch.no_grad():
  for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    _, predicted_labels = torch.max(outputs.logits, dim=1)

    correct_predictions += torch.sum(predicted_labels == labels).item()
    total_predictions += labels.size(0)

accuracy = correct_predictions / total_predictions
print(f" Accuracy : {accuracy:.4f}")


  0%|          | 0/750 [00:00<?, ?it/s]

 Accuracy : 0.5310


In [52]:
class ScaledDotProductAttention(nn.Module):
  def __init__(self, hidden_dim):
    super().__init__()
    self.Q = nn.Linear(hidden_dim, hidden_dim)
    self.K = nn.Linear(hidden_dim, hidden_dim)
    self.V = nn.Linear(hidden_dim, hidden_dim)

  def forward(self, x): # x : B x L x D
    Q = self.Q(x)
    K = self.K(x)
    V = self.V(x)

    qk = Q @ K.transpose(1, 2) # B x L x L
    qkv = qk @ V # B x L x D

    return qkv


In [53]:
class BA(nn.Module):
  def __init__(self, W, hidden_dim, lora_dim):
    super().__init__()
    self.W = W
    self.A = nn.Linear(hidden_dim, lora_dim, bias=False)
    self.dropout = nn.Dropout(0.1)
    self.B = nn.Linear(lora_dim, hidden_dim, bias=False)

    nn.init.kaiming_uniform_(self.A.weight)
    nn.init.zeros_(self.B.weight)

    self.alpha = 32
    self.r = lora_dim


  def forward(self, x):
    return self.W(x) + self.alpha / self.r * self.B(self.dropout(self.A(x)))


In [54]:
class LoRA(nn.Module):
  def __init__(self):
    super().__init__()
    self.ba = BA(768, 8)
    self.attention = ScaledDotProductAttention(768)

  def set_attention(self, attention):
    self.attention = attention

  def forward(self, *x):
    return self.attention(x) + self.ba(x[0])


In [55]:
model_name = "google-bert/bert-base-uncased"

bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

classifier = Classifier(768, 2)

for name, module in bert_model.named_modules():
  print(name)


embeddings
embeddings.word_embeddings
embeddings.position_embeddings
embeddings.token_type_embeddings
embeddings.LayerNorm
embeddings.dropout
encoder
encoder.layer
encoder.layer.0
encoder.layer.0.attention
encoder.layer.0.attention.self
encoder.layer.0.attention.self.query
encoder.layer.0.attention.self.key
encoder.layer.0.attention.self.value
encoder.layer.0.attention.self.dropout
encoder.layer.0.attention.output
encoder.layer.0.attention.output.dense
encoder.layer.0.attention.output.LayerNorm
encoder.layer.0.attention.output.dropout
encoder.layer.0.intermediate
encoder.layer.0.intermediate.dense
encoder.layer.0.intermediate.intermediate_act_fn
encoder.layer.0.output
encoder.layer.0.output.dense
encoder.layer.0.output.LayerNorm
encoder.layer.0.output.dropout
encoder.layer.1
encoder.layer.1.attention
encoder.layer.1.attention.self
encoder.layer.1.attention.self.query
encoder.layer.1.attention.self.key
encoder.layer.1.attention.self.value
encoder.layer.1.attention.self.dropout
encoder.

In [56]:
for param in bert_model.parameters():
  param.requires_grad = False

for layer in bert_model.encoder.layer:
  #lora = LoRA()
  #lora.set_attention(layer.attention.self)
  #layer.attention.self = lora
  layer.attention.self.query = BA(layer.attention.self.query, 768, 8)
  layer.attention.self.key = BA(layer.attention.self.key, 768, 8)
  layer.attention.self.value = BA(layer.attention.self.value, 768, 8)
  #print(layer)

  #layer.attention.output.dense = BA(layer.attention.output.dense, 768, 8)
  #layer.intermediate.dense = BA(layer.intermediate.dense, 768, 8)
  #layer.output.dense = BA(layer.output.dense, 768, 8)

In [57]:
def print_trainable_params(model):
  total_params = sum(p.numel() for p in model.parameters())
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
  print(f"total : {total_params}, trainable : {trainable_params}, ratio : {trainable_params / total_params * 100}")

In [58]:
print_trainable_params(bert_model)
print_trainable_params(classifier)

total : 109924608, trainable : 442368, ratio : 0.40242854448023135
total : 296066, trainable : 296066, ratio : 100.0


My LoRA without Dropout and initialization

In [59]:
class BAonly(nn.Module):
  def __init__(self, weight, lora_alpha, lora_dim):
    super().__init__()

    # weight in pytorch : [out_features, in_features]
    in_dim = weight.shape[1]
    out_dim = weight.shape[0]

    self.W = nn.Linear(in_dim, out_dim)
    self.W.weight = weight
    self.A = nn.Linear(in_dim, lora_dim)
    self.B = nn.Linear(lora_dim, out_dim)

    self.alpha = lora_alpha
    self.r = lora_dim
    self.scaling = lora_alpha / lora_dim

  def forward(self, x):
    return self.W(x) + self.scaling * self.B(self.A(x))

My LoRA w/o drop and init, with temp logic

In [60]:
bert_model = AutoModel.from_pretrained(model_name)

for param in bert_model.parameters():
  param.requires_grad = False

parent_module = None
target_modules = ["query", "key", "value"]
'''
for name, module in bert_model.named_modules():

  if isinstance(module, nn.Linear):
    if not any(target_module in name for target_module in target_modules):
      continue

    print(name)
    print(module)
    print("p",parent_module)
    if parent_module:
      for target_module in target_modules:
        module = getattr(parent_module, target_module)
        setattr(parent_module, name, BAonly(module.weight, 32, 8))
  else:
    parent_module = module

  #f "layer" not in name:continue
  #layer.attention.query = BAonly(layer.attention.query.weight, 32, 8)
  #layer.attention.key = BAonly(layer.attention.key.weight, 32, 8)
  #layer.attention.value = BAonly(layer.attention.value.weight, 32, 8)
'''
target_pairs = []
for name, module in bert_model.named_modules():

  if isinstance(module, nn.Linear):
    if not any(target_module in name for target_module in target_modules):
      continue

    #print(name)
    #print(module)
    #print("p",parent_module)
    if parent_module:
      for target_module in target_modules:
        module = getattr(parent_module, target_module)
        target_pairs += [(parent_module, name, module)]
  else:
    parent_module = module

for pairs in set(target_pairs):
  parent_module, name, module = pairs
  print("p", parent_module)
  print("name", name)
  print("m", module)

  setattr(parent_module, name, BAonly(module.weight, 32, 8))

print_trainable_params(bert_model)


classifier = Classifier(768, 2)

optimizer = torch.optim.Adam(chain(bert_model.parameters(), classifier.parameters()), lr=learning_rate)

p BertSdpaSelfAttention(
  (query): Linear(in_features=768, out_features=768, bias=True)
  (key): Linear(in_features=768, out_features=768, bias=True)
  (value): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
name encoder.layer.5.attention.self.query
m Linear(in_features=768, out_features=768, bias=True)
p BertSdpaSelfAttention(
  (query): Linear(in_features=768, out_features=768, bias=True)
  (key): Linear(in_features=768, out_features=768, bias=True)
  (value): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
name encoder.layer.8.attention.self.value
m Linear(in_features=768, out_features=768, bias=True)
p BertSdpaSelfAttention(
  (query): Linear(in_features=768, out_features=768, bias=True)
  (key): Linear(in_features=768, out_features=768, bias=True)
  (value): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder.layer.5.attention

In [61]:
bert_model.to(device)
classifier.to(device)

bert_model.train()
classifier.train()

for epoch in range(num_epochs):
  total_loss = 0

  for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    outputs = classifier(outputs.pooler_output)

    loss = criterion(outputs, labels)
    total_loss += loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  average_loss = total_loss / len(train_dataloader)

  print(f"average_loss : {average_loss}")

  0%|          | 0/1750 [00:00<?, ?it/s]

average_loss : 0.6916404962539673


In [62]:
bert_model.eval()
classifier.eval()

for epoch in range(num_epochs):
  total_predictions = 0
  correct_predictions = 0

  for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
      outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
      outputs = classifier(outputs.pooler_output)
      _, predicted_labels = torch.max(outputs, dim=1)

    correct_predictions += torch.sum(predicted_labels == labels).item()
    total_predictions += labels.size(0)

  print(f"accuracy : {correct_predictions / total_predictions}")

  0%|          | 0/750 [00:00<?, ?it/s]

accuracy : 0.5371666666666667


My LoRA w/o dropout and initialization, with direct replacements

In [63]:
bert_model = AutoModel.from_pretrained(model_name)

for param in bert_model.parameters():
  param.requires_grad = False

for layer in bert_model.encoder.layer:
  layer.attention.self.query = BAonly(layer.attention.self.query.weight, 32, 8)
  layer.attention.self.key = BAonly(layer.attention.self.key.weight, 32, 8)
  layer.attention.self.value = BAonly(layer.attention.self.value.weight, 32, 8)

print_trainable_params(bert_model)

classifier = Classifier(768, 2)

print_trainable_params(classifier)

optimizer = torch.optim.Adam(chain(*[bert_model.parameters(), classifier.parameters()]), lr=learning_rate)

total : 109952544, trainable : 497952, ratio : 0.45287901660556396
total : 296066, trainable : 296066, ratio : 100.0


In [64]:
bert_model.to(device)
classifier.to(device)

bert_model.train()
classifier.train()

for epoch in range(num_epochs):
  total_loss = 0

  for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    outputs = classifier(outputs.pooler_output)

    loss = criterion(outputs, labels)
    total_loss += loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  average_loss = total_loss / len(train_dataloader)

  print(f"epoch : {epoch}, average_loss : {average_loss}")

  0%|          | 0/1750 [00:00<?, ?it/s]

epoch : 0, average_loss : 0.6936452984809875


In [65]:
bert_model.eval()
classifier.eval()

for epoch in range(num_epochs):
  total_predictions = 0
  correct_predictions = 0

  for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
      outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
      outputs = classifier(outputs.pooler_output)

    _, predicted_labels = torch.max(outputs, dim=1)
    correct_predictions += torch.sum(predicted_labels == labels).item()
    total_predictions += labels.size(0)

  print(f"accuracy : {correct_predictions / total_predictions}")

  0%|          | 0/750 [00:00<?, ?it/s]

accuracy : 0.5661666666666667


In [66]:
print(len([*chain(bert_model.parameters(), classifier.parameters())]))
print(len([*chain(*[bert_model.parameters(), classifier.parameters()])]))

347
347


My LoRA w/o dropout and initialization, with direct replacements

 difference : *chain([p1, p2]) -> chain(p1, p2)

In [72]:
bert_model = AutoModel.from_pretrained(model_name)

for param in bert_model.parameters():
  param.requires_grad = False

for layer in bert_model.encoder.layer:
  layer.attention.self.query = BAonly(layer.attention.self.query.weight, 32, 8)
  layer.attention.self.key = BAonly(layer.attention.self.key.weight, 32, 8)
  layer.attention.self.value = BAonly(layer.attention.self.value.weight, 32, 8)

print_trainable_params(bert_model)

classifier = Classifier(768, 2)

print_trainable_params(classifier)

optimizer = torch.optim.Adam(chain(bert_model.parameters(), classifier.parameters()), lr=learning_rate)

total : 109952544, trainable : 497952, ratio : 0.45287901660556396
total : 296066, trainable : 296066, ratio : 100.0


In [73]:
bert_model.to(device)
classifier.to(device)

bert_model.train()
classifier.train()

for epoch in range(num_epochs):
  total_loss = 0

  for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    outputs = classifier(outputs.pooler_output)

    loss = criterion(outputs, labels)
    total_loss += loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  average_loss = total_loss / len(train_dataloader)

  print(f"epoch : {epoch}, average_loss : {average_loss}")

  0%|          | 0/1750 [00:00<?, ?it/s]

epoch : 0, average_loss : 0.690314531326294


In [74]:
bert_model.eval()
classifier.eval()

for epoch in range(num_epochs):
  total_predictions = 0
  correct_predictions = 0

  for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
      outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
      outputs = classifier(outputs.pooler_output)

    _, predicted_labels = torch.max(outputs, dim=1)
    correct_predictions += torch.sum(predicted_labels == labels).item()
    total_predictions += labels.size(0)

  print(f"accuracy : {correct_predictions / total_predictions}")

  0%|          | 0/750 [00:00<?, ?it/s]

accuracy : 0.5531666666666667


My LoRA with Dropout

In [67]:


from itertools import chain

optimizer = torch.optim.Adam(params=chain(*[bert_model.parameters(), classifier.parameters()]), lr=learning_rate)

bert_model.to(device)
classifier.to(device)

bert_model.train()
classifier.train()


for epoch in range(num_epochs):
  total_loss = 0

  for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    outputs = classifier(outputs.pooler_output)

    loss = criterion(outputs, labels)
    total_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  average_loss = total_loss / len(train_dataloader)

  print(f"epoch : {epoch}, average_loss : {average_loss}")


  0%|          | 0/1750 [00:00<?, ?it/s]

epoch : 0, average_loss : 0.6717811492851802


In [68]:
bert_model.eval()
classifier.eval()

for epoch in range(num_epochs):
  total_predictions = 0
  correct_predictions = 0

  for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
      outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
      outputs = classifier(outputs.pooler_output)
      _, predicted_labels = torch.max(outputs, dim=1)

    correct_predictions += torch.sum(predicted_labels == labels).item()
    total_predictions += labels.size(0)

  print(f"accuracy : {correct_predictions / total_predictions}")

  0%|          | 0/750 [00:00<?, ?it/s]

accuracy : 0.603


HuggingFace LoRA

In [69]:
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name)
bert_model = get_peft_model(bert_model, peft_config)
bert_model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


In [70]:
bert_model.to(device)
bert_model.train()

optimizer = torch.optim.Adam(params=bert_model.parameters(), lr=learning_rate)


for epoch in range(num_epochs):
  total_loss = 0

  for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    total_loss += loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  average_loss = total_loss / len(train_dataloader)

  print(f"epoch : {epoch}, average_loss : {average_loss}")

  0%|          | 0/1750 [00:00<?, ?it/s]

epoch : 0, average_loss : 0.6914411783218384


In [71]:
bert_model.eval()

for epoch in range(num_epochs):
  total_predictions = 0
  correct_predictions = 0

  for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
      outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
      _, predicted_labels = torch.max(outputs.logits, dim=1)

    correct_predictions += torch.sum(predicted_labels == labels).item()
    total_predictions += labels.size(0)

  print(f"accuracy : {correct_predictions / total_predictions}")

  0%|          | 0/750 [00:00<?, ?it/s]

accuracy : 0.5596666666666666
