In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import wandb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
train = '/home/ines/JYP/Dataset/ratings_train.txt'
test = '/home/ines/JYP/Dataset/ratings_test.txt'

tra = pd.read_csv(train, sep='\t')
tes = pd.read_csv(test, sep='\t')

tr = tra.sample(frac=0.5, random_state=42)
te = tes.sample(frac=0.5, random_state=42)

tr.dropna(inplace=True)
te.dropna(inplace=True)
tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74997 entries, 59770 to 96244
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        74997 non-null  int64 
 1   document  74997 non-null  object
 2   label     74997 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.3+ MB


In [3]:
tr_doc = tr['document']
tr_lb = tr['label']
te_doc = te['document']
te_lb = te['label']

max_len = max(len(sent) for sent in tr_doc)
print(max_len)

train_doc, val_doc, train_lb, val_lb = train_test_split(tr_doc, tr_lb, random_state=42, test_size=0.1)

146


In [4]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [5]:
device

device(type='cuda')

In [6]:
def pre_tok(doc):
    tokenized = []
    mask = []
    for sent in doc:
        tok_sen = tokenizer(sent, max_length=146, padding='max_length', truncation=True)
        tok_id = tok_sen['input_ids']
        att_mask = tok_sen['attention_mask']
        tokenized.append(tok_id)
        mask.append(att_mask)
    return tokenized, mask


tr_tok, tr_mask = pre_tok(train_doc)
val_tok, val_mask = pre_tok(val_doc)
te_tok, te_mask = pre_tok(te_doc)

In [7]:
tr_tok_ten = torch.tensor(tr_tok)
tr_mask_ten = torch.tensor(tr_mask)
tr_lb = torch.tensor(list(train_lb))

val_tok_ten = torch.tensor(val_tok)
val_mask_ten = torch.tensor(val_mask)
val_lb = torch.tensor(list(val_lb))

te_tok_ten = torch.tensor(te_tok)
te_mask_ten = torch.tensor(te_mask)
te_lb = torch.tensor(list(te_lb))

In [8]:
class CustomDataset(Dataset):
    def __init__(self, tok, mask, lb):
        super(CustomDataset, self).__init__()

        self.tok = tok
        self.mask = mask
        self.lb = lb

    def __len__(self):
        return len(self.lb)

    def __getitem__(self, idx):
        return {
            'input_ids': self.tok[idx].clone().detach(),
            'attention_mask': self.mask[idx].clone().detach(),
            'labels': self.lb[idx].clone().detach()
        }
    

#batch size
batch_size = 16

tr_set = CustomDataset(tr_tok_ten, tr_mask_ten, tr_lb)
val_set = CustomDataset(val_tok_ten, val_mask_ten, val_lb)
te_set = CustomDataset(te_tok_ten, te_mask_ten, te_lb)

tr_dl = DataLoader(tr_set, batch_size = batch_size, shuffle=True)
val_dl = DataLoader(val_set, batch_size = batch_size, shuffle=True)
te_dl = DataLoader(te_set, batch_size = batch_size, shuffle=True)

In [9]:
!pwd

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/home/ines/JYP


In [10]:
#Hyperparameter
lr = 1e-5
epochs = 5

optimizer = optim.Adam(model.parameters(), lr=lr, eps=1e-8)

wandb.init(project='NSMC_Classification')
wandb.run.name='Res_epoch_5'
wandb.run.save()

args = {
    "learning_rate": lr,
    "epochs": epochs,
    "batch_size": batch_size
}
wandb.config.update(args)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mqkrwodbs0824[0m ([33mines_[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [11]:
#Train & Validation
all_preds = []
all_labels = []

loss = 0
total_loss = 0
best_acc = 0.0

for epoch in range(epochs):
    total_loss = 0
    correct=0
    total_sample = 0
    
    model.train()
    for step, batch in enumerate(tr_dl):
        optimizer.zero_grad()

        texts, attention_masks, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        texts, attention_masks, labels = texts.to(device), attention_masks.to(device), labels.to(device)
        outputs = model(input_ids = texts, attention_mask = attention_masks, labels=labels)
        
        loss = outputs.loss
        logits = outputs.logits
        pred = torch.argmax(torch.sigmoid(logits), dim=1)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        pred = pred.tolist()
        labels = labels.tolist()
        accuracy = accuracy_score(pred, labels)
        
        if step % 500 == 0:
            print(f"epoch:{epoch+1} - steps: {step} Tr_loss: {loss.item():.4f}  Tr_accuracy: {accuracy:.4f}")
            wandb.log({"Training Accuracy": accuracy})
            
    model.eval()
    with torch.no_grad():
        val_total_loss = 0
        val_total_cor = 0
        for v_step, batch in enumerate(val_dl):
            val_texts, val_attention_masks, val_labels = batch['input_ids'], batch['attention_mask'], batch['labels']
            val_texts, val_attention_masks, val_labels = val_texts.to(device), val_attention_masks.to(device), val_labels.to(device)
            val_outputs = model(input_ids = val_texts, attention_mask = val_attention_masks, labels=val_labels)

            val_loss = val_outputs.loss
            val_logits = val_outputs.logits
            val_pred = torch.argmax(torch.sigmoid(val_logits), dim=1)
            val_total_sample = len(val_dl.dataset)

            val_total_loss += val_loss.item()
            val_avg_loss = val_total_loss / val_total_sample
        
        val_pred = val_pred.tolist()
        val_labels = val_labels.tolist()
        val_acc = accuracy_score(val_pred, val_labels)
        print(f"val_loss: {val_avg_loss:.4f} val_acc: {val_acc:.4f}")

    if accuracy >= best_acc:
        best_acc = accuracy
        torch.save(model.state_dict(), 'NSMC_Bert.pth')
    
print(f"Best accuracy: {best_acc:.4f}")

epoch:1 - steps: 0 Tr_loss: 0.6700  Tr_accuracy: 0.5625
epoch:1 - steps: 500 Tr_loss: 0.3678  Tr_accuracy: 0.8750
epoch:1 - steps: 1000 Tr_loss: 0.7176  Tr_accuracy: 0.5625
epoch:1 - steps: 1500 Tr_loss: 0.5891  Tr_accuracy: 0.6875
epoch:1 - steps: 2000 Tr_loss: 0.2263  Tr_accuracy: 0.9375
epoch:1 - steps: 2500 Tr_loss: 0.4548  Tr_accuracy: 0.7500
epoch:1 - steps: 3000 Tr_loss: 0.3110  Tr_accuracy: 0.9375
epoch:1 - steps: 3500 Tr_loss: 0.6454  Tr_accuracy: 0.6875
epoch:1 - steps: 4000 Tr_loss: 0.2518  Tr_accuracy: 0.8750
val_loss: 0.0237 val_acc: 0.6667
epoch:2 - steps: 0 Tr_loss: 0.2640  Tr_accuracy: 0.9375
epoch:2 - steps: 500 Tr_loss: 0.1974  Tr_accuracy: 0.8750
epoch:2 - steps: 1000 Tr_loss: 0.1533  Tr_accuracy: 0.9375
epoch:2 - steps: 1500 Tr_loss: 0.4013  Tr_accuracy: 0.8125
epoch:2 - steps: 2000 Tr_loss: 0.3598  Tr_accuracy: 0.8750
epoch:2 - steps: 2500 Tr_loss: 0.2872  Tr_accuracy: 0.8750
epoch:2 - steps: 3000 Tr_loss: 0.2700  Tr_accuracy: 0.8750
epoch:2 - steps: 3500 Tr_loss: 

In [20]:
#test
model.eval()
with torch.no_grad():
    for batch in te_dl:
        te_texts, te_att_masks, te_lab = batch['input_ids'], batch['attention_mask'], batch['labels']
        te_texts, te_att_masks, te_lab = te_texts.to(device), te_att_masks.to(device), te_lab.to(device)
    
        te_outputs = model(input_ids=te_texts, attention_mask=te_att_masks, labels=te_lab)
        te_logits = te_outputs.logits
        te_pred = torch.argmax(torch.sigmoid(te_logits), dim=1)

te_pred = te_pred.tolist()
te_lab = te_lab.tolist()
accuracy = accuracy_score(te_pred, te_lab)
print(f"Total Accuracy on Test Data: {accuracy:.4f}")

Total Accuracy on Test Data: 1.0000
