In [2]:
import pandas as pd
from dataset import read_sentiment_data
import torch 
from torch.utils.data import DataLoader, Dataset
import numpy as np 
import os
import matplotlib.pyplot as plt 

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
device = torch.device("cuda")

# Data Exploration

In [5]:
ID_2_LABEL = {
    2: "positive", 
    1: "negative", 
    0: "neutral", 
}

LABEL_2_ID = {
     "positive": 2, 
     "negative": 1, 
     "neutral": 0 
}

In [6]:
df_train = read_sentiment_data("./data/sentiment/train.txt")
df_test = read_sentiment_data("./data/sentiment/test.txt")

In [7]:
df_train = pd.DataFrame(data=df_train)
df_train = df_train.convert_dtypes()

df_test = pd.DataFrame(data=df_test) 
df_test = df_test.convert_dtypes()

In [8]:
print(df_train) 

        id  label                                              value
0        0      1                          Cần tư vấn mà add  k rep

1        1      1         Hotline khó gọi quá gọi mãi ko thưa máy à

2        2      1  Mình thấy câu dịch vụ tốt nhất cho kh khó lắm....
3        3      1  Em chọn chuyển tiền trong nước. Chuyển đến số ...
4        4      1       Mình xài cái thể VISA của BIDV hạn mức 100tr
...    ...    ...                                                ...
1972  1972      2                                      Dạ em cảm ơn

1973  1973      1  Có kinh nghiệm nhưng phải bằng đại học chính q...
1974  1974      2                     Vietcombank tks add trước nha

1975  1975      2                            Vietcombank ok tks add

1976  1976      1                  Gọi k được mà tốn tiền như gì ấy


[1977 rows x 3 columns]


In [9]:
positives = df_train['label'][df_train['label'] == 2].count()
negatives = df_train['label'][df_train['label'] == 1].count()
neutrals = df_train['label'][df_train['label'] == 0].count()

print(f"positive label count: {positives}")
print(f"negative label count: {negatives}")
print(f"neutral label count: {neutrals}")

print(f"total: {positives + negatives + neutrals} == {df_train['label'].count()}")

positive label count: 1211
negative label count: 743
neutral label count: 23
total: 1977 == 1977


# Model 

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained(
    "5CD-AI/Vietnamese-Sentiment-visobert")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
model = model.to(device)
print(model)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=7

## Tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained(
    "5CD-AI/Vietnamese-Sentiment-visobert")

In [13]:
vocab = tokenizer.vocab
reversed_vocab = {v: k for k, v in vocab.items()} # reverse it so that we can retrieve the text from the token 
tokenizer

XLMRobertaTokenizerFast(name_or_path='5CD-AI/Vietnamese-Sentiment-visobert', vocab_size=15002, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	15001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [14]:
df_train.loc[0].value

'Cần tư vấn mà add  k rep\n'

In [15]:
t = tokenizer.encode(text=df_train.loc[0].value)

In [16]:
for id in t: 
    print(f"Token: {id} -> {reversed_vocab[id]}") # ignores new line character, a property of sentencepiece 

Token: 0 -> <s>
Token: 2615 -> ▁Cần
Token: 749 -> ▁tư
Token: 970 -> ▁vấn
Token: 50 -> ▁mà
Token: 2786 -> ▁add
Token: 17 -> ▁k
Token: 2321 -> ▁rep
Token: 2 -> </s>


In [None]:
t_long = tokenizer.encode(text=df_train.loc[64].value)

for id in t_long: 
    print(f"Token: {id} -> {reversed_vocab[id]}")

In [17]:
s = tokenizer.decode(t, skip_special_tokens=True) 
print(s)

Cần tư vấn mà add k rep


# Inference before training

In [18]:
# input = df_train.loc[0]
# label = input.label 
# input = tokenizer.encode(text=input.value) 

# input = torch.tensor(input, dtype=torch.int32) 
# input = input.reshape(1, -1)
# input = input.to("cuda")
# print(input.shape) # (batch size, token length)

In [19]:
# embedding = model.get_input_embeddings()
# embedding_vector = embedding(input)
# print(embedding_vector.shape) # (batch size, token length, embed_dim)

In [20]:
# output = model(input, labels=torch.tensor([1]).unsqueeze(0)) # the labels of the input) 
# loss = output.loss
# print(output)

In [21]:
# res = torch.argmax(output.logits) 
# print(res)
# print(f"Prediction label: {ID_2_LABEL[res.item()]}")
# print(f"Real label: {ID_2_LABEL[label]}")

In [22]:
tokenize_texts = df_train["value"][:500]
tokenize_texts = tokenize_texts.to_list()

In [23]:
tokenize_texts = tokenizer(tokenize_texts, truncation=True, padding=True, return_tensors="pt").to("cpu")

In [24]:
tokens = tokenize_texts["input_ids"]
attention_mask = tokenize_texts["attention_mask"]

In [25]:
# outputs = model(tokens, attention_mask=attention_mask)

# Train

In [26]:
train_tokens = tokenizer(df_train["value"].to_list(), truncation=True, padding=True, return_tensors="pt")

In [27]:
class VisoDataset(Dataset): 
    def __init__(self, tokens: pd.Series, label: pd.Series): 
        self.label = label
        self.input_ids = tokens["input_ids"]
        self.attention_mask = tokens["attention_mask"]

        self.length = len(self.input_ids)

    def __len__(self): 
        return self.length 

    def __getitem__(self, idx): 
        label = self.label.loc[idx]
        input_id = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]

        return {
            "labels": label, 
            "input_ids": input_id, 
            "attention_mask": attention_mask,
        }


In [28]:
train_dataset = VisoDataset(tokens=train_tokens, label=df_train["label"])

In [29]:
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, num_workers=8, pin_memory=True)

In [30]:
for params in model.base_model.parameters(): 
    params.requires_grad = False

In [31]:
from transformers import AdamW
from tqdm import tqdm 

optimizer = AdamW(model.parameters(), lr=1e-5)



In [32]:
loss_history = [] 

In [33]:
epochs = 6 

for i in tqdm(range(epochs), desc="Epoch", total=epochs): 
    losses = 0.0
    total_steps = len(train_loader)
    for input_dict in tqdm(train_loader, desc="Train step", total=total_steps): 
        input_ids = input_dict["input_ids"].to(device)
        labels = input_dict["labels"].to(device)
        attention_mask = input_dict["attention_mask"].to(device) 

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        print(outputs.logits.shape)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        losses += loss.item()

    loss_history.append(losses / total_steps) 

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

torch.Size([64, 3])
torch.Size([64, 3])




torch.Size([64, 3])




torch.Size([64, 3])




torch.Size([64, 3])




torch.Size([64, 3])


Train step:  16%|█▌        | 5/31 [00:05<00:30,  1.17s/it]
Epoch:   0%|          | 0/6 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [194]:
loss_history

[4.856175491886754,
 3.0950295540594284,
 1.4152748738565752,
 0.5592228068459418,
 0.4253804289525555,
 0.4852882090114778]

In [None]:
plt.plot(loss_history, np.arange(1, 7))