In [1]:
import pandas as pd
from dataset import read_intention_data, read_intention_data_multiclass
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import os
import matplotlib.pyplot as plt 
from transformers import AdamW
from tqdm import tqdm 

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = torch.device("cuda")

# Data exploration

In [3]:
LABEL_2_ID = {'TRADEMARK': 0, 'INTEREST_RATE': 1, 'ACCOUNT': 2, 'SECURITY': 3, 'CARD': 4, 'SAVING': 5, 'CUSTOMER_SUPPORT': 6,
                'PROMOTION': 7, 'MONEY_TRANSFER': 8, 'PAYMENT': 9, 'DISCOUNT': 10, 'LOAN': 11, 'OTHER': 12, 'INTERNET_BANKING': 13}


ID_2_LABEL = {0: 'TRADEMARK', 1: 'INTEREST_RATE', 2: 'ACCOUNT', 3: 'SECURITY', 4: 'CARD', 5: 'SAVING', 6: 'CUSTOMER_SUPPORT', 7: 'PROMOTION', 8: 'MONEY_TRANSFER', 9: 'PAYMENT', 10: 'DISCOUNT', 11: 'LOAN', 12: 'OTHER', 13: 'INTERNET_BANKING'}

num_classes = len(LABEL_2_ID)

In [137]:
df_train = read_intention_data("./data/intention/train.txt")
# df_test = read_sentiment_data("./data/sentiment/test.txt")

df_train = pd.DataFrame(data=df_train)
df_train = df_train.convert_dtypes()

In [138]:
# count of each class 
label = df_train["label"]


def get_class_count_multiclass(label: pd.Series, classname): 
    label = label.apply(func=lambda x: x[classname] == 1)
    pos = label[label == True].count()

    return pos 

for i in range(num_classes): 
    print(f"class {ID_2_LABEL[i]} has {get_class_count_multiclass(label, classname=i)} instances")

print(f"Number of quotes: {df_train['label'].count()}")

class TRADEMARK has 699 instances
class INTEREST_RATE has 68 instances
class ACCOUNT has 5 instances
class SECURITY has 5 instances
class CARD has 67 instances
class SAVING has 13 instances
class CUSTOMER_SUPPORT has 784 instances
class PROMOTION has 53 instances
class MONEY_TRANSFER has 36 instances
class PAYMENT has 15 instances
class DISCOUNT has 42 instances
class LOAN has 74 instances
class OTHER has 69 instances
class INTERNET_BANKING has 79 instances
Number of quotes: 1977


# Model

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


base_model = AutoModelForSequenceClassification.from_pretrained(
    "5CD-AI/Vietnamese-Sentiment-visobert")

In [5]:
base_model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=7

In [4]:
base_model = base_model.base_model

In [30]:
print(base_model)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(15004, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [5]:
class MultiLabelVisobert(nn.Module): 
    def __init__(self, base_model, encoder_outc, num_classes): 
        super().__init__()
        
        self.base_model = base_model 
        self.classifier = nn.ModuleList([
            nn.Linear(in_features=encoder_outc, out_features=encoder_outc, bias=True), 
            nn.Dropout(p=0.1, inplace=False), 
            nn.Linear(in_features=encoder_outc, out_features=num_classes, bias=True)
        ])

    def forward(self, x, attention_mask): 
        x = self.base_model(x, attention_mask) 
        x = x.last_hidden_state
        x = x[:, 0, :]
        for module in self.classifier: 
            x = module(x) 

        return x 

In [6]:
for params in base_model.parameters(): 
    params.requires_grad = False

In [144]:
model = MultiLabelVisobert(base_model=base_model, encoder_outc=768, num_classes=num_classes).to(device)

In [145]:
model

MultiLabelVisobert(
  (base_model): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)

# Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    "5CD-AI/Vietnamese-Sentiment-visobert")

# Train

In this dataset, 1 quote can be associated with multiple intentions. This means that we can train this model as a multi-label classifier. We can also just train this as a pure multi-class classifier because the data for some intentions are very rare, causing them to not be sufficiently trained.  

## Multi-label train

In [147]:
train_tokens = tokenizer(df_train["value"].to_list(), truncation=True, padding=True, return_tensors="pt")

In [17]:
class VisoDataset(Dataset): 
    def __init__(self, tokens: pd.Series, label: pd.Series): 
        self.label = label
        self.input_ids = tokens["input_ids"]
        self.attention_mask = tokens["attention_mask"]

        self.length = len(self.input_ids)

    def __len__(self): 
        return self.length 

    def __getitem__(self, idx): 
        label = torch.tensor(self.label.loc[idx], dtype=torch.float32) 
        input_id = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]

        return {
            "labels": label, 
            "input_ids": input_id, 
            "attention_mask": attention_mask,
        }

In [149]:
train_dataset = VisoDataset(tokens=train_tokens, label=df_train["label"])

In [150]:
train_dataset[0]

{'labels': tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'input_ids': tensor([   0, 2615,  749,  970,   50, 2786,   17, 2321,    2,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1

In [151]:
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True, num_workers=4, pin_memory=True)

In [152]:
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.BCEWithLogitsLoss(reduction="mean")

loss_history = [] 



In [153]:
epochs = 10

for i in tqdm(range(epochs), desc="Epochs", total=epochs): 
    epoch_loss = 0.0 
    total_steps = len(train_loader)

    for input_dict in train_loader: 
        input_ids = input_dict["input_ids"].to(device)
        labels = input_dict["labels"].to(device)
        attention_mask = input_dict["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    loss_history.append(epoch_loss / total_steps)

Epochs: 100%|██████████| 10/10 [05:33<00:00, 33.31s/it]


In [154]:
loss_history

[0.26008071052004594,
 0.15660022032456566,
 0.16292103533727126,
 0.1848858987303413,
 0.19245248282113056,
 0.19669333255856833,
 0.21083191876479593,
 0.21409094560291017,
 0.24566511762552384,
 0.19201297443849746]

## Multi-class train 

For multi-class train, we need to normalize the dataset so that there are no class imbalances

In [8]:
LABEL_2_ID_MULTICLASS = {"TRADEMARK": 0, "CUSTOMER_SUPPORT": 1, "OTHER": 2}
ID_2_LABEL_MULTICLASS = {0: "TRADEMARK", 1: "CUSTOMER_SUPPORT", 2: "OTHER"}

num_classes_multiclass = len(LABEL_2_ID_MULTICLASS)

In [9]:
df_train_multiclass = read_intention_data_multiclass(
    "./data/intention/train_multiclass.txt")

df_train_multiclass = pd.DataFrame(df_train_multiclass)
df_train_multiclass = df_train_multiclass.convert_dtypes()

In [10]:
df_train_multiclass

Unnamed: 0,id,label,value
0,0,1,Cần tư vấn mà add k rep
1,1,1,Hotline khó gọi quá gọi mãi ko thưa máy à
2,2,1,Mình thấy câu dịch vụ tốt nhất cho kh khó lắm....
3,3,0,Em chọn chuyển tiền trong nước. Chuyển đến số ...
4,4,2,Mình xài cái thể VISA của BIDV hạn mức 100tr
...,...,...,...
1972,1972,1,Dạ em cảm ơn
1973,1973,2,Có kinh nghiệm nhưng phải bằng đại học chính q...
1974,1974,1,Vietcombank tks add trước nha
1975,1975,1,Vietcombank ok tks add


In [11]:
label = df_train_multiclass["label"]

def get_class_count_multiclass(label: pd.Series, classname): 
    pos = label[label == classname].count()
    return pos 

for i in range(num_classes_multiclass): 
    count = get_class_count_multiclass(label, classname=i)
    print(f"class {ID_2_LABEL_MULTICLASS[i]} has {count} instances, taking up {count / label.count() * 100}% of the dataset")

print(f"Number of quotes: {df_train_multiclass['label'].count()}")

class TRADEMARK has 699 instances, taking up 35.35660091047041% of the dataset
class CUSTOMER_SUPPORT has 774 instances, taking up 39.150227617602425% of the dataset
class OTHER has 504 instances, taking up 25.493171471927162% of the dataset
Number of quotes: 1977


In [12]:
model_multiclass = MultiLabelVisobert(base_model=base_model, encoder_outc=768, num_classes=num_classes_multiclass).to(device)

In [13]:
train_tokens_multiclass = tokenizer(df_train_multiclass["value"].to_list(), truncation=True, padding=True, return_tensors="pt")

In [26]:
train_dataset_multiclass = VisoDataset(tokens=train_tokens_multiclass, label=df_train_multiclass["label"])
train_loader_multiclass = DataLoader(dataset=train_dataset_multiclass, batch_size=128, shuffle=True, num_workers=16, pin_memory=True)

In [27]:
optimizer_multiclass = AdamW(model_multiclass.parameters(), lr=1e-5)
criterion_multiclass = nn.CrossEntropyLoss(reduction="mean")

loss_history_multiclass = [] 

In [28]:
epochs = 10

for i in tqdm(range(epochs), desc="Epochs", total=epochs): 
    epoch_loss = 0.0 
    total_steps = len(train_loader_multiclass)

    for input_dict in train_loader_multiclass: 
        input_ids = input_dict["input_ids"].to(device)
        labels = input_dict["labels"].long().to(device)
        attention_mask = input_dict["attention_mask"].to(device)

        logits = model_multiclass(input_ids, attention_mask)
        loss = criterion_multiclass(logits, labels)

        loss.backward()
        optimizer_multiclass.step()

        epoch_loss += loss.item()

    loss_history_multiclass.append(epoch_loss / total_steps)

Epochs: 100%|██████████| 10/10 [05:03<00:00, 30.35s/it]


In [29]:
loss_history_multiclass

[0.8206733800470829,
 0.8539123684167862,
 0.8558911122381687,
 0.7922081016004086,
 0.716634813696146,
 0.6465386040508747,
 0.621187211945653,
 0.650752292945981,
 0.6999602951109409,
 0.7664244472980499]