In [1]:
from transformers import AutoTokenizer, AutoModel, MT5Model, AutoConfig
import torch

In [2]:
import pandas as pd
# Load the intent-recognition dataset (JSON) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; anyone re-running this
# notebook must have the file at exactly this location (or edit the path).
data = pd.read_json('/Graduation_Research_1/dataset/intent/english_laptop_store.json')

In [3]:
# Preview the first rows to confirm the frame has `sentence` and `intent` columns.
print(data.head())

                                            sentence                 intent
0               I'm very interested in this product.  Interest Confirmation
1               This looks like exactly what I need.  Interest Confirmation
2          I want to buy this one; it looks perfect!  Interest Confirmation
3  This is really nice. I think it's the one for me.  Interest Confirmation
4  I’m planning to purchase this. Can you help me...  Interest Confirmation


In [4]:
# Shuffle the rows once. The seed is fixed so that the row order -- and therefore
# every split derived from it further down -- is identical after a kernel restart.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
print(data.head())

                                            sentence                intent
0                 Sản phẩm này có giá bao nhiêu vậy?         Price Inquiry
1  Giữa MacBook Pro và Dell XPS 15, chiếc nào có ...    Comparison Inquiry
2  What are the closest alternatives to this laptop?          Find Similar
3  Chiếc này có thể mua được ở cửa hàng gần đây k...  Product Availability
4  Tôi sẽ mua chiếc này vì nó đáp ứng được hết nh...     Purchase Decision


In [5]:
# Pull the raw sentences (features) and intent names (targets) out as arrays.
x, y = data['sentence'].values, data['intent'].values

In [6]:
# Show the first sentence and its intent, one per line.
print(x[0], y[0], sep="\n")

Sản phẩm này có giá bao nhiêu vậy?
Price Inquiry


In [7]:
import numpy as np
# Distinct intent names found in `y`; their position in this array becomes the class id.
unique_label = np.unique(y)

In [8]:
# List the intent classes that the classifier will have to distinguish.
print(unique_label)

['Change Decision' 'Comparison Inquiry' 'Delivery Options'
 'Feature Confirmation' 'Find Similar' 'Interest Confirmation'
 'Payment Options' 'Price Inquiry' 'Product Availability'
 'Purchase Decision' 'Return Policy Inquiry' 'Specific Need'
 'Thank You/Closing' 'Warranty Inquiry']


In [9]:
# Two-way lookup between an intent name and its integer class id
# (the id is the position of the name inside `unique_label`).
label2index = {label: idx for idx, label in enumerate(unique_label)}
index2label = {idx: label for label, idx in label2index.items()}


In [10]:
# Display both lookup tables, one per line.
print(label2index, index2label, sep="\n")

{'Change Decision': 0, 'Comparison Inquiry': 1, 'Delivery Options': 2, 'Feature Confirmation': 3, 'Find Similar': 4, 'Interest Confirmation': 5, 'Payment Options': 6, 'Price Inquiry': 7, 'Product Availability': 8, 'Purchase Decision': 9, 'Return Policy Inquiry': 10, 'Specific Need': 11, 'Thank You/Closing': 12, 'Warranty Inquiry': 13}
{0: 'Change Decision', 1: 'Comparison Inquiry', 2: 'Delivery Options', 3: 'Feature Confirmation', 4: 'Find Similar', 5: 'Interest Confirmation', 6: 'Payment Options', 7: 'Price Inquiry', 8: 'Product Availability', 9: 'Purchase Decision', 10: 'Return Policy Inquiry', 11: 'Specific Need', 12: 'Thank You/Closing', 13: 'Warranty Inquiry'}


In [11]:
# Replace every intent name in `y` by its integer id from `label2index`.
# NOTE(review): `y` is overwritten in place, so this cell is not idempotent --
# running it a second time feeds ids (not names) into `label2index.get`.
# Re-run from the cell that defines `y` instead.
y = np.vectorize(label2index.get)(y)

In [12]:
print(x[0])
print(y[0]) # check for the vectorize

Sản phẩm này có giá bao nhiêu vậy?
7


In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; the remainder is split again
# into train/validation in the next cell.
x_trainVal, x_test, y_trainVal, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(len(x_trainVal), len(y_trainVal), sep="\n")

KeyboardInterrupt: 

In [14]:
# Carve the validation set (15%) out of the train+validation pool.
# The seed matches the one used for the test split so that the train/validation
# partition is the same on every run.
x_train, x_val, y_train, y_val = train_test_split(
    x_trainVal, y_trainVal, test_size=0.15, random_state=42
)

In [15]:
# Feature/target counts for the train, test and validation splits (in that order).
for features, targets in ((x_train, y_train), (x_test, y_test), (x_val, y_val)):
    print(len(features), len(targets))

586 586
173 173
104 104


In [16]:
# Bundle each split as a column-name -> values mapping.
dataset_train = dict(sentence=x_train, intent=y_train)
dataset_valid = dict(sentence=x_val, intent=y_val)
dataset_test = dict(sentence=x_test, intent=y_test)


In [17]:
# Turn each split into a DataFrame with `sentence` and `intent` columns.
dataset_train, dataset_valid, dataset_test = map(
    pd.DataFrame, (dataset_train, dataset_valid, dataset_test)
)

In [18]:
# Longest test sentence measured in whitespace-separated words.
# NOTE(review): this counts words of the test split only, not tokenizer tokens,
# and the tokenization cells below use a literal max_length of 70 instead of this value.
length = [len(words) for words in map(str.split, dataset_test['sentence'])]
max_length = max(length)
print(max_length)

25


In [19]:
# Preview the first rows of the train, test and validation frames (in that order).
for frame in (dataset_train, dataset_test, dataset_valid):
    print(frame.head())

                                            sentence  intent
0  Tôi có thể gia hạn bảo hành cho chiếc máy này ...      12
1  Tôi muốn biết thêm chi tiết về sản phẩm này, n...       4
2  Tôi sẽ lấy chiếc này, vui lòng giúp tôi thanh ...       8
3  Mình cần một chiếc máy có hiệu suất cao để làm...      10
4  Chiếc máy này nhìn có vẻ phù hợp với nhu cầu c...       4
                                            sentence  intent
0             Is there a restocking fee for returns?       9
1                      Let’s go ahead with this one.       8
2  Cũng có thể tôi sẽ chọn chiếc màn hình 14 inch...       0
3          Đây là sản phẩm tôi cần, tôi sẽ mua ngay.       8
4             Chiếc này có mức giá khuyến mãi không?       6
                                            sentence  intent
0  Bảo hành của sản phẩm này có bao gồm phần mềm ...      12
1       Ở đây có Acer Zenflip bản 32 GB ram không ạ?       7
2  Có những phương thức thanh toán nào khi mua hàng?       5
3           Sản phẩm có 

In [20]:
import os
from huggingface_hub import hf_hub_download
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel, MT5Model


class MT5EncoderForIntentRecognition(torch.nn.Module):
    """Intent classifier made of an MT5 encoder, dropout and a linear head.

    The encoder output at the first sequence position is used as the sentence
    representation that is fed to the classification head.
    """

    def __init__(self, encoder, num_labels):
        """Assemble the classifier around an existing encoder.

        Args:
            encoder: Encoder module. It must expose ``config.d_model`` and, when
                called with ``input_ids``/``attention_mask``, return an object
                with a ``last_hidden_state`` attribute.
            num_labels: Number of intent classes (output size of the head).
        """
        super().__init__()
        self.encoder = encoder
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(self.encoder.config.d_model, num_labels)

    @classmethod
    def from_pretrained(cls, model_name, num_labels):
        """Build a model whose encoder weights come from a saved checkpoint.

        The checkpoint file ``pytorch_model.bin`` is expected to be a dict with
        an ``'encoder_state_dict'`` entry. Only the encoder weights are restored;
        the classification head is always freshly initialised for ``num_labels``
        classes, so the checkpoint may have been trained on a different label set.

        Args:
            model_name: Either a Hugging Face Hub repo id or a local directory
                containing ``pytorch_model.bin``.
            num_labels: Number of intent classes for the new head.

        Returns:
            A ``MT5EncoderForIntentRecognition`` instance on the CPU.

        Raises:
            KeyError: If the checkpoint has no ``'encoder_state_dict'`` entry.
        """
        encoder = MT5Model.from_pretrained(model_name).encoder

        # A local directory is read directly; anything else is treated as a
        # Hub repo id and downloaded.
        if os.path.isdir(model_name):
            model_path = os.path.join(model_name, "pytorch_model.bin")
        else:
            model_path = hf_hub_download(repo_id=model_name, filename="pytorch_model.bin")

        # The freshly built encoder lives on the CPU, so the checkpoint is
        # mapped to the CPU as well instead of taking a detour through the GPU.
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a source you trust.
        state_dict = torch.load(model_path, map_location=torch.device('cpu'))

        # cls(...) already creates a new, randomly initialised classifier.
        model = cls(encoder, num_labels)
        # strict=False tolerates key mismatches between checkpoint and encoder.
        model.encoder.load_state_dict(state_dict['encoder_state_dict'], strict=False)
        return model

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and classify the first-position representation.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional mask passed to the encoder.
            labels: Optional class ids; when given, a cross-entropy loss is
                returned as well.

        Returns:
            ``{"logits": ...}`` or, when ``labels`` is provided,
            ``{"loss": ..., "logits": ...}``.
        """
        encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = encoder_outputs.last_hidden_state
        # Sentence representation = hidden state at sequence position 0.
        pooled_output = sequence_output[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is None:
            return {"logits": logits}

        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return {"loss": loss, "logits": logits}


In [21]:
# Checkpoint used both for the tokenizer and for the encoder weights.
model_name = "NguyenMinh03082004/intent_recognition_based_mt5_encoder"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One output unit per distinct intent found in the dataset.
model = MT5EncoderForIntentRecognition.from_pretrained(
    model_name,
    num_labels=len(unique_label),
)

# Keep the model on the CPU; the call is the last expression so the module
# structure is displayed as the cell output.
device = torch.device('cpu')
model.to(device)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Some weights of MT5Model were not initialized from the model checkpoint at NguyenMinh03082004/intent_recognition_based_mt5_encoder and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.o.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.1.EncDecAttention.v.weight', 'decoder.block.0.layer.1.layer_norm.weight', 'decoder.block.0.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.0.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer

MT5EncoderForIntentRecognition(
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): Linear(in_features=102

In [22]:
#Prepare the dataset
from transformers import DataCollatorWithPadding
def tokenize_function(examples):
    """Tokenize the ``sentence`` entry of ``examples``.

    Every sequence is padded to, and truncated at, 70 tokens. The
    module-level ``tokenizer`` is used.
    """
    sentences = examples['sentence']
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 70}
    return tokenizer(sentences, **encode_options)


In [23]:
# Tokenize each split: truncate at 70 tokens and pad to the longest sequence
# within that split.
train_encodings, val_encodings, test_encodings = (
    tokenizer(list(frame['sentence']), truncation=True, padding=True, max_length=70)
    for frame in (dataset_train, dataset_valid, dataset_test)
)

In [24]:
# Inspect the encoding of the first training sentence (token count and available attributes).
print(train_encodings[0])

Encoding(num_tokens=70, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [25]:
# convert dataset to pytorch tensors 
class IntentDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with integer labels as tensor samples.

    ``encodings`` is a mapping from field name to a per-sample sequence of
    values; ``labels`` is an indexable collection of class ids of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the target under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [26]:
# Wrap every split's encodings together with its integer labels.
train_dataset, val_dataset, test_dataset = (
    IntentDataset(encodings, list(frame['intent']))
    for encodings, frame in (
        (train_encodings, dataset_train),
        (val_encodings, dataset_valid),
        (test_encodings, dataset_test),
    )
)

In [27]:
# Inspect one training sample: tensors for each encoding field plus the `labels` tensor.
print(train_dataset[0])

{'input_ids': tensor([  366,  3571,   885,   394,   924,  2626,   382,  1292,   330,  3420,
          259,  3449,   970,   317, 12493,   297,  1382,   276,  2424,  1284,
          291,     1,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor(12)}


In [28]:
from transformers import DataCollatorWithPadding


In [29]:
from transformers import Trainer, TrainingArguments


In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def compute_matrics(pred):
    """Accuracy and weighted precision/recall/F1 for a prediction object.

    ``pred`` must provide ``label_ids`` (true class ids) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    Undefined precision/recall/F1 values are reported as 0.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    weighted = dict(average="weighted", zero_division=0)
    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores['precision'] = precision_score(y_true, y_pred, **weighted)
    scores['recall'] = recall_score(y_true, y_pred, **weighted)
    scores['f1'] = f1_score(y_true, y_pred, **weighted)
    return scores


In [31]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the best checkpoint when training ends.
training_args = TrainingArguments(
    # Relative path so checkpoints are written next to the notebook instead of
    # into the filesystem root.
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-4,
    per_device_eval_batch_size=2,
    per_device_train_batch_size=2,
    num_train_epochs= 5,
    weight_decay=0.01,
    logging_dir='.logs',
    load_best_model_at_end=True
)

# NOTE(review): the captured output shows a warning raised on this call --
# presumably a deprecation of one of the keyword arguments; confirm against
# the installed transformers version.
trainer = Trainer(
    model = model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_matrics,
    # Pads every batch to the length of its longest sequence.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

  trainer = Trainer(


In [32]:
# Re-pack every parameter whose memory layout is not contiguous before training.
# NOTE(review): presumably required so checkpoints can be serialized -- confirm.
for param in model.parameters():
    if param.is_contiguous():
        continue
    param.data = param.contiguous()

# Last expression of the cell, so the TrainOutput summary is displayed.
trainer.train()

  0%|          | 0/1465 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

{'eval_loss': 1.1497259140014648, 'eval_accuracy': 0.6826923076923077, 'eval_precision': 0.7154071355994434, 'eval_recall': 0.6826923076923077, 'eval_f1': 0.6487797074065026, 'eval_runtime': 2.4256, 'eval_samples_per_second': 42.876, 'eval_steps_per_second': 21.438, 'epoch': 1.0}
{'loss': 1.5527, 'grad_norm': 11.923018455505371, 'learning_rate': 0.000131740614334471, 'epoch': 1.71}


  0%|          | 0/52 [00:00<?, ?it/s]

{'eval_loss': 0.6548277139663696, 'eval_accuracy': 0.7692307692307693, 'eval_precision': 0.8093065431300726, 'eval_recall': 0.7692307692307693, 'eval_f1': 0.7620834415346501, 'eval_runtime': 2.4439, 'eval_samples_per_second': 42.555, 'eval_steps_per_second': 21.278, 'epoch': 2.0}


  0%|          | 0/52 [00:00<?, ?it/s]

{'eval_loss': 0.46170279383659363, 'eval_accuracy': 0.8365384615384616, 'eval_precision': 0.8709852647352647, 'eval_recall': 0.8365384615384616, 'eval_f1': 0.8315295983818252, 'eval_runtime': 2.4385, 'eval_samples_per_second': 42.649, 'eval_steps_per_second': 21.325, 'epoch': 3.0}
{'loss': 0.4328, 'grad_norm': 3.29240345954895, 'learning_rate': 6.348122866894199e-05, 'epoch': 3.41}


  0%|          | 0/52 [00:00<?, ?it/s]

{'eval_loss': 0.4011614918708801, 'eval_accuracy': 0.8557692307692307, 'eval_precision': 0.8850961538461538, 'eval_recall': 0.8557692307692307, 'eval_f1': 0.8547772403035562, 'eval_runtime': 2.4319, 'eval_samples_per_second': 42.766, 'eval_steps_per_second': 21.383, 'epoch': 4.0}


  0%|          | 0/52 [00:00<?, ?it/s]

{'eval_loss': 0.3590450882911682, 'eval_accuracy': 0.9038461538461539, 'eval_precision': 0.9141802641802641, 'eval_recall': 0.9038461538461539, 'eval_f1': 0.9019537999746473, 'eval_runtime': 2.4997, 'eval_samples_per_second': 41.606, 'eval_steps_per_second': 20.803, 'epoch': 5.0}
{'train_runtime': 1507.3177, 'train_samples_per_second': 1.944, 'train_steps_per_second': 0.972, 'train_loss': 0.7356890837893958, 'epoch': 5.0}


TrainOutput(global_step=1465, training_loss=0.7356890837893958, metrics={'train_runtime': 1507.3177, 'train_samples_per_second': 1.944, 'train_steps_per_second': 0.972, 'total_flos': 0.0, 'train_loss': 0.7356890837893958, 'epoch': 5.0})

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Overall and per-label classification metrics for a prediction object.

    ``pred`` must provide ``label_ids`` (true class ids) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.

    Returns a dict with accuracy, weighted precision/recall/F1 and, under the
    ``*_per_label`` keys, the unaveraged precision/recall/F1 per class.
    Undefined values are reported as 0.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # (overall key, per-label key, scoring function)
    scorers = (
        ('precision', 'precision_per_label', precision_score),
        ('recall', 'recall_per_label', recall_score),
        ('f1', 'f1_per_label', f1_score),
    )

    report = {'accuracy': accuracy_score(y_true, y_pred)}

    # Weighted averages over all classes.
    for overall_key, _, scorer in scorers:
        report[overall_key] = scorer(y_true, y_pred, average="weighted", zero_division=0)

    # Metrics for each label
    for _, per_label_key, scorer in scorers:
        report[per_label_key] = scorer(y_true, y_pred, average=None, zero_division=0)

    return report


In [38]:
# Separate Trainer for the held-out test split; it reports the extended
# metrics (overall + per-label) from `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=test_dataset,
)

# Score the model on the test split and show the resulting metrics dict.
results = trainer.evaluate()
print(results)

  trainer = Trainer(


  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': 0.2998286783695221, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.9248554913294798, 'eval_precision': 0.9282453577251265, 'eval_recall': 0.9248554913294798, 'eval_f1': 0.9242407524048754, 'eval_precision_per_label': array([1.        , 0.85714286, 0.90909091, 0.9       , 0.85714286,
       1.        , 0.86666667, 0.76923077, 0.90909091, 0.92307692,
       1.        , 1.        , 1.        ]), 'eval_recall_per_label': array([0.85      , 0.92307692, 0.76923077, 1.        , 0.85714286,
       1.        , 1.        , 0.76923077, 1.        , 1.        ,
       0.91666667, 1.        , 0.93333333]), 'eval_f1_per_label': array([0.91891892, 0.88888889, 0.83333333, 0.94736842, 0.85714286,
       1.        , 0.92857143, 0.76923077, 0.95238095, 0.96      ,
       0.95652174, 1.        , 0.96551724]), 'eval_runtime': 5.2412, 'eval_samples_per_second': 33.008, 'eval_steps_per_second': 16.599}


In [39]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer

save_directory = "./mt5_encoder_intent_for_laptop_conversation"
# exist_ok=True creates the directory if needed and is a no-op otherwise,
# without a separate existence check.
os.makedirs(save_directory, exist_ok=True)

# Save model state dictionary: encoder and classifier weights are stored under
# separate keys of a single checkpoint file.
torch.save({
    'encoder_state_dict': model.encoder.state_dict(),
    'classifier_state_dict': model.classifier.state_dict()
}, os.path.join(save_directory, "pytorch_model.bin"))

# Save tokenizer
tokenizer.save_pretrained(save_directory)

# Save configuration
# NOTE(review): the configuration is fetched from "google/mt5-small" rather than
# taken from the trained model -- confirm that it matches the encoder saved above.
config = AutoConfig.from_pretrained("google/mt5-small")
config.save_pretrained(save_directory)

# Switch to inference mode; as the last expression it also displays the module structure.
model.eval()


MT5EncoderForIntentRecognition(
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): Linear(in_features=102