#### Import libraries

In [7]:
import os

import pandas as pd
import torch
from underthesea import word_tokenize
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm
import joblib

#### Load dataset

In [2]:
# Read the train / dev / test splits from the interim data folder, in that order.
train_df, dev_df, test_df = map(
    pd.read_csv,
    (
        "D:/Projects/Sentiment_Analysis/data/interim/train.csv",
        "D:/Projects/Sentiment_Analysis/data/interim/dev.csv",
        "D:/Projects/Sentiment_Analysis/data/interim/test.csv",
    ),
)

# Report the (rows, columns) shape of every split as a quick sanity check.
for split_label, split_frame in (("Train:", train_df), ("Dev:", dev_df), ("Test:", test_df)):
    print(split_label, split_frame.shape)

Train: (11426, 3)
Dev: (3166, 3)
Test: (3166, 3)


#### Load PhoBERT model and tokenizer

In [3]:
# Checkpoint identifier shared by the encoder and its tokenizer.
model_name = "vinai/phobert-base"

# Load the encoder and the matching tokenizer from the same checkpoint.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Switch the encoder to evaluation mode (same effect as model.eval()); the call
# returns the module itself, so the cell still displays the architecture.
model.train(False)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

#### Text → Embedding Function

In [4]:
def get_phobert_embedding(texts, batch_size=8):
    """Encode sentences into fixed-size vectors with the notebook's PhoBERT model.

    Sentences are processed in batches: each batch is tokenized (padded to the
    longest sentence in the batch, truncated to 256 tokens), passed through the
    module-level ``model``, and the last-layer hidden state at position 0 of
    every sequence is kept as that sentence's embedding.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    NOTE(review): PhoBERT is documented as expecting word-segmented Vietnamese
    input; ``word_tokenize`` is imported in this notebook but never applied
    here -- confirm the sentences are already segmented upstream.

    Args:
        texts: Iterable of strings to embed (list, tuple, pandas Series, ...).
        batch_size: Number of sentences per forward pass; must be >= 1.

    Returns:
        np.ndarray with one row per input sentence, in input order, and one
        column per hidden unit. Empty input yields a ``(0, hidden_size)`` array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialize as a plain list so slicing always hands the tokenizer a list
    # (a pandas Series slice would be rejected by the tokenizer).
    texts = list(texts)

    # Run on whatever device the model currently lives on (CPU or GPU).
    first_param = next(model.parameters())
    device = first_param.device

    if not texts:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        return torch.empty(
            (0, model.config.hidden_size), dtype=first_param.dtype
        ).numpy()

    embeddings = []

    with torch.no_grad():  # inference only, no autograd bookkeeping
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i + batch_size]
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=256,
                return_tensors="pt"
            )
            # Inputs must be on the same device as the model weights.
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
            outputs = model(**encoded)
            # Position 0 of each sequence; .cpu() is required before .numpy()
            # whenever the model runs on a GPU (and is a no-op on CPU).
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embeddings)
    return np.vstack(embeddings)

#### Create embeddings for each split

In [5]:
# Embed the "sentence" column of each split, in train -> dev -> test order.
X_train_pho, X_dev_pho, X_test_pho = (
    get_phobert_embedding(split_frame["sentence"].tolist())
    for split_frame in (train_df, dev_df, test_df)
)

100%|██████████████████████████████████████████████████████████████████████████████| 1429/1429 [33:58<00:00,  1.43s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 396/396 [08:02<00:00,  1.22s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 396/396 [08:56<00:00,  1.36s/it]


#### Save data

In [10]:
# Destination folders: embedding matrices go under data/features, label
# vectors under models (same locations as before).
FEATURE_DIR = "D:/Projects/Sentiment_Analysis/data/features/PhoBERT"
LABEL_DIR = "D:/Projects/Sentiment_Analysis/models/PhoBERT"

# Create the output folders up front: the embeddings take a long time to
# compute, so a missing directory should not be what makes the save fail.
os.makedirs(FEATURE_DIR, exist_ok=True)
os.makedirs(LABEL_DIR, exist_ok=True)

# Each matrix is passed positionally, so it is stored under the default
# key "arr_0" inside its .npz archive.
np.savez_compressed(f"{FEATURE_DIR}/X_train_phobert.npz", X_train_pho)
np.savez_compressed(f"{FEATURE_DIR}/X_dev_phobert.npz", X_dev_pho)
np.savez_compressed(f"{FEATURE_DIR}/X_test_phobert.npz", X_test_pho)

# Persist the "sentiment" column of each split as a NumPy array.
joblib.dump(train_df["sentiment"].values, f"{LABEL_DIR}/y_train.pkl")
joblib.dump(dev_df["sentiment"].values, f"{LABEL_DIR}/y_dev.pkl")
joblib.dump(test_df["sentiment"].values, f"{LABEL_DIR}/y_test.pkl")

print("✅ PhoBERT embeddings đã được lưu thành công!")

✅ PhoBERT embeddings đã được lưu thành công!
