In [22]:
from transformers import AutoModel
from transformers import AutoTokenizer

In [23]:
# The tokenizer is loaded from the identifier "vinai/phobert-base-v2", while
# the encoder weights are loaded from the local directory in `model_path`.
model_path = "/home/link/spaces/LinhCSE/models/retriever"
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
model = AutoModel.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /home/link/spaces/LinhCSE/models/retriever were not used when initializing RobertaModel: ['mlp.dense.bias', 'lm_head.dense.weight', 'mlp.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /home/link/spaces/LinhCSE/models/retriever and are newly i

In [24]:
import torch
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU,
# then move the model onto the chosen device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

In [11]:
def encode(texts, max_seq_len=256, return_output='np', **kwargs):
    """Embed ``texts`` with the notebook-level ``tokenizer`` and ``model``.

    Each input is tokenized, truncated/padded to exactly ``max_seq_len`` tokens,
    run through ``model`` without gradient tracking, and represented by the
    hidden state at sequence position 0 (the "CLS token" slot).

    Args:
        texts: Passed unchanged as the first argument to ``tokenizer``.
        max_seq_len: Length every sequence is truncated and padded to.
        return_output: Accepted for backward compatibility; it has no effect.
            The previous implementation built a NumPy copy when this was
            ``'np'`` but discarded it, so a tensor was always returned, and the
            existing call site converts the result itself with
            ``.detach().cpu().numpy()``. The return type is therefore kept.
        **kwargs: Accepted and ignored.

    Returns:
        ``last_hidden_state[:, 0, :]`` of the model output, as a tensor on
        ``model.device``.
    """
    # No `pad_to_multiple_of` here: combined with padding='max_length' it
    # silently rounded any `max_seq_len` that is not a multiple of 256 up to the
    # next multiple, overriding the caller's request. For the default of 256
    # the padded length is unchanged.
    features = tokenizer(
        texts,
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # Forward only the inputs the tokenizer actually produced, so a tokenizer
    # that does not emit `token_type_ids` no longer triggers a KeyError.
    inputs = {
        name: features[name].to(model.device)
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
        if name in features
    }

    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of every sequence.
    return outputs.last_hidden_state[:, 0, :]

In [25]:
import json
from tqdm import tqdm 
from pyvi import ViTokenizer

In [26]:
# Read the corpus as UTF-8 explicitly rather than relying on the locale's
# default encoding, which `open()` uses when no encoding is given.
with open('/home/link/spaces/LinhCSE/data/369/corpus.json', encoding='utf-8') as corpus_file:
    corpus = json.load(corpus_file)

In [27]:
from src.retriever.utils.normalize import normalize_encode
from src.retriever.utils.normalize import normalize_word_diacritic
from src.retriever.utils.normalize import remove_punctuation
def pre_process(text):
    """Normalize ``text`` before it is embedded.

    Steps, in order: ``normalize_word_diacritic``, ``normalize_encode``,
    lower-casing, then ``remove_punctuation``. The three helpers come from
    ``src.retriever.utils.normalize``; their exact behavior is not visible here.

    Returns:
        The value returned by ``remove_punctuation``.
    """
    normalized = normalize_word_diacritic(text)
    normalized = normalize_encode(normalized)
    lowered = normalized.lower()
    return remove_punctuation(lowered)

In [28]:
# Flatten the corpus into one list of pre-processed chunk texts, in
# document -> sections order. Each chunk's content is tokenized with
# ViTokenizer first and then passed through pre_process.
all_texts = []
for document in tqdm(corpus):
    all_texts.extend(
        pre_process(ViTokenizer.tokenize(chunk['content']))
        for chunk in document['sections']
    )

  0%|          | 0/3207 [00:00<?, ?it/s]

100%|██████████| 3207/3207 [05:37<00:00,  9.49it/s]


In [29]:
import numpy as np
# Encode the corpus in fixed-size batches and stack the results once at the end.
# Growing the array with np.append inside the loop re-copies everything
# accumulated so far on every iteration, which is quadratic in the corpus size.
batch_size = 512
batch_embeddings = []
for start in tqdm(range(0, len(all_texts), batch_size)):
    embedding = encode(all_texts[start: start + batch_size])
    # encode() returns a tensor; accept an ndarray as well so this cell does not
    # depend on which of the two it hands back.
    if isinstance(embedding, torch.Tensor):
        embedding = embedding.detach().cpu().numpy()
    batch_embeddings.append(np.asarray(embedding))

# Stays None when there was nothing to encode, matching the previous behavior.
embedding_query = (
    np.concatenate(batch_embeddings, axis=0) if batch_embeddings else None
)

  0%|          | 0/761 [00:00<?, ?it/s]

100%|██████████| 761/761 [31:34<00:00,  2.49s/it]


In [30]:
# Persist the stacked corpus embeddings to disk in .npy format.
np.save(
    file='/home/link/spaces/LinhCSE/data/369/embeddings_corpus.npy',
    arr=embedding_query,
)

In [31]:
# Read the saved embeddings back from disk.
embeddings_loaded = np.load(
    file='/home/link/spaces/LinhCSE/data/369/embeddings_corpus.npy',
)

In [32]:
# Display the shape of the array that was just loaded from disk.
embeddings_loaded.shape

(389389, 768)

In [20]:
# Display the shape of the in-memory embeddings.
# NOTE(review): this cell's execution count (20) is lower than that of the
# cells above it, and the shape recorded below differs from the shape of the
# array loaded from disk, so the stored output appears to come from an earlier
# run. Re-run this cell after the encoding cell to refresh it.
embedding_query.shape

(391391, 768)

In [None]:
# Attach one embedding row to every chunk, walking the corpus in the same
# document -> sections order that was used to build `all_texts`.
#
# Changes from the previous version of this cell:
#   * the key is 'sections' (as used when building `all_texts`), not 'section';
#   * chunks are no longer re-tokenized and re-normalized, since that result
#     was never used;
#   * rows are read by index instead of slicing `embedding_query` away, so the
#     array stays intact and the cell can be re-run;
#   * no torch.no_grad() wrapper, because no torch operation runs here.
total_chunks = sum(len(document['sections']) for document in corpus)
total_rows = 0 if embedding_query is None else len(embedding_query)
if total_chunks != total_rows:
    # Fail before touching the corpus so it is never left half-annotated.
    raise ValueError(
        f"corpus has {total_chunks} chunks but embedding_query has "
        f"{total_rows} rows; re-run the encoding cell before attaching"
    )

row = 0
for document in tqdm(corpus):
    for chunk in document['sections']:
        chunk["embedding"] = embedding_query[row]
        row += 1