In [None]:
!pip install -q sentence-transformers InstructorEmbedding


In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so that Drive-hosted files
# (paths under /content/drive/MyDrive) are reachable from this runtime.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Root folder of the project on Drive -- adjust this to match your own layout.
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/Extend_dsa"

# Everything this notebook reads or writes lives under <BASE_DIR>/Data.
DATA_DIR = f"{BASE_DIR}/Data"
DATA_PATH = f"{DATA_DIR}/merged_dataset.csv"   # input: merged pair dataset (CSV)
OUTPUT_DIR = f"{DATA_DIR}/embeddings"          # output: one .npz archive per model

# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the resolved paths so a wrong BASE_DIR is noticed immediately.
print(f"DATA_PATH: {DATA_PATH}", f"OUTPUT_DIR: {OUTPUT_DIR}", sep="\n")


DATA_PATH: /content/drive/MyDrive/Colab Notebooks/Extend_dsa/Data/merged_dataset.csv
OUTPUT_DIR: /content/drive/MyDrive/Colab Notebooks/Extend_dsa/Data/embeddings


In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR



# Configuration of the four embedding models to run.
# Each entry maps a short label to:
#   "name"        - model identifier handed to the model constructor
#   "type"        - which encoding helper to use: "sentence_transformer" or "instructor"
#   "prefix"      - (sentence_transformer only) string prepended to every text, or None
#   "instruction" - (instructor only) instruction paired with every text
MODEL_CONFIGS = {
    "bge-base-en-v1.5": {
        "name": "BAAI/bge-base-en-v1.5",
        "type": "sentence_transformer",
        "prefix": None,          # no prefix is added
    },
    "all-MiniLM-L6-v2": {
        "name": "sentence-transformers/all-MiniLM-L6-v2",
        "type": "sentence_transformer",
        "prefix": None,
    },
    "instructor-base": {
        "name": "hkunlp/instructor-base",
        "type": "instructor",
        "instruction": "Represent the sentence for semantic similarity.",
    },
    "e5-base-v2": {
        "name": "intfloat/e5-base-v2",
        "type": "sentence_transformer",
        # E5 models expect every input to start with "query: " or "passage: ".
        # Both texts of a pair are embedded the same way and compared symmetrically
        # (duplicate detection), and the E5 model card recommends "query: " for
        # symmetric tasks such as semantic similarity; "passage: " is meant for the
        # document side of asymmetric retrieval.
        "prefix": "query: ",
    },
}

# Use the GPU when torch reports one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


In [None]:
# Load the merged dataset.
df = pd.read_csv(DATA_PATH)

# Fail early, naming the absent columns, if the CSV does not have what we need.
required_cols = ['text1', 'text2', 'is_duplicate']
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError("Thiếu các cột bắt buộc trong merged_dataset.csv: {}".format(missing))

# Cleaning pipeline, applied in order:
#   1. keep only the three required columns
#   2. drop rows with a missing value in any of them
#   3. cast both texts to str and strip surrounding whitespace
#   4. drop rows where either text is empty after stripping
#   5. cast the label to int
#   6. renumber the rows 0..n-1
df = (
    df[required_cols]
    .dropna(subset=required_cols)
    .assign(
        text1=lambda frame: frame['text1'].astype(str).str.strip(),
        text2=lambda frame: frame['text2'].astype(str).str.strip(),
    )
    .loc[lambda frame: (frame['text1'] != "") & (frame['text2'] != "")]
    .assign(is_duplicate=lambda frame: frame['is_duplicate'].astype(int))
    .reset_index(drop=True)
)

print("Số mẫu sau khi lọc:", df.shape[0])
df.head()


Số mẫu sau khi lọc: 21626


Unnamed: 0,text1,text2,is_duplicate
0,What are the best desktop configuration for a ...,Which is the best laptop for game and graphic ...,0
1,How can I effectively market my app in a soft ...,What are good free ways to promote your Androi...,0
2,How do north Indians look?,Statergies of election in India?,0
3,How do I manage when you are speaking in group...,How can you get over the feeling when your gan...,0
4,Can a minor sue or be sued?,Is it possible to sue Sea World?,0


In [None]:
def encode_sentence_transformer(model_name,
                                df,
                                output_dir,
                                prefix=None,
                                batch_size=64):
    """Embed both text columns of ``df`` with a SentenceTransformer model and save them.

    The model is created on the module-level ``device``. The embeddings of
    'text1' and 'text2' plus the 'is_duplicate' labels are written to
    ``<output_dir>/<short_name>_pair_embeddings.npz``, where ``short_name`` is the
    part of ``model_name`` after the last "/".

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
        df: DataFrame providing the 'text1', 'text2' and 'is_duplicate' columns.
        output_dir: Directory in which the ``.npz`` archive is written.
        prefix: Optional string prepended to every text before encoding.
            A falsy value (None or "") leaves the texts unchanged.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        None. The result is the file on disk; its path and the embedding
        shapes are printed.
    """
    print("\n=== Encode với model: {} ===".format(model_name))
    encoder = SentenceTransformer(model_name, device=device)

    def column_texts(column):
        # Return one column as a list of strings, with the prefix applied if requested.
        series = df[column]
        if prefix:
            series = prefix + series.astype(str)
        return series.tolist()

    def embed(texts):
        # Single place for the encode options so both columns are treated identically.
        return encoder.encode(
            texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=True,
        )

    # Prepare both input lists before any encoding starts.
    first_texts = column_texts('text1')
    second_texts = column_texts('text2')

    first_emb = embed(first_texts)
    second_emb = embed(second_texts)

    # Name the archive after the last path component of the model id.
    file_stem = model_name.split("/")[-1]
    out_path = os.path.join(output_dir, "{}_pair_embeddings.npz".format(file_stem))

    # Store both embedding matrices together with the labels.
    labels = df['is_duplicate'].to_numpy()
    np.savez_compressed(
        out_path,
        text1_embeddings=first_emb,
        text2_embeddings=second_emb,
        is_duplicate=labels,
    )
    print("Đã lưu embeddings vào:", out_path)
    print("Shape text1:", first_emb.shape, "| text2:", second_emb.shape)


In [None]:
def encode_instructor(model_name,
                      instruction,
                      df,
                      output_dir,
                      batch_size=32):
    """Embed both text columns of ``df`` with an INSTRUCTOR model and save them.

    The model is moved to the module-level ``device``. Every text is paired with
    ``instruction`` as ``[instruction, text]`` and encoded in slices of
    ``batch_size`` pairs. The embeddings of 'text1' and 'text2' plus the
    'is_duplicate' labels are written to
    ``<output_dir>/<short_name>_pair_embeddings.npz``, where ``short_name`` is the
    part of ``model_name`` after the last "/".

    Args:
        model_name: Model identifier passed to ``INSTRUCTOR``.
        instruction: Instruction string paired with every text.
        df: DataFrame providing the 'text1', 'text2' and 'is_duplicate' columns.
        output_dir: Directory in which the ``.npz`` archive is written.
        batch_size: Number of pairs handed to ``encode`` per call.

    Returns:
        None. The result is the file on disk; its path and the embedding
        shapes are printed.
    """
    print("\n=== Encode với model INSTRUCTOR: {} ===".format(model_name))
    instructor_model = INSTRUCTOR(model_name)
    instructor_model.to(device)

    def instruction_pairs(column):
        # INSTRUCTOR takes its input as [instruction, text] pairs.
        return [[instruction, text] for text in df[column].tolist()]

    def embed_in_batches(pairs):
        # Encode slice by slice (tqdm shows progress), then stack the
        # per-slice arrays into a single matrix.
        starts = tqdm(range(0, len(pairs), batch_size))
        chunks = [
            np.array(instructor_model.encode(pairs[start:start + batch_size]))
            for start in starts
        ]
        return np.vstack(chunks)

    # Build both inputs before any encoding starts.
    first_pairs = instruction_pairs('text1')
    second_pairs = instruction_pairs('text2')

    first_emb = embed_in_batches(first_pairs)
    second_emb = embed_in_batches(second_pairs)

    # Name the archive after the last path component of the model id.
    file_stem = model_name.split("/")[-1]
    out_path = os.path.join(output_dir, "{}_pair_embeddings.npz".format(file_stem))

    # Store both embedding matrices together with the labels.
    labels = df['is_duplicate'].to_numpy()
    np.savez_compressed(
        out_path,
        text1_embeddings=first_emb,
        text2_embeddings=second_emb,
        is_duplicate=labels,
    )
    print("Đã lưu embeddings vào:", out_path)
    print("Shape text1:", first_emb.shape, "| text2:", second_emb.shape)


In [None]:
# Run every configured model over the dataset, writing one .npz archive per model.
# Models are processed in MODEL_CONFIGS order; an unrecognised "type" stops the loop
# with a ValueError when that entry is reached.
for short_name, cfg in MODEL_CONFIGS.items():
    mtype = cfg["type"]

    if mtype not in ("sentence_transformer", "instructor"):
        raise ValueError("Unknown model type for {}: {}".format(short_name, mtype))

    if mtype == "instructor":
        encode_instructor(
            model_name=cfg["name"],
            instruction=cfg["instruction"],
            df=df,
            output_dir=OUTPUT_DIR,
        )
        continue

    # Remaining case: a plain SentenceTransformer model, with an optional text prefix.
    encode_sentence_transformer(
        model_name=cfg["name"],
        df=df,
        output_dir=OUTPUT_DIR,
        prefix=cfg.get("prefix"),
    )



=== Encode với model: BAAI/bge-base-en-v1.5 ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

Đã lưu embeddings vào: /content/drive/MyDrive/Colab Notebooks/Extend_dsa/Data/embeddings/bge-base-en-v1.5_pair_embeddings.npz
Shape text1: (21626, 768) | text2: (21626, 768)

=== Encode với model: sentence-transformers/all-MiniLM-L6-v2 ===


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

Đã lưu embeddings vào: /content/drive/MyDrive/Colab Notebooks/Extend_dsa/Data/embeddings/all-MiniLM-L6-v2_pair_embeddings.npz
Shape text1: (21626, 384) | text2: (21626, 384)

=== Encode với model INSTRUCTOR: hkunlp/instructor-base ===


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

100%|██████████| 676/676 [00:55<00:00, 12.20it/s]
100%|██████████| 676/676 [00:58<00:00, 11.65it/s]


Đã lưu embeddings vào: /content/drive/MyDrive/Colab Notebooks/Extend_dsa/Data/embeddings/instructor-base_pair_embeddings.npz
Shape text1: (21626, 768) | text2: (21626, 768)

=== Encode với model: intfloat/e5-base-v2 ===


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

Đã lưu embeddings vào: /content/drive/MyDrive/Colab Notebooks/Extend_dsa/Data/embeddings/e5-base-v2_pair_embeddings.npz
Shape text1: (21626, 768) | text2: (21626, 768)
