In [61]:
from FlagEmbedding import BGEM3FlagModel

In [62]:
# Load the BGE-M3 checkpoint once; every later cell reuses this `model`.
# use_fp16=True speeds up computation with a slight performance degradation.
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [63]:
# Dense-retrieval demo: two short questions scored against two passages.
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document",
]

# max_length=8192 is far more than these inputs need; a smaller value
# speeds up encoding when long inputs are not expected.
embeddings_1 = model.encode(sentences_1, batch_size=12, max_length=8192)['dense_vecs']
embeddings_2 = model.encode(sentences_2)['dense_vecs']

# Row i / column j is the dot product of question i with passage j.
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.626  0.3477]
 [0.35   0.6787]]


In [64]:
# Sparse (lexical) demo on the same question / passage pairs.
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document",
]

# Ask for dense and sparse outputs; ColBERT vectors are switched off.
sparse_encode_kwargs = dict(return_dense=True, return_sparse=True, return_colbert_vecs=False)
output_1 = model.encode(sentences_1, **sparse_encode_kwargs)
output_2 = model.encode(sentences_2, **sparse_encode_kwargs)

# Show the per-token weights of the first question and the first passage.
question_weights = output_1['lexical_weights'][0]
print(model.convert_id_to_token(question_weights))
print(model.convert_id_to_token(output_2['lexical_weights'][0]))

# NOTE(review): this scores question 0 against passage 1 (the BM25 passage),
# not passage 0 whose weights were printed above -- confirm this is intended.
lexical_scores = model.compute_lexical_matching_score(question_weights, output_2['lexical_weights'][1])
print(lexical_scores)

{'What': np.float16(0.08374), 'is': np.float16(0.08136), 'B': np.float16(0.1298), 'GE': np.float16(0.252), 'M': np.float16(0.1704), '3': np.float16(0.2695), '?': np.float16(0.04092)}
{'B': np.float16(0.1411), 'GE': np.float16(0.2588), 'M': np.float16(0.1722), '3': np.float16(0.269), 'is': np.float16(0.1276), 'an': np.float16(0.07336), 'embe': np.float16(0.2142), 'dding': np.float16(0.167), 'model': np.float16(0.255), 'support': np.float16(0.191), 'ing': np.float16(0.08276), 'den': np.float16(0.1815), 'se': np.float16(0.12146), 're': np.float16(0.05713), 'trie': np.float16(0.1576), 'val': np.float16(0.06335), 'lex': np.float16(0.1515), 'ical': np.float16(0.10547), 'match': np.float16(0.1508), 'and': np.float16(0.01593), 'multi': np.float16(0.0843), 've': np.float16(0.1453), 'ctor': np.float16(0.1401), 'interaction': np.float16(0.1527)}
0.00877


In [15]:
# Multi-vector (ColBERT) demo on the same question / passage pairs.
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document",
]

# Request all three representations this time, including ColBERT vectors.
full_encode_kwargs = dict(return_dense=True, return_sparse=True, return_colbert_vecs=True)
output_1 = model.encode(sentences_1, **full_encode_kwargs)
output_2 = model.encode(sentences_2, **full_encode_kwargs)

# Score the first question against each passage in turn.
question_vecs = output_1['colbert_vecs'][0]
for passage_vecs in output_2['colbert_vecs'][:2]:
    print(model.colbert_score(question_vecs, passage_vecs))

tensor(0.7799)
tensor(0.4622)


In [12]:
import re
def remove_stopwords(text: str) -> str:
    """Strip layout noise, links, standalone numbers and punctuation from text.

    Despite the name, no stop-word list is applied. The function deletes, in
    order: ``**page N**`` / ``**question N**`` / ``**answer N**`` markers,
    URLs and e-mail addresses, ``【code】`` tags, several page-number formats,
    standalone digit runs and every character that is neither a word
    character nor whitespace. Whitespace runs are then collapsed.

    URL and e-mail matching is restricted to ASCII characters. In Python 3,
    ``\\S`` and ``\\w`` also match CJK characters, so an unrestricted pattern
    would swallow Chinese text that directly follows (or precedes) an address
    without an intervening space.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text, with whitespace runs collapsed to a single space and
        leading/trailing whitespace removed.
    """
    # Applied strictly in this order: the page-number formats must be removed
    # before the generic digit and punctuation passes destroy their shape.
    noise_patterns = (
        # Section markers such as "**page 3**".
        r"\*\*page \d+\*\*",
        r"\*\*question \d+\*\*",
        r"\*\*answer \d+\*\*",
        # URLs (anything starting with "http" or "www") and e-mail addresses,
        # limited to printable ASCII so adjacent CJK text is preserved.
        r"(?:http|www)[\x21-\x7E]+|[A-Za-z0-9_.-]+@[A-Za-z0-9_.-]+",
        # Bracketed alphanumeric tags such as "【A1】".
        r"【[A-Za-z0-9]+】",
        # "第 X 頁，共 Y 頁" (page X of Y).
        r"第 \d+ 頁，共 \d+ 頁",
        # "X/Y" or "X / Y".
        r"\b\d+ ?/ ?\d+\b",
        # "~X~".
        r"~\d+~",
        # "（接次頁）" (continued on next page) / "（承前頁）" (continued from previous page).
        r"（接次頁）|（承前頁）",
        # "- X -".
        r"- \d+ -",
        # Standalone digit runs (digits glued to other word characters are kept).
        r"\b\d+\b",
        # Everything that is neither a word character nor whitespace.
        r"[^\w\s]",
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, "", text)

    # Collapse the gaps left behind by the removals.
    return re.sub(r"\s+", " ", text).strip()

# Smoke-test the cleaner on the first FAQ document.
with open('../source/競賽資料集/reference_text/faq/0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The file can be closed before cleaning; only the text is needed.
print(remove_stopwords(text))

什麼是跨境手機掃碼支付 允許大陸消費者可以用手機支付寶App在台灣實體商店購買商品或服務


In [13]:
# Rank a hand-picked set of FAQ documents against a single query.
query = "提領PayPal款項到玉山銀行的最低金額是多少？"

candidate_ids = [209, 530, 536, 316, 215, 202, 134, 604, 481, 304, 157, 415, 174, 77, 332]

texts = []
for idx in candidate_ids:
    faq_path = f'../source/競賽資料集/reference_text/faq/{idx}.txt'
    with open(faq_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Documents are cleaned before encoding; the query is encoded as-is.
    texts.append(remove_stopwords(text))

embeddings_1 = model.encode(texts)['dense_vecs']
embeddings_2 = model.encode(query)['dense_vecs']

# One score per candidate document, in the order of the id list above.
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[0.506  0.6475 0.6294 0.4812 0.618  0.3884 0.3748 0.768  0.4712 0.3342
 0.3657 0.3481 0.3606 0.5205 0.4097]


In [16]:
from pathlib import Path
# Report every finance document whose cleaned text tokenizes to more than
# 8192 tokens (the max_length used for encoding earlier in this notebook).
cate_folder = Path('../source/競賽資料集/reference_text/finance')
for file in cate_folder.glob('*.txt'):
    text = remove_stopwords(file.read_text(encoding='utf-8'))
    tokens = model.tokenizer.encode(text)

    # Short enough: nothing to report.
    if len(tokens) <= 8192:
        continue
    print(file, len(tokens))
        



../source/競賽資料集/reference_text/finance/510.txt 9613
../source/競賽資料集/reference_text/finance/672.txt 16055
../source/競賽資料集/reference_text/finance/471.txt 11074
../source/競賽資料集/reference_text/finance/667.txt 9099
../source/競賽資料集/reference_text/finance/28.txt 17494
../source/競賽資料集/reference_text/finance/507.txt 8444
../source/競賽資料集/reference_text/finance/711.txt 12295
../source/競賽資料集/reference_text/finance/739.txt 12251
../source/競賽資料集/reference_text/finance/117.txt 14167
../source/競賽資料集/reference_text/finance/301.txt 14637
../source/競賽資料集/reference_text/finance/260.txt 15518
../source/競賽資料集/reference_text/finance/106.txt 13979
../source/競賽資料集/reference_text/finance/304.txt 11740
../source/競賽資料集/reference_text/finance/113.txt 9573
../source/競賽資料集/reference_text/finance/107.txt 21322
../source/競賽資料集/reference_text/finance/503.txt 11393
../source/競賽資料集/reference_text/finance/878.txt 13864
../source/競賽資料集/reference_text/finance/925.txt 14251
../source/競賽資料集/reference_text/finance/266.txt 1607

In [56]:
# Sliding-window chunking of one long document.
# Window length and step are measured in tokens; consecutive windows overlap
# by CHUNK_SIZE - CHUNK_STRIDE tokens.
CHUNK_SIZE = 4096
CHUNK_STRIDE = 3072

with open('../source/競賽資料集/reference_text/finance/1001.txt', 'r', encoding='utf-8') as f:
    text = remove_stopwords(f.read())

# Special tokens are left out so they do not end up inside the decoded chunks.
tokens = model.tokenizer.encode(text, add_special_tokens=False)
print(len(tokens))

split_texts = []
for start in range(0, len(tokens), CHUNK_STRIDE):
    split_texts.append(model.tokenizer.decode(tokens[start : start + CHUNK_SIZE]))
    # Stop as soon as this window reaches the last token. `>=` (rather than
    # `>`) matters when the window ends exactly on the final token: otherwise
    # one more chunk would be emitted that is fully contained in this one.
    if start + CHUNK_SIZE >= len(tokens):
        break

print(len(split_texts))

# Note: these are character counts of the decoded chunks, not token counts.
for chunk_text in split_texts:
    print(len(chunk_text))


10110
3
5071
5338
5028


In [60]:
# Encode every chunk of the split document and keep only the dense vectors.
chunk_outputs = model.encode(split_texts)
embeddings = chunk_outputs['dense_vecs']
print(embeddings.shape)

KeyboardInterrupt: 

In [None]:
import json
# Print the tokenized length of every example query.
questions_path = '../source/競賽資料集/dataset/preliminary/questions_example.json'
with open(questions_path, 'r', encoding='utf-8') as f:
    questions = json.load(f)['questions']

for question in questions:
    query_token_ids = model.tokenizer.encode(question['query'])
    print(len(query_token_ids))