In [53]:
from utility import create_milvus_collection, milvus_collection_exists, drop_milvus_collection, create_sqlite_db, sqlite_db_exists, drop_sqlite_db
from relational_db import SQLiteDB
from setting import TABEL2FIELD, METRIC_TYPE
from json import load
from vector_db import MilvusCollection
import os
from pymilvus import connections
from towhee import AutoConfig, AutoPipes
import numpy as np
from FlagEmbedding import BGEM3FlagModel
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from preprocess import KeyWords_generator, read_json, segment_text, preprocess


In [2]:
keyword2context = {}

# Create a relational db to store context and its id
create_sqlite_db()
db = SQLiteDB()

for table, fields in TABEL2FIELD.items():
    db.create_table(table, fields)

# Read context-to-keywords json file and make keyword-to-contexts mapping
with open('./data/Keywords.json', 'r') as f:
    data = load(f)

for i, datum in enumerate(data):
    db.insert('context', f"{i}, '" + datum['context'].replace("'", "''") + "'")
    for keyword in datum['keywords']:
        if keyword in keyword2context:
            keyword2context[keyword].append(i)
        else:
            keyword2context[keyword] = [i]

# Embed all keywords and insert into vector DB
if not milvus_collection_exists():
    create_milvus_collection()

collection = MilvusCollection()
if not collection.has_partition("test"):
    collection.create_partition("test")

config = AutoConfig.load_config('sentence_embedding')
config.model = 'average_word_embeddings_glove.6B.300d'
sentence_embedding = AutoPipes.pipeline('sentence_embedding', config=config)

keywords = list(keyword2context.keys())
collection.insert([keywords,
                   [embedding.get()[0] for embedding in sentence_embedding.batch(keywords)],
                   list(keyword2context.values())], "test")

Exception: Database test already exists.

In [30]:
collection = MilvusCollection()

In [3]:
config = AutoConfig.load_config('sentence_embedding')
config.model = 'average_word_embeddings_glove.6B.300d'
sentence_embedding = AutoPipes.pipeline('sentence_embedding', config=config)

In [7]:
search_params = {
    "metric_type": METRIC_TYPE
}

results = collection.search([sentence_embedding('World War').get()[0]], "embedding", search_params, 10, partition_names=["test"], output_fields=["keyword", "context_ids"])[0]
print(results)

["id: World War One, distance: 1.0000001192092896, entity: {'keyword': 'World War One', 'context_ids': [12657]}", "id: First World War, distance: 1.0000001192092896, entity: {'keyword': 'First World War', 'context_ids': [920, 1529, 2223, 6244, 6253, 6682, 7342, 7574, 7597, 7659, 8546, 8608, 8837, 9402, 9538, 10344, 11111, 11506]}", "id: World War I, distance: 1.0000001192092896, entity: {'keyword': 'World War I', 'context_ids': [207, 571, 752, 1038, 1043, 1278, 1344, 2407, 2453, 2462, 2571, 2994, 3648, 3795, 4256, 4737, 4840, 4923, 5047, 5140, 5288, 5290, 5819, 6395, 6758, 6798, 6995, 7127, 7173, 7176, 7224, 7230, 7258, 7272, 7398, 7479, 7969, 8001, 8018, 8057, 8072, 8205, 8358, 8463, 8497, 8500, 8546, 8678, 8850, 9471, 9766, 9799, 9975, 10112, 10167, 10259, 10291, 10510, 10639, 10809, 10812, 10827, 11001, 11125, 11142, 11224, 11239, 11316, 11521, 11547, 11620, 11676, 11873, 11894, 11995, 12119, 12120, 12132, 12169, 12233, 12554, 13768, 13834, 14315]}", "id: World War, distance: 1.0000

In [8]:

tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
model = AutoModelForTokenClassification.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
kw_extractor = pipeline("token-classification",
                        model=model, 
                        tokenizer=tokenizer)


Query = "In what year did WikiLeaks first display information on the Internet?"
Keyword = KeyWords_generator(kw_extractor(Query),Query)

print(f'Keyword:{Keyword}')

results = []
for kw in Keyword:
    result = collection.search([sentence_embedding(kw).get()[0]], "embedding", search_params, 10, partition_names=["test"], output_fields=["keyword", "context_ids"])[0]
    print(f'result:{result}')
    results.extend(result)
    
# collection.search([sentence_embedding("WikeLeaks, Internet").get()[0]], "embedding", search_params, 10, partition_names=["test"], output_fields=["keyword", "context_ids"])[0]


Keyword:['WikiLeaks', 'Internet']
result:["id: WikiLeaks, distance: 1.0, entity: {'keyword': 'WikiLeaks', 'context_ids': [0, 429, 1615]}", "id: diplomatic cable leaks, distance: 0.41681885719299316, entity: {'keyword': 'diplomatic cable leaks', 'context_ids': [11199]}", "id: Facebook, distance: 0.40166175365448, entity: {'keyword': 'Facebook', 'context_ids': [640, 4873, 5814, 7653, 7883, 11253, 11312, 13641]}", "id: Facebook, Inc., distance: 0.40166175365448, entity: {'keyword': 'Facebook, Inc.', 'context_ids': [12002]}", "id: Twitter, distance: 0.399120956659317, entity: {'keyword': 'Twitter', 'context_ids': [102, 2227, 2266, 2357, 4145, 4873, 11823]}", "id: emails, distance: 0.3482602834701538, entity: {'keyword': 'emails', 'context_ids': [11699]}", "id: YouTube, distance: 0.3356497287750244, entity: {'keyword': 'YouTube', 'context_ids': [4541, 4649, 4873, 5681, 8933, 11140, 14122]}", "id: web, distance: 0.325663298368454, entity: {'keyword': 'web', 'context_ids': [3165]}", "id: PayP

In [10]:
json_file_path = 'text_embedding.json'  # 

# 读取JSON文件
with open(json_file_path, 'r') as file:
    data = load(file)

embedding_array = np.array(data) #(14867, 1024)



(14867, 1024)

In [12]:
model = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

loading existing colbert_linear and sparse_linear---------


In [54]:
Query = "Is same-sex marriage for male legal in the United States under the United States Constitution?"
embeddings_1 = model.encode(Query, 
                            max_length=50, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
                            )['dense_vecs']
similarity = embeddings_1 @ embedding_array.T #
# candidate_context = np.argsort

encoding:   0%|          | 0/1 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
encoding: 100%|██████████| 1/1 [00:34<00:00, 34.19s/it]


ValueError: Expected 2D array, got 1D array instead:
array=[-0.04351807  0.01844788 -0.01145935 ...  0.03143311 -0.04693604
  0.06317139].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
similarity
idx = np.argsort(similarity)[::-1]
nums_candidate_context = 10
candidate_context = []

candidate_context.append({"similarity":similarity[idx[0]],
                          "Query": Query,
                          "Candidate Context":data[idx[0]]['context']})
count = 1
for i in idx:
    if count == nums_candidate_context:
        break
    if np.isclose(similarity[i], candidate_context[-1]["similarity"]):
        continue
    
    print(f'similarity: {candidate_context[-1]["similarity"]}')
    print(f'Query: {candidate_context[-1]["Query"]}')
    print("==================================================================")
    print("candidate context: ",candidate_context[-1]["Candidate Context"])
    print('\n')
    count+=1
    candidate_context.append({"similarity":similarity[i],
                          "Query": Query,
                          "Candidate Context":data[i]['context']})
 
    
print(f'similarity: {candidate_context[-1]["similarity"]}')
print(f'Query: {candidate_context[-1]["Query"]}')
print("==================================================================")
print("candidate context: ",candidate_context[-1]["Candidate Context"])



similarity: 0.7133298164104076
Query: Is same-sex marriage for male legal in the United States under the United States Constitution?
candidate context:  Same-sex marriage in the United States expanded from one state in 2004 to all fifty states in 2015 through various state court rulings, state legislation, direct popular votes, and federal court rulings. Same-sex marriage is also referred to as gay marriage, while the political status in which the marriages of same-sex couples and the marriages of opposite-sex couples are recognized as equal by the law is referred to as marriage equality. The fifty states each have separate marriage laws, which must adhere to rulings by the Supreme Court of the United States that recognize marriage as a fundamental right that is guaranteed by both the Due Process Clause and the Equal Protection Clause of the Fourteenth Amendment to the United States Constitution, as first established in the 1967 landmark civil rights case of "Loving v. Virginia".


sim