In [1]:
file_dataset = 'data_proces.txt'
# The corpus contains Vietnamese text, so decode it explicitly instead of
# relying on the platform-dependent default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(file_dataset, 'r', encoding='utf-8') as f:
    # readlines() keeps the trailing newline on every line.
    data = f.readlines()

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model configuration for the LangChain HuggingFace wrapper.
model_name = "BAAI/bge-large-en"
# model_kwargs are forwarded to the model constructor; encode_kwargs to its encode call.
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [3]:
from langchain_experimental.text_splitter import SemanticChunker
# Semantic splitter that places breakpoints using the "standard_deviation" threshold type.
text_splitter = SemanticChunker(embeddings=embeddings, breakpoint_threshold_type="standard_deviation")



In [13]:
from FlagEmbedding import FlagModel

# NOTE(review): 'BAAI/bge-large-en' is the English BGE checkpoint, while the queries and
# indexed text in this notebook are Vietnamese — confirm whether a multilingual
# checkpoint would be a better fit.
model = FlagModel('BAAI/bge-large-en',
                  # Use the English retrieval instruction that belongs to the English
                  # checkpoint (the previous value was the instruction for the Chinese models).
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                  use_fp16=True)  # Setting use_fp16 to True speeds up computation with a slight performance degradation

# Usage notes:
# - For short-query -> long-passage retrieval, encode_queries() automatically prepends the
#   instruction above to every query.
# - Passages can be embedded with encode() or encode_corpus(); they do not need the instruction.
# - Similarity scores between two embedding matrices: q_embeddings @ p_embeddings.T



1024

In [14]:
import re
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def _split_sentences(text):
    list_sentences = []
    for senten in text:
    # Use regular expressions to split the text into sentences based on punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.?!])\s+', senten)
        for i in sentences:
            list_sentences.append(i)
    return list_sentences

def _combine_sentences(sentences):
    print('len : ',len(sentences))
    # Create a buffer by combining each sentence with its previous and next sentence to provide a wider context.
    combined_sentences = []
    for i in range(len(sentences)):
        combined_sentence = sentences[i]
        if i > 0:
            combined_sentence = sentences[i-1] + ' ' + combined_sentence
        if i < len(sentences) - 1:
            combined_sentence += ' ' + sentences[i+1]
        combined_sentences.append(combined_sentence)
    return combined_sentences


from tqdm import tqdm

def convert_to_vector(list_sentences):
    """Encode each sentence separately with the module-level ``model``.

    Args:
        list_sentences: Iterable of sentence strings.

    Returns:
        list: One ``model.encode(sentence)`` result per input, in input order.
    """
    # Wrap the input in tqdm to display a progress bar while encoding.
    progress = tqdm(list_sentences, desc="Converting sentences to vectors")
    return [model.encode(sentence) for sentence in progress]


def _calculate_cosine_distances(embeddings):
    # Calculate the cosine distance (1 - cosine similarity) between consecutive embeddings.
    distances = []
    for i in range(len(embeddings) - 1):
        similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
        distance = 1 - similarity
        distances.append(distance)
    return distances


In [15]:
def chunk_text(text, breakpoint_percentile_threshold=80):
    """Split text into chunks at points where consecutive sentences diverge semantically.

    Args:
        text: Passed unchanged to ``_split_sentences``, which iterates over it
            and splits each element into sentences (for example the lines
            returned by ``readlines()``).
        breakpoint_percentile_threshold: Percentile (0-100) of the consecutive
            cosine distances used as the breakpoint threshold. A chunk boundary
            is placed after every sentence whose distance to the next one is
            strictly above that percentile. Defaults to 80.

    Returns:
        list[str]: Chunks in document order, each a space-joined run of
        sentences. Empty when no sentences are found.
    """
    # Split the input text into individual sentences.
    single_sentences_list = _split_sentences(text)

    # With fewer than two sentences there are no consecutive distances, and
    # np.percentile is not meaningful on an empty array (depending on the NumPy
    # version it raises or yields NaN). Return early and skip the embedding work.
    if len(single_sentences_list) < 2:
        return [' '.join(single_sentences_list)] if single_sentences_list else []

    # Combine adjacent sentences to form a context window around each sentence.
    combined_sentences = _combine_sentences(single_sentences_list)
    # Convert the combined sentences into vector representations.
    embeddings = convert_to_vector(combined_sentences)
    # Cosine distances between consecutive combined-sentence embeddings.
    distances = _calculate_cosine_distances(embeddings)

    # Distances strictly above this percentile value mark chunk breakpoints.
    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
    indices_above_thresh = [
        i for i, distance in enumerate(distances)
        if distance > breakpoint_distance_threshold
    ]

    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        # distances[index] compares sentence index with index + 1, so the
        # chunk ends at sentence index (inclusive).
        chunks.append(' '.join(single_sentences_list[start_index:index + 1]))
        start_index = index + 1

    # Any sentences left after the last breakpoint form the final chunk.
    if start_index < len(single_sentences_list):
        chunks.append(' '.join(single_sentences_list[start_index:]))
    return chunks

In [24]:
# Decode explicitly: the corpus contains Vietnamese text and the default
# encoding is platform-dependent.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('data_proces.txt', 'r', encoding='utf-8') as filedata:
    data = filedata.readlines()

In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

model_name = 'BAAI/bge-large-en'
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
)

# Smoke test: embed two short strings and display only the shape of the result
# (number of vectors, vector length) instead of dumping every float into the
# cell output.
texts = ["Hello, world!", "How are you?"]
sample_vectors = hf_embeddings.embed_documents(texts)
len(sample_vectors), len(sample_vectors[0])


In [5]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Build the semantic splitter on top of hf_embeddings, leaving every other
# SemanticChunker option at its default.
text_splitter = SemanticChunker(embeddings=hf_embeddings)


In [6]:
# Read the whole corpus as one string. Decode explicitly: the text is
# Vietnamese and the default encoding is platform-dependent.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('data_proces.txt', 'r', encoding='utf-8') as filedata:
    state_of_the_union = filedata.read()

In [None]:
# Split the corpus into documents with the semantic splitter.
docs = text_splitter.create_documents(texts=[state_of_the_union])
# Print the text of the first chunk as a quick sanity check.
print(docs[0].page_content)

In [10]:
from langchain_qdrant import QdrantVectorStore
# Index the chunks in a Qdrant collection that lives only in memory (local mode).
qdrant = QdrantVectorStore.from_documents(
    documents=docs,
    embedding=hf_embeddings,
    collection_name="my_documents",
    location=":memory:",
)

In [16]:
# Run a similarity search for the query and display the matching documents.
query = "Cơ sở 1 nằm ở đâu ?"
found_docs = qdrant.similarity_search(query=query)
found_docs

[Document(metadata={'_id': 'f68ae838de764f42a077bd7ec2cf0199', '_collection_name': 'my_documents'}, page_content='Tiếng anh giao tiếp cơ bản.( môi trường làm việc đa quốc gia).'),
 Document(metadata={'_id': 'a7f6d3f3aba04062a413ba193e8d5d58', '_collection_name': 'my_documents'}, page_content='Tiếng anh giao tiếp cơ bản.( môi trường làm việc đa quốc gia).'),
 Document(metadata={'_id': 'cb285504ba8240b8b64e5f8238a13048', '_collection_name': 'my_documents'}, page_content='Nguyễn Văn Thiện. TS.'),
 Document(metadata={'_id': '05b3eabafa9b46799d5c8e3438fbdddd', '_collection_name': 'my_documents'}, page_content='Nguyễn Văn Thiện. TS.')]

In [18]:
# Expose the vector store as a retriever with its default search settings.
# Alternative kept for reference: qdrant.as_retriever(search_type="mmr")
retriever = qdrant.as_retriever()

query = "có bao nhiêu giảng viên"
# Display only the first document returned for the query.
retriever.invoke(query)[0]

Document(metadata={'_id': 'f68ae838de764f42a077bd7ec2cf0199', '_collection_name': 'my_documents'}, page_content='Tiếng anh giao tiếp cơ bản.( môi trường làm việc đa quốc gia).')