In [1]:
from typing import List

from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, CSVLoader
from langchain_core.documents import Document
from langchain_core.stores import InMemoryStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

In [2]:

# Directory handed to create_vector_db(); its *.csv files are the ingestion source.
folder_path='data'
# Filesystem path given to QdrantClient(path=...) inside create_vector_db().
db_path='db_qdrant'
# NOTE(review): not referenced in the visible cells - presumably a location for a
# Neo4j graph store used elsewhere; confirm or remove.
graph_db='neo4j_db'

In [3]:
def load_text(file_path):
    """Return a ``CSVLoader`` configured to read ``file_path`` as UTF-8.

    The function hands back the loader object itself, not loaded documents,
    which lets it be passed as ``loader_cls`` to ``DirectoryLoader``.
    """
    csv_loader = CSVLoader(file_path=file_path, encoding='utf-8')
    return csv_loader


class AdvanceTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter that drops chunks whose text is a duplicate.

    Behaves like ``RecursiveCharacterTextSplitter`` except that
    ``split_documents`` post-processes its output with
    ``remove_redundant_chunks``.
    """

    def __init__(self, chunk_size=350, chunk_overlap=50, *args, **kwargs):
        """Forward all arguments to the parent splitter.

        Args:
            chunk_size: Chunk size passed to the parent (default 350).
            chunk_overlap: Chunk overlap passed to the parent (default 50).
            *args, **kwargs: Passed through to the parent unchanged.
        """
        super().__init__(*args, chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)

    def remove_redundant_chunks(self, chunks: List[Document]) -> List[Document]:
        """Keep the first chunk for each distinct ``page_content``.

        Order of first occurrence is preserved. Each kept chunk is mutated in
        place: its ``id`` is set to its 0-based position in the returned list.
        Dropped duplicates are left untouched.

        Args:
            chunks: Document chunks to filter.

        Returns:
            The kept chunks (the same objects that were passed in).
        """
        unique_chunks: List[Document] = []
        seen_texts = set()
        for chunk in chunks:
            text = chunk.page_content
            if text in seen_texts:
                continue
            seen_texts.add(text)
            # The id is the running count of kept chunks, so ids are contiguous
            # even when duplicates are skipped.
            chunk.id = len(unique_chunks)
            unique_chunks.append(chunk)
        return unique_chunks

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split ``documents`` with the parent splitter, then drop duplicate chunks.

        Args:
            documents: Documents to split.

        Returns:
            De-duplicated chunks with sequential integer ``id`` values.
        """
        chunks = super().split_documents(documents)
        return self.remove_redundant_chunks(chunks)



In [4]:
def create_vector_db(folder_path):
    """Build a fresh Qdrant collection from the CSV files in ``folder_path``.

    Loads every ``*.csv`` file in the directory, splits the documents into
    de-duplicated chunks, embeds them, and stores them in the
    ``cmc_corp_full_web`` collection. Any existing collection with that name
    is deleted first, so each call starts from an empty collection.

    The Qdrant storage location is read from the module-level ``db_path``.

    Args:
        folder_path: Directory containing the CSV files to ingest.

    Returns:
        The ``QdrantVectorStore`` holding the embedded chunks.
    """
    # Load documents from the directory
    loader = DirectoryLoader(folder_path, glob="*.csv", loader_cls=load_text)
    documents = loader.load()

    # Split documents into chunks
    text_splitter = AdvanceTextSplitter(chunk_size=250, chunk_overlap=40)
    chunks = text_splitter.split_documents(documents)

    # Generate embeddings
    model_name = "hiieu/halong_embedding"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Create and configure Qdrant client
    client = QdrantClient(path=db_path)
    collection_name = "cmc_corp_full_web"

    # recreate_collection() is deprecated; do the equivalent drop-then-create
    # explicitly so the function keeps working once it is removed.
    if client.collection_exists(collection_name=collection_name):
        client.delete_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        # size must equal the dimension of the vectors the embedding model emits.
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    # Store the chunks in the vector database
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    vector_store.add_documents(chunks)
    print("Database created and documents added successfully.")
    return vector_store


# Build the vector store from the CSV files under folder_path and keep a handle
# to it in `db`. Re-running this cell rebuilds the collection from scratch.
db=create_vector_db(folder_path)


  from tqdm.autonotebook import tqdm, trange
  client.recreate_collection(


Database created and documents added successfully.
