In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, CSVLoader
from langchain_core.documents import Document
from dotenv import load_dotenv
import os

In [2]:

# Directory scanned for the source CSV files.
folder_path = "data"
# Local database path; not referenced by the other cells visible here.
db_path = "db_qdrant"
# Qdrant endpoint (host:port) handed to QdrantClient as `url`.
url = "5d9673e8-d966-4738-adbb-95a5842604ba.europe-west3-0.gcp.cloud.qdrant.io:6333"

# Load environment variables via python-dotenv before reading the API key,
# so the key does not have to be hard-coded in the notebook.
load_dotenv()
qdrant_key = os.environ.get("qdrant_key")

In [3]:
def load_text(file_path):
    """Build a UTF-8 ``CSVLoader`` for a single file.

    This is the callable handed to ``DirectoryLoader`` as ``loader_cls``;
    it only constructs the loader and does not read the file itself.

    Args:
        file_path: Path of the CSV file to wrap.

    Returns:
        A ``CSVLoader`` configured with ``encoding='utf-8'``.
    """
    csv_loader = CSVLoader(file_path=file_path, encoding='utf-8')
    return csv_loader

import re
from datetime import datetime

def extract_date(text):
    """Return the date found on the third line of ``text``, else today's date.

    The text is stripped and split on ``"\\n"``. Only the third resulting
    line is inspected, and the first substring shaped like
    ``"19 August 2024"`` (1-2 digits, a word, 4 digits) is returned verbatim.

    Args:
        text: Raw text whose third line may contain a date.

    Returns:
        The matched date string, or the current local date formatted as
        ``"%d %B %Y"`` when there are fewer than three lines or no match.
    """
    rows = text.strip().split("\n")

    # Only the third line is searched; shorter inputs skip the regex entirely.
    found = re.search(r'(\d{1,2} \w+ \d{4})', rows[2]) if len(rows) > 2 else None

    if found is None:
        # Fallback: stamp with today's date.
        return datetime.now().strftime("%d %B %Y")
    return found.group(1)

class AdvanceTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter that date-stamps and de-duplicates chunks.

    Every chunk gets a ``'date'`` metadata entry taken from its parent
    document via ``extract_date``. Chunks whose ``page_content`` repeats an
    earlier chunk are then dropped, and the survivors receive sequential
    integer ``id`` values starting at 0.

    NOTE(review): the ``List[str]`` annotations below do not match usage.
    Both methods read ``.page_content`` / ``.metadata`` on their items, so
    they operate on document objects rather than plain strings.
    """

    def __init__(self, chunk_size=350, chunk_overlap=50, *args, **kwargs):
        """Forward all arguments to the base splitter, with this class's defaults."""
        super().__init__(*args, chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)

    def remove_redundant_chunks(self, chunks: List[str]) -> List[str]:
        """Keep the first chunk for each distinct ``page_content``.

        Surviving chunks are mutated in place: ``id`` is set to the chunk's
        position in the returned list.

        Args:
            chunks: Chunk objects exposing ``page_content``.

        Returns:
            The unique chunks, in their original relative order.
        """
        unique = []
        seen_text = set()
        for piece in chunks:
            content = piece.page_content
            if content in seen_text:
                continue
            seen_text.add(content)
            # The next free position doubles as the sequential id.
            piece.id = len(unique)
            unique.append(piece)
        return unique

    def split_documents(self, documents: List[str]) -> List[str]:
        """Split documents one at a time, tag chunks with a date, and de-duplicate.

        Args:
            documents: Document objects exposing ``page_content``.

        Returns:
            De-duplicated chunks, each carrying ``metadata['date']`` and a
            sequential ``id``.
        """
        collected = []
        for source_doc in documents:
            # Split per document so each chunk can inherit its parent's date.
            pieces = super().split_documents([source_doc])
            doc_date = extract_date(source_doc.page_content)
            for piece in pieces:
                piece.metadata['date'] = doc_date
            collected.extend(pieces)

        return self.remove_redundant_chunks(collected)



In [4]:
def create_vector_db(folder_path):
    """Rebuild the Qdrant collection from the CSV files in ``folder_path``.

    Loads every ``*.csv`` file in the folder, splits the documents with
    ``AdvanceTextSplitter``, embeds the chunks with the
    ``hiieu/halong_embedding`` model and uploads them to the
    ``cmc_corp_full_web`` collection. An existing collection of that name is
    deleted and recreated, so the call replaces any previous contents.

    Reads the module-level ``url`` and ``qdrant_key`` for the connection.

    Args:
        folder_path: Directory containing the source CSV files.

    Returns:
        The ``QdrantVectorStore`` bound to the freshly populated collection.

    Raises:
        ValueError: If no chunks were produced. Raised before the existing
            collection is touched, so an empty or mistyped folder cannot
            wipe the stored data.
    """
    # Load documents from the directory
    loader = DirectoryLoader(folder_path, glob="*.csv", loader_cls=load_text)
    documents = loader.load()
    # Split documents into chunks
    text_splitter = AdvanceTextSplitter(chunk_size=250, chunk_overlap=40)
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        raise ValueError(
            f"No chunks were produced from '*.csv' files in {folder_path!r}; "
            "refusing to recreate the collection."
        )
    # Sanity check: show one chunk's metadata. The index is clamped so small
    # datasets (fewer than 101 chunks) no longer raise IndexError here.
    sample_index = min(100, len(chunks) - 1)
    print(chunks[sample_index].metadata)
    # Generate embeddings
    model_name = "hiieu/halong_embedding"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # Create and configure Qdrant client
    client = QdrantClient(
        url=url,
        api_key=qdrant_key,
    )
    print(client.get_collections())
    collection_name = "cmc_corp_full_web"
    # Start from a clean collection so re-runs do not accumulate stale points.
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        # size must equal the embedding model's output dimension.
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    # Store the chunks in the vector database
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    vector_store.add_documents(chunks)

    print("Database created and documents added successfully.")
    return vector_store


# Build (or rebuild) the collection from the configured data folder and keep
# the resulting vector store handle for later cells.
db = create_vector_db(folder_path)


{'source': 'data\\insight.csv', 'row': 1, 'date': '26 August 2024'}


  from tqdm.autonotebook import tqdm, trange


collections=[CollectionDescription(name='cmc_corp_full_web')]
Database created and documents added successfully.
