In [74]:
# Parameter cell -- do not remove!!
# nb_parm is a pipe-delimited string that a later cell splits into seven fields:
#   bucket_name|path_to_file|file_name|file_type|dlm|chunk_size|overlap
nb_parm='datalake|raw/pdf|Birddiversityanddistribution|pdf||300|150'
# Alternative parameter strings (commented out):
# nb_parm='datalake|raw/text-csv|PFW_spp_translation_table_May2024|csv||300|150'
# nb_parm='datalake|raw/text-csv|BRID_data|txt||300|150'
# Model name passed to the Weaviate text2vec_ollama vectorizer config.
embed_model = "mxbai-embed-large" 
# Model name passed to the Weaviate Ollama generative config.
gen_model = "deepseek-r1:7b" #"command-r"
# Name of the Weaviate collection that is looked up or created.
collection = "Bridknowledge"

In [75]:
import sys
import os

sys.path.append("/home/jovyan/notebooks")
from Framework.module import Utility

## Do the task After this

In [77]:
# Unpack the seven pipe-delimited fields of the notebook parameter string.
(bucket_name, path_to_file, file_name,
 file_type, dlm, chunk_size, overlap) = nb_parm.split('|')

# --- Preprocessing: full file name with extension, numeric fields as int ---
file_name = f"{file_name}.{file_type}"
chunk_size = int(chunk_size)
overlap = int(overlap)

# --- Echo the effective configuration, one "name: value" line per setting ---
print(
    *(
        f"{label}: {value}"
        for label, value in (
            ("bucket_name", bucket_name),
            ("path_to_file", path_to_file),
            ("file_name", file_name),
            ("file_type", file_type),
            ("dlm", dlm),
            ("chunk_size", chunk_size),
            ("overlap", overlap),
            ("embed_model", embed_model),
            ("gen_model", gen_model),
            ("collection", collection),
        )
    ),
    sep="\n",
)


bucket_name: datalake
path_to_file: raw/text-csv
file_name: BRID_data.txt
file_type: txt
dlm: 
chunk_size: 300
overlap: 150
embed_model: mxbai-embed-large
gen_model: deepseek-r1:7b
collection: Bridknowledge


## Delete the collection (for testing)

In [78]:
# client = Utility.registerClient()
# if collection in client.collections.list_all():
#     client.collections.delete(collection)
# client.close()

## Import modules

In [1]:
import uuid
import weaviate
import pdfplumber
from weaviate.classes.config import Configure
from datetime import datetime
from weaviate.classes.config import Property, DataType
import io
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

## Define functions

In [79]:
# Base URL of the Ollama server; passed as api_endpoint to both the Weaviate
# vectorizer and generative configs when the collection is created.
OLLAMA_API = "http://host.docker.internal:11434"

# Post-processing
def format_docs(docs):
    """Wrap text fragments as a chain step that yields them as one string.

    Args:
        docs: Iterable of text fragments. ``None`` and empty entries (for
            example pages for which text extraction produced nothing) are
            skipped instead of breaking the join.

    Returns:
        A ``RunnableLambda`` (not a plain ``str``) that ignores its input and
        returns the remaining fragments joined with newlines.
    """
    # Join once up front: the text does not depend on the chain input, and
    # filtering here avoids a TypeError on None entries at invoke time.
    joined = "\n".join(doc for doc in docs if doc)
    return RunnableLambda(lambda _: joined)
    
def createCollection(client):
    """Return the knowledge-base collection, creating it if it is missing.

    The collection name, embedding model, generative model and Ollama endpoint
    come from the module-level ``collection``, ``embed_model``, ``gen_model``
    and ``OLLAMA_API`` values.

    Args:
        client: Connected Weaviate client.

    Returns:
        The existing collection if present, otherwise the newly created one.
    """
    # Ask about this one collection rather than listing every collection
    # just to test membership.
    if client.collections.exists(collection):
        return client.collections.get(collection)

    return client.collections.create(
        name=collection,
        vectorizer_config=Configure.Vectorizer.text2vec_ollama(
            api_endpoint=OLLAMA_API,
            model=embed_model
        ),
        generative_config=Configure.Generative.ollama(
            api_endpoint=OLLAMA_API,
            model=gen_model
        ),
        properties=[
            Property(name="title", data_type=DataType.TEXT),
            Property(name="content", data_type=DataType.TEXT),
            Property(name="source_type", data_type=DataType.TEXT),
            Property(name="source_id", data_type=DataType.TEXT),
            Property(name="index", data_type=DataType.INT),
            Property(name="timestamp", data_type=DataType.DATE)
        ]
    )

def chunkText(text, chunk_size, overlap):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Input text; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        List of chunk strings (words re-joined with single spaces). Empty
        input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check an overlap equal to the chunk size fails with an
            obscure ``range()`` error and a larger one silently drops all text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would be
        # fully contained in this one, so stop instead of emitting duplicates.
        if start + chunk_size >= len(words):
            break
    return chunks

def _summarizeTexts(texts, llm):
    """Ask ``llm`` for a retrieval-oriented summary of ``texts``.

    The texts are supplied as prompt context through ``format_docs``. Only the
    part of the model output after the last ``</think>`` marker is returned.
    """
    prompt = PromptTemplate.from_template(
        """
        Answer the question based only on the context.
        Context:{context}
        
        Question: {question}
        
        Answer:
        """
    )
    rag_chain = (
        {"context": format_docs(texts), "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    result = rag_chain.invoke("Summary this document for futher retrieval purpose, Only give the summary without explanation or extra reasoning")
    return result.split('</think>')[-1]


def _buildRecord(title, content, source_type, source_id, index, timestamp):
    """Assemble one object with the properties the collection defines."""
    return {
        "title": title,
        "content": content,
        "source_type": source_type,
        "source_id": source_id,
        "index": index,
        "timestamp": timestamp
    }


def ingestFileToWeaviate(bucket_name, path_to_file, title, chunk_size, overlap, file_type=None):
    """Load one file from MinIO and store it in the Weaviate collection.

    One "summary" object (LLM-generated, index 0) is inserted for the whole
    document, followed by one object per content chunk:

    * ``pdf``: each page's extracted text is split with ``chunkText``; chunk
      indexes keep counting across pages so they are unique per document.
    * ``txt`` / ``csv``: the file is parsed with ``pandas.read_csv`` and every
      row (cells joined with " | ") becomes one chunk.

    Args:
        bucket_name: MinIO bucket holding the file.
        path_to_file: Object path inside the bucket, including the file name.
        title: Title stored on every inserted object.
        chunk_size: Words per chunk (PDF only).
        overlap: Words shared by consecutive chunks (PDF only).
        file_type: ``'pdf'``, ``'txt'`` or ``'csv'``. When omitted it is taken
            from the extension of ``path_to_file`` rather than from a
            notebook-level global.

    Raises:
        ValueError: If the file type is not one of the supported types.
    """
    from datetime import timezone  # local import: only needed for the UTC timestamp

    if file_type is None:
        base_name = path_to_file.rsplit('/', 1)[-1]
        file_type = base_name.rsplit('.', 1)[-1] if '.' in base_name else ''
    file_type = file_type.lower()
    if file_type not in ('pdf', 'txt', 'csv'):
        raise ValueError(f"Unsupported file type '{file_type}' for '{path_to_file}'")

    client = Utility.registerClient()
    try:
        kb = createCollection(client)

        # Get file as bytes in memory from MinIO (a single download serves all branches)
        file_bytes = Utility.readFileFromMinio(bucket_name, path_to_file)

        # All objects of this ingestion share one document id and timestamp
        source_id = str(uuid.uuid4())
        timestamp = datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")

        if file_type == 'pdf':
            with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages without extractable text are dropped for both the summary
            # context and the chunking step.
            texts = [page_text for page_text in page_texts if page_text]
            chunks = [
                chunk
                for page_text in texts
                for chunk in chunkText(page_text, chunk_size, overlap)
            ]
        else:
            df = pd.read_csv(io.StringIO(file_bytes.decode('utf-8')), on_bad_lines='skip')
            if df.empty:
                texts = []
            else:
                texts = df.astype(str).apply(lambda row: " | ".join(row), axis=1).tolist()
            chunks = texts

        # Representation summary: skipped when there is no text to summarize
        if texts:
            llm = OllamaLLM(
                model=gen_model,
                temperature=0,
                base_url=OLLAMA_API
            )
            summary = _summarizeTexts(texts, llm)
            kb.data.insert(_buildRecord(title, summary, "summary", source_id, 0, timestamp))

        # Data used for retrieval
        for index, chunk in enumerate(chunks):
            kb.data.insert(_buildRecord(title, chunk, file_type, source_id, index, timestamp))
    finally:
        # Release the connection even when reading, summarizing or inserting fails
        client.close()

    print(f"Ingested {len(chunks)} chunks from '{title}'")
    

 

## Ingest data into vector database

In [80]:
# Ingest the configured file: the object path is "<path_to_file>/<file_name>",
# and the file name is also used as the document title.
ingestFileToWeaviate(bucket_name, path_to_file+'/'+file_name, file_name, chunk_size, overlap)

Ingested 944 chunks from 'BRID_data.txt'


  ingestFileToWeaviate(bucket_name, path_to_file+'/'+file_name, file_name, chunk_size, overlap)


## Observe collection

In [81]:
# client = Utility.registerClient()
# for i in client.collections.list_all():
#     # print(client.collections.get(i))
#     print("##################################################")
#     print(i)
#     print("\n\n break line here")
# # client.collections.delete("Question")
# # client.collections.delete("Csvbrid")

                
# client.close()


##################################################
Bridknowledge


 break line here


## Query the database

In [90]:
# import weaviate
# import json
# from datetime import datetime
# from weaviate.classes.query import MetadataQuery, Filter

# def serialize(obj):
#     """Helper function to handle serialization of datetime objects."""
#     if isinstance(obj, datetime):
#         return obj.isoformat()  # Convert datetime to string in ISO format
#     raise TypeError("Type not serializable")

# client = Utility.registerClient()

# retriever = client.collections.get(collection)

# # Perform the text generation
# # Each query must target one collection at a time

# # retriever.query.where(Filter.by_property("source_type").equal("summary"))
# response = retriever.query.bm25( # search without model
#     # query="Conclusion",
#     query = "give me a record that Spp = PIJE, DBH = 9.652 and x-y coord are (143.799883, 105.20826)",
#     limit=1,
#     query_properties=["content"],
#     return_metadata=MetadataQuery(score=True),
# )

# for obj in response.objects:
#     print(json.dumps(obj.properties, default=serialize, indent=2))
#     print(json.dumps(obj.metadata.score, default=serialize, indent=2))
#     print(f"content : {obj.properties['content']}")
#     print("#############################################")

# client.close()  # Free up resources


{
  "title": "BRID_data.txt",
  "source_id": "03df5026-748d-49da-a66d-437dcc1a31fd",
  "index": 5,
  "content": "6\tPIJE\t9.652\t143.799883\t105.20826",
  "timestamp": "2025-04-09T15:28:02+00:00",
  "source_type": "txt"
}
17.68316650390625
content : 6	PIJE	9.652	143.799883	105.20826
#############################################
