In [10]:
from langchain.chains import RetrievalQA
from src.datamodels.qdrant import qdrant_datamodel
from src.datamodels.constant import COLLECTION_NAME
from src.models.llm import get_llm_model
from src.models.huggingface import get_embedding_model
from src.ingestion.load_data import download_file, load_json

## Vector Database Ingestion

In [2]:
def ingest_collection(collection_name: str = COLLECTION_NAME) -> None:
    """Download the source report and index it into a vector collection.

    The file returned by ``download_file()`` is loaded with the jq schema
    ``.data.attributes.sigma_analysis_results[]``, taking each entry's
    ``rule_description`` as the document content. The collection is then
    created and the loaded documents are handed to
    ``qdrant_datamodel.create_vectorstore`` together with the embedding model.

    Args:
        collection_name: Name of the collection to create and fill.
            Defaults to ``COLLECTION_NAME``.

    Returns:
        None. The function is called for its side effects only.
    """
    save_path = download_file()
    embedding = get_embedding_model()

    report = load_json(
        file_path=save_path,
        jq_schema=".data.attributes.sigma_analysis_results[]",
        content_key="rule_description",
    )
    qdrant_datamodel.create_collection(collection_name=collection_name)
    # The value returned by create_vectorstore is not needed here: callers
    # obtain their own handle afterwards, so no local is kept for it.
    qdrant_datamodel.create_vectorstore(
        documents=report, embedding=embedding, collection_name=collection_name
    )

### create vector database client

In [3]:
# Create the vector database client once; the cells below reuse it for the
# existence check and for opening the vectorstore.
client = qdrant_datamodel.create_client()

### get Hugging Face embedding model

In [4]:
# Embedding model used when opening the vectorstore further down.
# Note: ingest_collection() obtains its own instance via the same helper.
embedding = get_embedding_model()

### vector database ingestion

In [5]:
# Ingest only when the collection is missing. The "connect without create"
# message is reported solely when an existing collection is reused, so the
# output reflects which path actually ran.
if client.collection_exists(collection_name=COLLECTION_NAME):
    print(f"{'':=>20} {COLLECTION_NAME=} exist, connect without create.")
else:
    ingest_collection(collection_name=COLLECTION_NAME)
    print(f"{'':=>20} {COLLECTION_NAME=} created and ingested.")



### get vectorstore

In [6]:
# Open a vectorstore handle on the collection, reusing the shared client and
# embedding model created in the cells above.
vectorstore = qdrant_datamodel.get_vectorstore(
    embedding=embedding,
    collection_name=COLLECTION_NAME,
    client=client,
)

### init LLM model

In [7]:
# Language model (provided by src.models.llm) that the QA chain below is built on.
llm = get_llm_model()

### init QA model

In [8]:
# Build the retrieval QA chain: the vectorstore's default retriever supplies
# the documents and the chain is assembled with the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    llm=llm,
)

In [9]:
# Sample prompts sent through the QA chain, in order.
questions = [
    "Give me a short and precise summary about the report.",
    "Can you tell me about the malicious use of Microsoft Word and COM objects?",
]

for question in questions:
    result = qa.invoke(question)
    # The names `query` and `answer` are echoed verbatim by the `{name=}`
    # f-string specifier below, so they are part of the printed output.
    query, answer = result.get("query"), result.get("result")
    # Explicit "\n" escapes keep the trailing spaces before each line break
    # visible in the source; the printed text is a 20-character "=" rule,
    # then the query line, then the answer line.
    print(f"{'':=>20} \n{query=} \n{answer=}")

query='Give me a short and precise summary about the report.' 
answer='The report outlines indicators of potential malicious activity related to Microsoft Office applications and Windows startup processes. It highlights detection methods for files being created in the Windows startup directory, the spawning of Office applications by svchost processes as a means of creating malicious documents, and the potentially harmful file "wwlib.dll" associated with DLL sideloading.'
query='Can you tell me about the malicious use of Microsoft Word and COM objects?' 
answer="Malicious actors can exploit Microsoft Word and its COM objects to create and execute harmful Office documents, particularly through the use of macros. When Word generates an instance of a COM object, such as 'Word.Application', it can be leveraged to automatically spawn and execute scripts or executable files. This behavior can enable the creation of malicious Office documents on the fly, allowing attackers to bypass traditiona