In [1]:
import os
from pathlib import Path
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.http.models import (
    VectorParams,
    Distance,
    PointStruct,
    Fusion,
    Prefetch,
    SparseVector,
    SearchRequest, Vector, Filter
)
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.vectorstores import Qdrant
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langchain_core.documents.base import Document
from langchain_ollama import OllamaEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Config
FOLDER_PATH = Path("transcripts/")
files = list(FOLDER_PATH.glob("*.txt"))
COLLECTION_NAME = "dense_texts"
DENSE_MODEL = "bge-m3:latest" 

In [3]:
# Initialize models
embeddings = OllamaEmbeddings(model=DENSE_MODEL)
client = QdrantClient(path="dense")

In [4]:
documents = []
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
        documents.append({"content": text, "metadata": {"source": str(file)}})

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
chunks = []
for doc in documents:
    doc_chunks = text_splitter.split_text(doc["content"])
    for chunk in doc_chunks:
        chunks.append({"content": chunk, "metadata": doc["metadata"]})

In [6]:
chunks = [Document(page_content=chunk["content"],metadata=chunk["metadata"]) for chunk in chunks]

In [7]:
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

qdrant = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
    retrieval_mode=RetrievalMode.DENSE
)

In [9]:
qdrant.add_documents(documents=chunks)

['7b456ef5fbbc427eafb6d1cce86babf1',
 '8e881c9f72a74026b04a2bcc938816c4',
 '8b931430ef15497c90854b56942039a2',
 'b991d8511ca545a18392693122261b0d',
 '5b072f4848974614bcae60f98e947094',
 '7751dad8579f4760a45b11d8ab880281',
 'e5f3753529834f84af24e018da146bcc',
 '05eef6bde069499bbb96f70806946d83',
 '10b2b61324a4415ea65589548c37e4cd',
 '705ceb20b6774f9d8d81b57cb38ba2fc',
 'd30b136234874a33a6c51d8b0d43ed2c',
 'de210ae2de254a91af23f47c50d4fa6a',
 '283c33fb461042e089ff656ca9874217',
 'f9a5829b2b8a4eb2a36eef25d264034d',
 '370bf9866705491d96b5b20d5bf12894',
 '6bba98fae56e4769b689255e98a4c6da',
 '16102dc4aa29449d85936dacde5d086a',
 '246395c893e843ac9b410852c4cab121',
 '708fd46ee3e64ee8a7d49c22c97f2cc9',
 '9ac7e4a8d6c0486a89ca32f5450ba48a',
 '0f764984ad51484ca984eb653d89201a',
 'a105327656e94302805ee0b3c3456881',
 '8a335aa168924461aedf097e303110d3',
 '3aa05016d9494803bf12ee18ef17d2e0',
 '8586e53c21bf43ee9c5d5e184b54bc84',
 'dfa601408a954c38bce1cb69286e9231',
 'fde5740180dc4f768b0300a50a8b852b',
 

In [11]:
question = "Which city is best to buy right now?"

In [14]:
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 5})
relevant = retriever.invoke(question)

In [21]:
relevant

[Document(metadata={'source': 'transcripts\\Melbourne will Kill Your Portfolio Growth if you don’t know this!!! - With Simon Loo & Todd Sloan.txt', '_id': '006b8864ef5c4864b276e41b923b59ba', '_collection_name': 'dense_texts'}, page_content="make any decisions. Think about whether it lines with your long-term goals. Don't forget that Melbourne is not the only city that makes sense to buy in. I've talked about Perth. I've talked about Adelaide as well in previous episodes. And even though everyone's been saying these two cities have gone rubbish from one or even two years ago, statistics, data, everyone loves data except for me, but even I will concede that the data that shows these areas have grown a lot. In fact, probably more"),
 Document(metadata={'source': 'transcripts\\6 Months of Rising Property Prices The 3 Shocking Signals Everyone’s Missing [APS094].txt', '_id': '3c7cf2db9b57439787a4d03521a788c7', '_collection_name': 'dense_texts'}, page_content="be approaching a critical point

In [27]:
contents = [x.page_content for x in relevant]
sources = [x.metadata["source"] for x in relevant]

context = "\n\n---\n\n".join([c+" ##from video## "+s for c, s in zip(contents, sources)])

In [None]:
def retrieve(self, question):
    """Retrieve most relevant docs from vectorstore."""
    retriever = as_retriever(search_type="mmr", search_kwargs={"k": 5})
    relevant = retriever.invoke(question)

    self.relevant = relevant

    return relevant

def generate_prompt(self, question):
    """Build prompt."""
    relevant = self.retrieve(question)

    # combine all relevant docs
    context_text = "\n\n---\n\n".join([format_document(doc) for doc in relevant])

    # generate template
    template = self.system_prompt()
    prompt_template = ChatPromptTemplate.from_template(template)
    prompt = prompt_template.format(
        context=context_text,
        question=question
    )
    return prompt, [doc.page_content for doc in relevant]

def query(self, question):
    """Pass context to LLM and query."""
    prompt, relevant = self.generate_prompt(question)

    return self.model.invoke(prompt), relevant

## Query