# RAG Pipeline — E-Commerce Revenue Analysis
Qdrant + HuggingFace + LangChain

In [None]:
import os
from pathlib import Path
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

## 1. Load Environment Variables

In [None]:
# Project root is taken from the current working directory because notebooks
# have no __file__ to resolve paths against.
ROOT_DIR = Path.cwd()

# Load key/value pairs from the project's .env file into the process environment.
ENV_PATH = ROOT_DIR / ".env"
load_dotenv(dotenv_path=ENV_PATH)

# Config — read once here so later cells share the same values.
HF_HOME = os.getenv("HF_HOME")  # optional; not validated below
HF_TOKEN = os.getenv("HF_TOKEN")
Qdrant_API_KEY = os.getenv("Qdrant_API_KEY")
Qdrant_Cluster_Endpoint = os.getenv("Qdrant_Cluster_Endpoint")

# Validate with an explicit raise instead of `assert`: assertions are stripped
# when Python runs with -O, and collecting every missing key at once saves the
# user from fixing them one failed run at a time.
_required_settings = {
    "Qdrant_Cluster_Endpoint": Qdrant_Cluster_Endpoint,
    "Qdrant_API_KEY": Qdrant_API_KEY,
    "HF_TOKEN": HF_TOKEN,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        f"Missing required setting(s): {', '.join(_missing_settings)}. "
        f"Set them in {ENV_PATH} or in the process environment."
    )

print("Environment variables loaded successfully.")

## 2. Connect to Qdrant Cloud

In [None]:
# Build the Qdrant Cloud client from the endpoint and API key read from the environment.
qdrant_client = QdrantClient(
    url=Qdrant_Cluster_Endpoint,
    api_key=Qdrant_API_KEY,
)

# Issue a real request before announcing success, so a wrong URL or invalid key
# surfaces as an error here rather than after a misleading "connected" message.
collections = qdrant_client.get_collections()

print(f"Connected to Qdrant cloud at {Qdrant_Cluster_Endpoint} successfully.")
print(f"Collections: {collections}")

## 3. Define Embedding Model

In [None]:
# Sentence-transformers checkpoint used for embeddings. Per the original note,
# the model runs locally, so no HuggingFace token is needed for this step.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)

print("Embedding model loaded.")

## 4. Load or Create Vector Store

In [None]:
# Single source of truth for the collection name and the PDF that seeds it.
COLLECTION_NAME = "ecommerce_revenue_analysis"
PDF_PATH = ROOT_DIR / "ai_engineer" / "FAISS" / "Docs" / "PDF_ecommerce_revenue.pdf"

# Ask Qdrant directly whether the collection exists instead of catching a broad
# Exception: authentication, network, or configuration errors must not be
# mistaken for "collection missing" and silently trigger a re-ingestion.
if qdrant_client.collection_exists(COLLECTION_NAME):
    # The constructor accepts an already-built client, so the cloud connection
    # created earlier is reused.
    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=COLLECTION_NAME,
        embedding=embedding,
    )
    print("Vector store loaded from existing collection.")
else:
    print(f"Collection '{COLLECTION_NAME}' not found. Creating from documents...")

    # Fail with a clear message if the source PDF is not where we expect it.
    if not PDF_PATH.is_file():
        raise FileNotFoundError(f"Source PDF not found: {PDF_PATH}")

    documents = PyPDFLoader(str(PDF_PATH)).load()

    # Split pages into overlapping chunks sized for retrieval.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    if not chunks:
        raise ValueError(f"No text could be extracted from {PDF_PATH}")

    # NOTE(review): from_documents builds its own client from connection
    # parameters rather than taking a `client` object, so the cloud endpoint
    # and key are passed explicitly — confirm against the installed
    # langchain_qdrant version.
    vector_store = QdrantVectorStore.from_documents(
        documents=chunks,
        embedding=embedding,
        url=Qdrant_Cluster_Endpoint,
        api_key=Qdrant_API_KEY,
        collection_name=COLLECTION_NAME,
    )
    print("Vector store created from documents.")

## 5. Load LLM from HuggingFace Hub

In [None]:
# Generation settings for the hosted model, gathered in one place.
# NOTE(review): do_sample=False together with temperature=0.5 looks
# contradictory — confirm which of the two the endpoint actually honours.
endpoint_settings = dict(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    temperature=0.5,
    huggingfacehub_api_token=HF_TOKEN,
)
endpoint = HuggingFaceEndpoint(**endpoint_settings)

# Wrap the raw text-generation endpoint in the chat-model interface used by the chain.
llm = ChatHuggingFace(llm=endpoint)

print("LLM loaded successfully.")

## 6. Build RAG Chain

In [None]:
# Fetch the 3 most similar chunks for each question.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})


def _format_docs(docs):
    """Render retrieved chunks as plain text, each tagged with its source page.

    Without this step the prompt would receive the raw list of Document
    objects, i.e. their repr, instead of readable text.

    Args:
        docs: Documents returned by the retriever.

    Returns:
        The chunks' ``page_content`` joined by blank lines. Each chunk is
        preceded by the ``page`` value from its metadata, or "unknown" when
        that key is absent.
    """
    # NOTE(review): the "page" metadata key is presumably the one written by
    # PyPDFLoader at ingestion — confirm its indexing base before showing it to users.
    return "\n\n".join(
        f"[source page: {doc.metadata.get('page', 'unknown')}]\n{doc.page_content}"
        for doc in docs
    )


# The instructions put the document context first and allow clearly labelled
# assumptions as a fallback. The earlier wording demanded "ONLY the context",
# then asked for hypotheses, then forbade hallucination, which contradict each other.
prompt = PromptTemplate.from_template("""
You are a senior e-commerce business analyst.
Answer the question using the document context below as your primary source.
If the context is insufficient, you may add hypotheses drawn from your own knowledge and reasoning, but label each one clearly as an assumption.

Context:
{context}

Question: {question}

Notes:
1. Write an explanation of at least 350 words.
2. Explain your reasoning step by step, including any assumptions made and how you arrived at the conclusion.
3. Provide detailed analysis and insights based on the retrieved context.
4. If the context lacks data or figures relevant to the question, infer plausible values as a senior business analyst and state clearly that they are inferred.
5. Ground your statements in the document and cite specific data or figures with their source page where available; where none exist, infer values from industry standards and trends and state clearly that these are assumptions.
6. Do not hallucinate: never present an assumption or inferred figure as if it came from the document.
""")

# Pipeline: retrieve -> format chunks as text -> fill prompt -> generate -> plain string.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG Chain created successfully.")

## 7. Run a Query

In [None]:
# Example business question to send through the RAG chain.
question = "What are the main revenue drivers?"

# Invoke the chain with the question and show the generated answer.
response = rag_chain.invoke(input=question)
print(response)