In [31]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [32]:
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [33]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.7,
)

In [34]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('1706.03762v7.pdf')
document = loader.load()

In [35]:
from langchain_text_splitters import NLTKTextSplitter
text_splitter = NLTKTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

In [36]:
split_documents = text_splitter.split_documents(document)

In [37]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

In [42]:
from langchain_community.vectorstores import Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": "https://in03-945f9d7d1a53526.serverless.aws-eu-central-1.cloud.zilliz.com", "token": os.getenv("MILVUS_API_KEY")},
    index_params={"index_type": "FLAT", "metric_type": "L2"},
    auto_id=True
)

In [45]:
vector_store

<langchain_community.vectorstores.milvus.Milvus at 0x1abdea667d0>

In [None]:
from langchain_community.vectorstores import Milvus
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import NLTKTextSplitter
import uuid
import os

# 1. Load PDF
loader = PyPDFLoader("1706.03762v7.pdf")
documents = loader.load()

# 2. Split
text_splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(documents)

# 3. Clean metadata
def clean_metadata(metadata):
    return {
        k.replace(".", "_").replace("-", "_"): v
        for k, v in metadata.items()
    }

for doc in split_documents:
    doc.metadata = clean_metadata(doc.metadata)

# 4. Embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# 5. Milvus config
vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={
        "uri": "",
        "token": os.getenv("MILVUS_API_KEY")
    },
    collection_name="my_pdf_docs",
    index_params={"index_type": "HNSW", "metric_type": "L2"},
    search_params={"ef": 128},  
    auto_id=True
)

# 6. Store
texts = [doc.page_content for doc in split_documents]
metadatas = [doc.metadata for doc in split_documents]
ids = [str(uuid.uuid4()) for _ in texts]

vector_store.add_texts(texts=texts, metadatas=metadatas)


[459601663938464653,
 459601663938464654,
 459601663938464655,
 459601663938464656,
 459601663938464657,
 459601663938464658,
 459601663938464659,
 459601663938464660,
 459601663938464661,
 459601663938464662,
 459601663938464663,
 459601663938464664,
 459601663938464665,
 459601663938464666,
 459601663938464667,
 459601663938464668,
 459601663938464669,
 459601663938464670,
 459601663938464671,
 459601663938464672,
 459601663938464673,
 459601663938464674,
 459601663938464675,
 459601663938464676,
 459601663938464677,
 459601663938464678,
 459601663938464679,
 459601663938464680,
 459601663938464681,
 459601663938464682,
 459601663938464683,
 459601663938464684,
 459601663938464685,
 459601663938464686,
 459601663938464687,
 459601663938464688,
 459601663938464689,
 459601663938464690,
 459601663938464691,
 459601663938464692,
 459601663938464693,
 459601663938464694,
 459601663938464695,
 459601663938464696,
 459601663938464697,
 459601663938464698,
 459601663938464699,
 459601663938

In [18]:
results = vector_store.similarity_search("who is the author?")
results

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex_fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '1706.03762v7.pdf', 'total_pages': 15, 'page': 10, 'page_label': '11', 'pk': 459601663938464695}, page_content='Convolu-\ntional sequence to sequence learning.\n\narXiv preprint arXiv:1705.03122v2, 2017.\n\n[10] Alex Graves.\n\nGenerating sequences with recurrent neural networks.\n\narXiv preprint\narXiv:1308.0850, 2013.\n\n[11] Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.\n\nDeep residual learning for im-\nage recognition.\n\nIn Proceedings of the IEEE Conference on Computer Vision and Pattern\nRecognition, pages 770–778, 2016.\n\n[12] Sepp Hochreiter, Yoshua Bengio, Paolo Frasconi, and Jürgen Schmidhuber.\n\nGradi

In [19]:
retreiver = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})