In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

# Connection string for the pgvector database. It is read from the environment
# (populated from .env by load_dotenv() above) so real credentials never have
# to be committed with the notebook; the fallback is the local development
# default used throughout this notebook.
PG_CONNECT_STRING = os.getenv(
    "PG_CONNECT_STRING",
    "postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres.vectorstores import PGVector
from langchain_core.documents import Document
import uuid

In [3]:
# Load the raw text and split it into overlapping chunks.
# The encoding is passed explicitly (the same one the later TextLoader call on
# this file uses) so the load does not depend on the platform's default locale.
raw_documents = TextLoader('../../test.txt', encoding="utf-8").load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
chunks = text_splitter.split_documents(raw_documents)

# Embed the text of every chunk directly to inspect what the model returns.
embeddings_model = OpenAIEmbeddings()
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embeddings_model.embed_documents(chunk_texts)
print(len(embeddings))     # number of embedded chunks
print(len(embeddings[0]))  # length of a single embedding vector

799
1536


In [4]:
# Embed the chunks with embeddings_model and store them in Postgres.
# NOTE(review): no ids are supplied, so re-running this cell presumably inserts
# the same chunks again under fresh ids — confirm before relying on the
# collection contents.
db = PGVector.from_documents(
    documents=chunks,
    embedding=embeddings_model,
    connection=PG_CONNECT_STRING,
)

In [5]:
# Fetch the four stored chunks closest to the query text.
db.similarity_search(query="query", k=4)

[Document(id='e83da7bd-958f-46c0-9b75-0e7ecd06529a', metadata={'source': '../../test.txt'}, page_content='V.'),
 Document(id='3fd54062-2423-479f-bc11-e169f8cf3d4f', metadata={'source': '../../test.txt'}, page_content='V.'),
 Document(id='d62d25f3-9d6d-418e-bcc9-ce5c1c56904e', metadata={'source': '../../test.txt'}, page_content='V.'),
 Document(id='7b2c795d-ce0e-43bc-b47b-f9aaf435d858', metadata={'source': '../../test.txt'}, page_content='V.')]

In [6]:
# Generate one UUID per new document up front so the rows can be referenced
# (and deleted) by id afterwards.
ids = [str(uuid.uuid4()) for _ in range(2)]

pond_documents = [
    Document(
        page_content="there are cats in the pond",
        metadata={"location": "pond", "topic": "animal"},
    ),
    Document(
        page_content="ducks are also found in the pond",
        metadata={"location": "pond", "topic": "animals"},
    ),
]

# The call is the cell's last expression, so its return value is displayed.
db.add_documents(pond_documents, ids=ids)

['eedb17e0-5c36-4569-86a4-e29ae0ce91b6',
 '1dd65ed6-71a8-4247-a644-7c9dcdce8c9e']

In [7]:
# Remove only the second of the two documents added above, addressed by its id.
db.delete(ids=[ids[1]])

In [8]:
from langchain.indexes import SQLRecordManager, index

collection_name = "my_docs"
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
namespace = "my_docs_namespace"

# Vector store for the "my_docs" collection.
vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=PG_CONNECT_STRING,
    use_jsonb=True,
)

# The record manager uses the same database as the vector store, so it reuses
# PG_CONNECT_STRING instead of repeating the connection string literal.
record_manager = SQLRecordManager(
    namespace,
    db_url=PG_CONNECT_STRING,
)

In [9]:
# Ensure the record manager's schema exists before the first indexing run.
record_manager.create_schema()

# Two small example documents, each tagged with the source it came from.
cats_doc = Document(
    page_content='there are cats in the pond',
    metadata={"id": 1, "source": "cats.txt"},
)
ducks_doc = Document(
    page_content='ducks are also found in the pond',
    metadata={"id": 2, "source": "ducks.txt"},
)
docs = [cats_doc, ducks_doc]

# First indexing run. cleanup="incremental" guards against duplicate documents,
# and source_id_key names the metadata field used as each document's source id.
index_1 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

In [10]:
# Report the counters returned by the first indexing run.
print(f"Index attempt 1: {index_1}")

Index attempt 1: {'num_added': 1, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}


In [11]:
# Index the very same documents again: they are already recorded, so this run
# is expected to skip them rather than add them a second time.
index_2 = index(docs, record_manager, vectorstore, cleanup="incremental", source_id_key="source")
print(f"Index attempt 2: {index_2}")

Index attempt 2: {'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}


In [12]:
# Change the content of the first document and index again: the new version is
# written, and older versions that share its source are deleted.
first_doc = docs[0]
first_doc.page_content = "I just modified this document!"

index_3 = index(docs, record_manager, vectorstore, cleanup="incremental", source_id_key="source")

print(f"Index attempt 3: {index_3}")

Index attempt 3: {'num_added': 1, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}


## MultiVectorRetrieval

In [13]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
import uuid

In [15]:
# Settings for the collection that will hold the chunk summaries.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "summaries"
embeddings_model = OpenAIEmbeddings()

# Read the source text and report how many characters were loaded.
loader = TextLoader("../../test.txt", encoding="utf-8")
docs = loader.load()
print("length of loaded docs: ", len(docs[0].page_content))

# Break the text into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(docs)

# Chain that maps one chunk to one summary string: the chunk's text fills the
# {doc} slot of the prompt, the chat model answers, and the parser extracts
# the reply as plain text.
prompt_text = "Summarize the following document:\n\n{doc}"
prompt = ChatPromptTemplate.from_template(prompt_text)
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
summarize_chain = (
    {"doc": lambda chunk: chunk.page_content}
    | prompt
    | llm
    | StrOutputParser()
)

# Summarize every chunk, with at most five requests in flight at a time.
summaries = summarize_chain.batch(chunks, config={"max_concurrency": 5})

length of loaded docs:  624212


In [16]:
# Sanity check: inspect the container type returned by summarize_chain.batch().
type(summaries)

list

In [17]:
# Peek at the first generated summary.
summaries[0]

'Chapter 1 explores the rich cultural, social, and political aspects of life in ancient Greece, focusing on the importance of the polis, or city-state, in shaping Greek society. It discusses the communal living, intellectual pursuits, and artistic innovations that characterized ancient Greek civilization, as well as the social structure, education, religion, economy, and contributions to art and architecture that defined this remarkable society.'

In [18]:
# Vector store that indexes the summaries for similarity search.
# NOTE(review): this collection persists in Postgres while the docstore below
# lives only in this kernel. Summaries written by earlier runs presumably stay
# in the collection with doc_ids the fresh docstore does not contain — confirm,
# and consider clearing the collection before re-running.
vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)
# Storage layer for the original (full-size) chunks.
store = InMemoryStore()
id_key = "doc_id"

# The retriever searches the summaries in the vector store and maps each hit
# back to an original chunk through the id stored under id_key.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# summaries[i] is paired with chunks[i] purely by position, so the two lists
# must have the same length; fail loudly rather than link them wrongly.
if len(summaries) != len(chunks):
    raise ValueError(
        f"expected one summary per chunk, got {len(summaries)} summaries "
        f"for {len(chunks)} chunks"
    )

# One fresh id per chunk; the matching summary carries the same id in its
# metadata.
doc_ids = [str(uuid.uuid4()) for _ in chunks]
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

# Add the summaries to the vector store for similarity search ...
retriever.vectorstore.add_documents(summary_docs)

# ... and keep the original chunks in the document store under the same ids,
# so a summary hit can be resolved to the full chunk it describes.
retriever.docstore.mset(list(zip(doc_ids, chunks)))

# Querying the vector store directly returns the summaries themselves.
sub_docs = retriever.vectorstore.similarity_search(
    "chapter on philosophy", k=2)

In [19]:
# Display the summary documents returned by the vector-store similarity search.
sub_docs

[Document(id='b3d44e76-97d0-4570-9f8a-419ac1696ea9', metadata={'doc_id': '6527dddc-85ab-4239-8701-ca05dfbdd81d'}, page_content="This chapter discusses Plato, his teachers, and his times. It explores the evolution of philosophy and the high reputation of Plato's philosophy despite its extravagances. It also delves into the dialectic method used by Plato, his views on divine inspiration, and his criticism of Socratic ethics. The chapter also touches on Plato's relation to the Sophists and his philosophical teachings as embodied in his dialogues."),
 Document(id='56a420d8-140d-43a1-9de5-57c18337a183', metadata={'doc_id': 'b87fe0db-5e07-4c07-b15a-8cdbe6248e24'}, page_content='Chapter I discusses the early Greek thought, highlighting the strength and universality of the Greek intellect, the specialization of individual genius, and the pervading sense of harmony and union. It also explores the circumstances that shaped the intellectual character of the Greeks, emphasizing that philosophy was

In [20]:
# Going through the retriever (instead of the vector store) yields the larger
# source chunks rather than their summaries.
philosophy_query = "chapter on philosophy"
retrieved_docs = retriever.invoke(philosophy_query)

In [21]:
# Display the documents returned by retriever.invoke().
retrieved_docs

[Document(metadata={'source': '../../test.txt'}, page_content='CHAPTER IV.\n\nPLATO; HIS TEACHERS AND HIS TIMES    pages 171-213\n\nI. New meaning given to systems of philosophy by the method of\nevolution, 171—Extravagances of which Plato’s philosophy seems to be\nmade up, 172—The high reputation which it, nevertheless, continues\nto enjoy, 174—Distinction between speculative tendencies and the\nsystematic form under which they are transmitted, 174—Genuineness\nof the Platonic Dialogues, 175—Their chronological order, 177—They\nembody the substance of Plato’s philosophical teaching, 177.\n\nII. Wider application given to the dialectic method by Plato, 179—He\ngoes back to the initial doubt of Socrates, 180—To what extent\nhe shared in the religious reaction of his time, 181—He places\ndemonstrative reasoning above divine inspiration, 182—His criticism\nof the Socratic ethics, 183—Exceptional character of the _Crito_\naccounted for, 184—Traces of Sophistic influence, 185—General\nrelat

In [22]:
# RAGatouille is a library that makes it simple to use ColBERT
#! pip install -U ragatouille
#
# NOTE(review): in the recorded run this cell failed with
# "ImportError: cannot import name 'AdamW' from 'transformers'". Presumably the
# installed transformers release no longer exports AdamW, which something in
# the ragatouille import chain still expects — TODO confirm and pin a
# compatible transformers version before re-running.

from ragatouille import RAGPretrainedModel
# Load the pretrained ColBERT checkpoint named "colbert-ir/colbertv2.0".
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

import requests

def get_wikipedia_page(title: str, timeout: float = 10.0):
    """
    Retrieve the full plain-text content of a Wikipedia page.

    :param title: str - Title of the Wikipedia page.
    :param timeout: float - Seconds to wait for the Wikipedia API before
        giving up (forwarded to ``requests.get``).
    :return: str | None - Plain-text extract of the page, or None when the
        response lists no page or the page has no extract.
    :raises requests.HTTPError: if the API answers with an HTTP error status.
    """
    # Wikipedia API endpoint
    URL = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request: ask for the plain-text extract of `title`
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1"}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(URL, params=params, headers=headers, timeout=timeout)
    # Surface HTTP failures here instead of as a confusing parse/key error below.
    response.raise_for_status()
    data = response.json()

    # The response may lack "query"/"pages" (e.g. for an empty title); treat
    # that the same as a page without an extract and return None.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return page.get("extract")

full_document = get_wikipedia_page("Hayao_Miyazaki")
# get_wikipedia_page returns None when no extract is available; stop here with
# a clear message instead of handing None to the indexer.
if full_document is None:
    raise ValueError("Wikipedia returned no text for 'Hayao_Miyazaki'")

## Create an index
RAG.index(
    collection=[full_document],
    index_name="Miyazaki-123",
    max_document_length=180,
    split_documents=True,
)

# Query the index directly
question = "What animation studio did Miyazaki found?"
results = RAG.search(query=question, k=3)
# Only a cell's last expression is echoed, so print the direct search results
# explicitly; a bare `results` in the middle of the cell shows nothing.
print(results)

# Utilize the index through a LangChain retriever
retriever = RAG.as_langchain_retriever(k=3)
retriever.invoke(question)

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'AdamW' from 'transformers' (/home/chenyang/Git/learning-langchain/.venv/lib/python3.12/site-packages/transformers/__init__.py)