In [1]:
!pip install pinecone-client langchain datasets openai tiktoken

Collecting pinecone-client
  Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.330-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
from datasets import load_dataset

# Load the "train" split of jamescalam/ai-arxiv-chunked. Per the summary
# printed below, each row carries a text `chunk` plus paper-level fields
# (title, source, id, chunk-id, ...).
data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
# Bare expression so the notebook renders the dataset summary.
data

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/153M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

In [3]:
from langchain.docstore.document import Document

def _row_to_document(row) -> Document:
    """Turn one dataset row into a LangChain ``Document``.

    The chunk text is stored twice on purpose: as ``page_content`` and again
    under the ``"text"`` metadata key, which is the key the vector store cell
    later reads the passage text from.
    """
    chunk_text = row["chunk"]
    return Document(
        page_content=chunk_text,
        metadata={
            "title": row["title"],
            "source": row["source"],
            "id": row["id"],
            "chunk-id": row["chunk-id"],
            "text": chunk_text,
        },
    )


# One Document per dataset row, in dataset order.
docs = [_row_to_document(row) for row in data]

In [4]:
import os
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = "text-embedding-ada-002"

# Never store the key in the notebook: prefer the OPENAI_API_KEY environment
# variable and fall back to a hidden interactive prompt.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Embedding client shared by the upsert loop (embed_documents) and the
# vector store (embed_query).
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
    # NOTE(review): an empty tuple presumably disables the tokenizer's
    # special-token rejection so arbitrary paper text can be embedded —
    # confirm against the OpenAIEmbeddings / tiktoken documentation.
    disallowed_special=()
)

In [6]:
import pinecone
import time

index_name = "langchain-multi-query-demo"

# Credentials are read from the environment, with an interactive fallback, so
# they are never saved in the notebook. `os` and `getpass` are imported in the
# embeddings cell above.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment: ")

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

# Create the index only if it does not exist yet, so re-running this cell is
# safe.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric="cosine",
        # Must equal the length of the embedding vectors that will be
        # upserted; 1536 is the size used here for text-embedding-ada-002.
        dimension=1536
    )

    # Poll once per second until the new index reports ready. There is no
    # timeout: interrupt the cell if it never becomes ready.
    while not pinecone.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Handle used by the upsert loop and by the LangChain vector store.
index = pinecone.Index(index_name)

In [7]:
# Sanity check: the loop above appends one Document per dataset row.
len(docs)

41584

In [9]:
# Keep only the first MAX_DOCS documents; every later step (embedding,
# upsert, retrieval) works on this subset. Slicing is idempotent, so
# re-running this cell is harmless.
MAX_DOCS = 1000
docs = docs[:MAX_DOCS]

In [10]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_size = 64

# Embed the corpus and upsert it into the index one fixed-size slice at a time.
for start in tqdm(range(0, len(docs), batch_size)):
    # Slicing past the end of a list is clamped, so the final short batch
    # needs no explicit upper bound.
    chunk = docs[start:start + batch_size]

    vector_ids, passages, payloads = [], [], []
    for item in chunk:
        meta = item.metadata
        # Vector id has the form "<id>-<chunk-id>".
        vector_ids.append(f"{meta['id']}-{meta['chunk-id']}")
        passages.append(item.page_content)
        payloads.append(meta)

    vectors = embed.embed_documents(texts=passages)
    # Each upserted item is an (id, embedding, metadata) triple.
    index.upsert(vectors=zip(vector_ids, vectors, payloads))

  0%|          | 0/16 [00:00<?, ?it/s]

In [11]:
from langchain.vectorstores import Pinecone

# Metadata key under which each vector's passage text was stored when the
# Documents were built ("text": row["chunk"]).
text_field = "text"

# Wrap the Pinecone index as a LangChain vector store.
# NOTE(review): the arguments are positional — presumably (index, embedding
# function, text key); confirm against the langchain Pinecone signature.
vectorstore = Pinecone(
    index,
    embed.embed_query,
    text_field
)



In [12]:
from langchain.chat_models import ChatOpenAI

# Chat model used for query generation and for answering.
# No model name is passed, so the library default is used.
llm = ChatOpenAI(
    temperature=0,
    openai_api_key=OPENAI_API_KEY
)

In [13]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Multi-query retriever built with the library's defaults (no custom prompt
# or parser); a hand-configured variant replaces it later in the notebook.
retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm
)

In [14]:
import logging

# Surface the retriever's INFO logs so the generated search queries are
# printed alongside the results ("Generated queries: [...]").
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [15]:
question = "Tell me about llama2?"

# NOTE(review): this rebinds `docs`, which until now held the corpus used by
# the upsert cell. Re-running the upsert cell after this one would index the
# retrieved documents instead. Later cells read `docs` as the retrieved
# documents, so the name is kept here.
docs = retriever.get_relevant_documents(query=question)
# Number of documents returned for the question.
len(docs)

KeyboardInterrupt: ignored

In [16]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt text for the answering step. The leading newline and the four-space
# indentation are part of the string that is sent to the model.
QA_TEMPLATE = """
    You are a helpful assistant who answers user queries using the
    contexts provided. If the question cannot be answered using the information
    provided say "I don't know".

    Contexts:
    {contexts}

    Question:
    {query}
    """

# The template's two placeholders are {query} and {contexts}.
QA_PROMPT = PromptTemplate(
    template=QA_TEMPLATE,
    input_variables=["query", "contexts"],
)

# Chain that fills the prompt and sends it to the chat model.
qa_chain = LLMChain(prompt=QA_PROMPT, llm=llm)

In [17]:
# Concatenate the retrieved passages, separated by a "---" line, into the
# single string the prompt's {contexts} slot expects.
joined_contexts = "\n---\n".join(d.page_content for d in docs)

output = qa_chain(inputs={"query": question, "contexts": joined_contexts})

# The answer is stored under the "text" key.
output["text"]



KeyboardInterrupt: ignored

In [18]:
from langchain.chains import TransformChain

def retrieval_transform(inputs: dict) -> dict:
    """Retrieve passages for ``inputs["question"]`` and pack them for the QA prompt.

    The module-level ``retriever`` is looked up when the function is called,
    so rebinding that name later changes which retriever is used.

    Args:
        inputs: Mapping that must contain a ``"question"`` entry.

    Returns:
        A dict with ``"query"`` (the question, unchanged) and ``"contexts"``
        (the ``page_content`` of every retrieved document, joined with a
        ``---`` separator line).

    Raises:
        KeyError: If ``inputs`` has no ``"question"`` key.
    """
    question = inputs["question"]
    hits = retriever.get_relevant_documents(query=question)
    return {
        "query": question,
        "contexts": "\n---\n".join(hit.page_content for hit in hits),
    }

# Expose retrieval_transform as a chain step: it consumes "question" and
# produces the "query" and "contexts" keys that QA_PROMPT declares as inputs.
retrieval_chain = TransformChain(
    input_variables=["question"],
    output_variables=["query", "contexts"],
    transform=retrieval_transform
)

In [19]:
from langchain.chains import SequentialChain

# End-to-end pipeline: retrieval step first, then the QA chain. The result
# exposes the intermediate "query"/"contexts" as well as the answer "text".
rag_chain = SequentialChain(
    chains=[retrieval_chain, qa_chain],
    input_variables=["question"],
    output_variables=["query", "contexts", "text"]
)

In [20]:
# Run retrieval + answering in a single call and show the answer text.
output = rag_chain({"question": question})
output["text"]

KeyboardInterrupt: ignored

In [21]:
from typing import List
from langchain.chains import LLMChain
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Container for the parsed LLM output: one string per generated search query.
# The field name "lines" is what the retriever's parser_key refers to.
class LineList(BaseModel):
    lines: List[str] = Field(description="Lines of text")

class LineListOutputParser(PydanticOutputParser):
    """Parse raw LLM text into a ``LineList``, one entry per non-blank line."""

    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        """Split ``text`` into lines and wrap them in a ``LineList``.

        Blank and whitespace-only lines are dropped, and surrounding
        whitespace is trimmed from each kept line. This way a model reply
        that separates its queries with empty lines (or an empty reply) never
        produces an empty search query.

        Args:
            text: Raw completion text, expected to hold one query per line.

        Returns:
            A ``LineList`` whose ``lines`` are the non-empty lines in order;
            ``lines`` is empty when ``text`` contains no non-blank line.
        """
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return LineList(lines=lines)

# Parser that turns the model's newline-separated reply into a LineList.
output_parser = LineListOutputParser()

# Query-generation prompt: asks for 3 alternative search queries, separated
# by newlines (the format the parser above splits on). {question} is the only
# placeholder.
template = """
Your task is to generate 3 different search queries that aim to
answer the user question from multiple perspectives. The user questions
are focused on Large Language Models, Machine Learning, and related
disciplines.
Each query MUST tackle the question from a different viewpoint, we
want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}
"""

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=template
)
# NOTE(review): this rebuilds `llm` with the same arguments as the earlier
# ChatOpenAI cell, so it only matters if that cell was not run.
llm = ChatOpenAI(
    temperature=0,
    openai_api_key=OPENAI_API_KEY
)

# Chain that fills QUERY_PROMPT, calls the model, and parses the reply into a
# LineList via output_parser.
llm_chain = LLMChain(
    llm=llm,
    prompt=QUERY_PROMPT,
    output_parser=output_parser
)

In [22]:
# Replace the default multi-query retriever with one driven by the custom
# query-generation chain. parser_key names the LineList field ("lines") that
# holds the generated queries.
# Rebinding `retriever` also changes what retrieval_transform uses, because
# that function reads the global when it is called.
retriever = MultiQueryRetriever(
    retriever=vectorstore.as_retriever(),
    llm_chain=llm_chain,
    parser_key="lines"
)

# NOTE(review): `docs` is reused for the retrieved documents here, not the
# corpus built at the top of the notebook.
docs = retriever.get_relevant_documents(
    query=question
)
# Number of documents returned across the generated queries.
len(docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What are the applications and use cases of large language models like llama2 in machine learning?', '2. How does llama2 compare to other large language models in terms of performance and capabilities?', '3. What are the latest advancements and research developments in the field of large language models, specifically focusing on llama2?']


9

In [23]:
# Display the retrieved documents in full.
# NOTE(review): in the recorded output the 'id' metadata comes back as
# datetime.date(2209, 11, 5) for source .../2209.07686, i.e. the id string
# appears to have been coerced to a date somewhere in the Pinecone round
# trip — verify before relying on metadata['id'].
docs

[Document(page_content='Zevenbergen, Vinodkumar Prabhakaran, Mark Diaz, Ben Hutchinson, Kristen Olson, Alejandra Molina,\nErin Hoffman-John, Josh Lee, Lora Aroyo, Ravi Rajakumar, Alena Butryna, Matthew Lamm, Viktoriya\nKuzmina, Joe Fenton, Aaron Cohen, Rachel Bernstein, Ray Kurzweil, Blaise Aguera-Arcas, Claire Cui,\nMarian Croak, Ed Chi, and Quoc Le. LaMDA: Language Models for Dialog Applications. arXiv preprint\narXiv:2201.08239 , 2022.\nAshish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz\nKaiser, and Illia Polosukhin. Attention is All you Need. In NeurIPS , 2017.\nSahil Verma, John Dickerson, and Keegan Hines. Counterfactual Explanations for Machine Learning: A\nReview. arXiv preprint arXiv:2010.10596 , 2020.\nGeorge Walkden. English VP-fronting and the Syntax of Yoda. talk given at LinguistMix , 2012.', metadata={'chunk-id': '101', 'id': datetime.date(2209, 11, 5), 'source': 'http://arxiv.org/pdf/2209.07686', 'title': 'Text and Patterns: 

In [24]:
# Rebuild the retrieval step and the end-to-end chain with the same
# arguments as the earlier cells.
# NOTE(review): retrieval_transform reads the global `retriever` when it is
# called, so the chains built earlier would presumably already use the new
# retriever; this rebuild looks redundant but harmless — confirm before
# removing it.
retrieval_chain = TransformChain(
    input_variables=["question"],
    output_variables=["query", "contexts"],
    transform=retrieval_transform
)

rag_chain = SequentialChain(
    chains=[retrieval_chain, qa_chain],
    input_variables=["question"],
    output_variables=["query", "contexts", "text"]
)

In [25]:
# Ask the same question through the pipeline that now uses the custom
# retriever.
# NOTE(review): the recorded answer is "I don't know ..."; only a
# 1000-document slice of the corpus was indexed, which may be why — verify.
output = rag_chain({"question": question})
output["text"]

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What are the applications and use cases of large language models like llama2 in machine learning?', '2. How does llama2 compare to other large language models in terms of performance and capabilities?', '3. What are the latest advancements and research developments in the field of large language models, specifically focusing on llama2?']


"I don't know about llama2 based on the given information."

In [26]:
# Clean-up: delete the demo index created earlier in the notebook. After this
# runs, the `index` / `vectorstore` objects above refer to a deleted index.
pinecone.delete_index(index_name)