In [1]:
import os
from dotenv import load_dotenv
from langchain.chat_models import  ChatOpenAI



In [2]:
load_dotenv()

True

In [4]:
chat = ChatOpenAI(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    model = "gpt-3.5-turbo"
)

In [5]:
chat

ChatOpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-3.5-turbo', temperature=0.7, model_kwargs={}, openai_api_key='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', openai_api_base='', openai_organization='', openai_proxy='', request_timeout=None, max_retries=6, streaming=False, n=1, max_tokens=None, tiktoken_model_name=None)

In [6]:
from langchain.schema import SystemMessage, HumanMessage, AIMessage

In [7]:
messages = [
    SystemMessage(content="You are a helpful assistant. Your name is Friday"),
    HumanMessage(content= "Hi Friday, how are you today?"),
    SystemMessage(content="Hi, I'm great thank you. How can I help you?"),
    HumanMessage(content="I would like to understand God particle.")
]

In [8]:
res = chat(messages)
res

AIMessage(content='The "God particle" is a term used to refer to the Higgs boson, a particle that was discovered in 2012 at the Large Hadron Collider (LHC) in Geneva, Switzerland. The Higgs boson is a fundamental particle in the Standard Model of particle physics, which is a theory that describes the fundamental particles and forces in the universe.\n\nThe Higgs boson is particularly interesting because it is associated with the Higgs field, which is believed to give particles their mass. According to the Standard Model, all particles acquire mass by interacting with this field. The Higgs boson is the particle associated with the Higgs field, and its discovery confirmed the existence of this field.\n\nThe particle is nicknamed the "God particle" due to the book title "The God Particle: If the Universe is the Answer, What is the Question?" by physicist Leon Lederman. The nickname was chosen for marketing purposes, but many scientists do not use it because it can be misleading and create

In [9]:
from datasets import load_dataset
import pinecone

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
pinecone.init(
    api_key=os.environ.get("PINCONE_API_KEY"),
    environment="gcp-starter"
)

In [11]:
import time

In [14]:
index_name = 'llama-2-rag'
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric='cosine'
    )

while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pinecone.Index(index_name)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [15]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [16]:
embed_model  = OpenAIEmbeddings(
    model = "text-embedding-ada-002"
)

In [17]:
texts = ["sun rises in the east", "gravity is not a force"]

In [19]:
res = embed_model.embed_documents(texts)

In [20]:
len(res), len(res[0])

(2, 1536)

In [21]:
dataset = load_dataset(
    "jamescalam/llama-2-arxiv-papers-chunked",
    split="train"
)

Downloading readme: 100%|██████████| 409/409 [00:00<00:00, 608kB/s]
Downloading data: 100%|██████████| 14.4M/14.4M [00:06<00:00, 2.12MB/s]
Downloading data files: 100%|██████████| 1/1 [00:06<00:00,  6.84s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 424.65it/s]
Generating train split: 4838 examples [00:00, 47941.36 examples/s]


In [22]:
dataset

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [23]:
dataset.features

{'doi': Value(dtype='string', id=None),
 'chunk-id': Value(dtype='string', id=None),
 'chunk': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'summary': Value(dtype='string', id=None),
 'source': Value(dtype='string', id=None),
 'authors': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'categories': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'comment': Value(dtype='string', id=None),
 'journal_ref': Value(dtype='string', id=None),
 'primary_category': Value(dtype='string', id=None),
 'published': Value(dtype='string', id=None),
 'updated': Value(dtype='string', id=None),
 'references': [{'id': Value(dtype='string', id=None),
   'title': Value(dtype='string', id=None),
   'authors': Value(dtype='string', id=None),
   'year': Value(dtype='string', id=None)}]}

In [24]:
len(dataset)

4838

In [25]:
df = dataset.to_pandas()

In [27]:
df.head(5)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
2,1102.0183,2,promising architectures for such tasks. The mo...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
3,1102.0183,3,"Mutch and Lowe, 2008), whose lters are xed, ...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
4,1102.0183,4,We evaluate various networks on the handwritte...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


In [33]:
df.chunk[2].split("\n")

['promising architectures for such tasks. The most successful hierarchical object recognition systems',
 'all extract localized features from input images, convolving image patches with \x0clters. Filter',
 'responses are then repeatedly sub-sampled and re-\x0cltered, resulting in a deep feed-forward network',
 'architecture whose output feature vectors are eventually classi\x0ced. One of the \x0crst hierarchical',
 'neural systems was the Neocognitron (Fukushima, 1980) which inspired many of the more recent',
 'variants.',
 'Unsupervised learning methods applied to patches of natural images tend to produce localized',
 '\x0clters that resemble o\x0b-center-on-surround \x0clters, orientation-sensitive bar detectors, Gabor \x0clters',
 '(Schmidhuber et al. , 1996; Olshausen and Field, 1997; Hoyer and Hyv\x7f arinen, 2000). These \x0cndings',
 'in conjunction with experimental studies of the visual cortex justify the use of such \x0clters in the',
 'so-called standard model for object re

In [49]:
df1 = df.groupby("id").get_group('1102.0183')# each row contains a  some amount of text from a paper. The paper can be keyed using the id column

In [48]:
df1.head(5)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
2,1102.0183,2,promising architectures for such tasks. The mo...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
3,1102.0183,3,"Mutch and Lowe, 2008), whose lters are xed, ...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
4,1102.0183,4,We evaluate various networks on the handwritte...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


In [54]:
# let's find the length of the each chunk
len(df1.chunk[0]), len(df1.chunk[1]), len(df1.chunk[2]), len(df1.chunk[3])

(1090, 1308, 1077, 1246)

In [63]:
df1.iloc[0:3]

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
2,1102.0183,2,promising architectures for such tasks. The mo...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


In [64]:
from tqdm.auto import tqdm  # for progress bar

data = dataset.to_pandas()  # this makes it easier to iterate over the dataset

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [f"{x['doi']}-{x['chunk-id']}" for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['chunk'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'text': x['chunk'],
         'source': x['source'],
         'title': x['title']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

100%|██████████| 49/49 [04:32<00:00,  5.56s/it]


In [67]:
from langchain.vectorstores import Pinecone

In [68]:
text_field="text"
vectorstore = Pinecone(
    index, embed_model.embed_query,
    text_field
    )



In [69]:
query = "What is special about llama2?"

In [70]:
vectorstore.similarity_search(
    query,
    k=3
)

[Document(page_content='Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang\nRoss Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang\nAngela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic\nSergey Edunov Thomas Scialom\x03\nGenAI, Meta\nAbstract\nIn this work, we develop and release Llama 2, a collection of pretrained and ﬁne-tuned\nlarge language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.\nOur ﬁne-tuned LLMs, called L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc , are optimized for dialogue use cases. Our\nmodels outperform open-source chat models on most benchmarks we tested, and based on\nourhumanevaluationsforhelpfulnessandsafety,maybeasuitablesubstituteforclosedsource models. We provide a detailed description of our approach to ﬁne-tuning and safety', metadata={'source': 'http://arxiv.org/pdf/2307.09288', 'title': 'Llama 2: Open Foundation and Fine-Tun

In [79]:
def augment_prompt(query: str):
    results = vectorstore.similarity_search(query, k=3)
    source_knowledge = "\n".join([x.page_content for x in results])
    augment_prompt = f""" Using the context below, answer the query.

context: {source_knowledge}

query: {query}
"""
    return augment_prompt

In [80]:
augment_prompt(query)

' Using the context below, answer the query.\n\ncontext: Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang\nRoss Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang\nAngela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic\nSergey Edunov Thomas Scialom\x03\nGenAI, Meta\nAbstract\nIn this work, we develop and release Llama 2, a collection of pretrained and ﬁne-tuned\nlarge language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.\nOur ﬁne-tuned LLMs, called L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc , are optimized for dialogue use cases. Our\nmodels outperform open-source chat models on most benchmarks we tested, and based on\nourhumanevaluationsforhelpfulnessandsafety,maybeasuitablesubstituteforclosedsource models. We provide a detailed description of our approach to ﬁne-tuning and safety\nasChatGPT,BARD,andClaude. TheseclosedproductLLMsareheavilyﬁne-tunedtoa

In [94]:
prompt = HumanMessage(content=augment_prompt(query))

In [95]:
messages.append(prompt)

In [96]:
messages

[SystemMessage(content='You are a helpful assistant. Your name is Friday', additional_kwargs={}),
 HumanMessage(content='Hi Friday, how are you today?', additional_kwargs={}, example=False),
 SystemMessage(content="Hi, I'm great thank you. How can I help you?", additional_kwargs={}),
 HumanMessage(content='I would like to understand God particle.', additional_kwargs={}, example=False),
 HumanMessage(content=' Using the context below, answer the query.\n\ncontext: Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang\nRoss Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang\nAngela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic\nSergey Edunov Thomas Scialom\x03\nGenAI, Meta\nAbstract\nIn this work, we develop and release Llama 2, a collection of pretrained and ﬁne-tuned\nlarge language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.\nOur ﬁne-tuned LLMs, called L/l.sc/a.sc/m

In [97]:
chat(messages)

AIMessage(content='The passage states that Llama 2 is a collection of pretrained and fine-tuned large language models (LLMs) ranging from 7 billion to 70 billion parameters. These models, specifically L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc, are optimized for dialogue use cases and outperform open-source chat models on most benchmarks. They are intended for commercial and research use in English and can be adapted for various natural language generation tasks. Llama 2 models also undergo fine-tuning and safety measures to align with human preferences and enhance usability and safety.', additional_kwargs={}, example=False)

In [98]:
def answer_me(query):
    prompt = augment_prompt(query)
    messages.append(HumanMessage(content=prompt))
    return chat(messages).content
    

In [99]:
answer_me(query)

'Llama 2 is a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. The fine-tuned LLMs, specifically L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc, are optimized for dialogue use cases. They have been shown to outperform open-source chat models on most benchmarks and may serve as a suitable substitute for closed-source models. The closed-source LLMs, such as ChatGPT, BARD, and Claude, have been heavily fine-tuned to align with human preferences, enhancing their usability and safety. Llama 2 models are intended for commercial and research use in English, with tuned models suitable for assistant-like chat and pretrained models adaptable to various natural language generation tasks.'

In [100]:
def chat_me():
    while(True):
        query = input()
        prompt = augment_prompt(query)
        messages.append(HumanMessage(content=prompt))
        res = chat(messages)
        print(res.content)  

In [102]:
messages

[SystemMessage(content='You are a helpful assistant. Your name is Friday', additional_kwargs={}),
 HumanMessage(content='Hi Friday, how are you today?', additional_kwargs={}, example=False),
 SystemMessage(content="Hi, I'm great thank you. How can I help you?", additional_kwargs={}),
 HumanMessage(content='I would like to understand God particle.', additional_kwargs={}, example=False),
 HumanMessage(content=' Using the context below, answer the query.\n\ncontext: Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang\nRoss Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang\nAngela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic\nSergey Edunov Thomas Scialom\x03\nGenAI, Meta\nAbstract\nIn this work, we develop and release Llama 2, a collection of pretrained and ﬁne-tuned\nlarge language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.\nOur ﬁne-tuned LLMs, called L/l.sc/a.sc/m