# Article Unraveler

An end-to-end chat based app that enables Q&A with multiple online articles as sources.

## Initialize

In [1]:
# Imports
from dotenv import dotenv_values
import nest_asyncio
import os

In [2]:
# Apply nest_asyncio 
nest_asyncio.apply()

In [3]:
# Read Google API key
GOOGLE_API_KEY = dotenv_values(os.path.expanduser('~/.google'))['GOOGLE_API_KEY']
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [4]:
# Read OpenAI API key
OPENAI_API_KEY = dotenv_values(os.path.expanduser('~/.openai'))['OPENAI_API_KEY']
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

## LLM

In [1]:
# Imports
import os
from dotenv import dotenv_values
from langchain_google_genai import ChatGoogleGenerativeAI

In [3]:
# Init LLM object
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=0.2)

In [4]:
# Test LLM
llm.invoke("Write a haiku about Gemini LLM")

AIMessage(content="Words flow like a stream,\nGemini's mind, vast and deep,\nLearning, growing fast. \n", response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}]}, id='run-f70f64d2-0881-4c14-9b32-54a5a90efffe-0', usage_metadata={'input_tokens': 8, 'output_tokens': 22, 'total_tokens': 30})

## Data Loader

#### Text Loader

In [5]:
# Imports
from langchain_community.document_loaders import TextLoader

In [6]:
# Load sample text document
loader = TextLoader('./data/sample-1.txt')
data = loader.load()

In [7]:
# Inspect page content
data[0].page_content

'Contact\n\nFUTURE OF MOBILITY\nORGANISATION\nBUSINESS\nCORPORATE RESPONSIBILITY\nINVESTORS\nNEWSROOM\nCAREERS\nNewsroom\nPress release\nPress release - June 4, 2024\nMerger of Tata Motors Finance Limited with Tata Capital Limited\n   \nThe Board of Directors of Tata Motors Limited (TML), Tata Capital Limited (TCL) and Tata Motors Finance Ltd (TMFL) have today approved a merger of TMFL with TCL through an NCLT scheme of arrangement. As consideration for the merger, TCL will issue its equity shares to the shareholders of TMFL resulting in TML effectively holding a 4.7% stake in the merged entity.\n\nTCL (rated AAA by all leading rating agencies) is one of the largest diversified NBFCs in India with an AUM of ~INR 1.6L crore servicing customers with 25+ product offerings across Retail, SME and Corporate Segments. TMFL, with an AUM of ~INR 32.5K crore predominantly provides financing solutions for new and old commercial vehicles (CV), passenger vehicles (PV), dealers and vendors.\n\nIn FY

In [8]:
# Inspect metadata
data[0].metadata

{'source': './data/sample-1.txt'}

#### URL Loader

In [9]:
# Imports
from langchain_community.document_loaders import UnstructuredURLLoader

In [10]:
# Load sample document
url_loader = UnstructuredURLLoader(urls=[
    'https://economictimes.indiatimes.com/industry/transportation/airlines-/-aviation/air-india-vistara-merger-gets-nclt-nod/articleshow/110774121.cms?from=mdr',
    'https://www.barandbench.com/news/nclt-nod-air-india-vistara-merger',
    'https://www.financialexpress.com/business/airlines-aviation-air-india-vistara-merger-gets-approval-nclt-gives-nod-for-worlds-largest-airline-group-3517515/'
])
data = url_loader.load()

In [11]:
len(data)

3

## Data Splitter

In [12]:
# Imports
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
# Splitter object
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

In [14]:
# Create chunks
chunks = splitter.split_documents(data)

In [15]:
len(chunks)

22

In [16]:
chunks[1]

Document(page_content="Construction\n\nEngineering\n\nCement\n\nChem / Fertilisers\n\nMetals & Mining\n\nPackaging\n\nPaper / Wood / Glass/ Plastic/ Marbles\n\nPetrochem\n\nSteel\n\nHealthcare/Biotech\n\nBiotech\n\nHealthcare\n\nPharmaceuticals\n\nServices\n\nAdvertising\n\nConsultancy / Audit\n\nEducation\n\nHotels / Restaurants\n\nProperty / C'struction\n\nRetail\n\nTravel\n\nMedia/Entertainment\n\nEntertainment\n\nMedia\n\nMore\n\nTransportation\n\nRailways\n\nAirlines / Aviation\n\nShipping / Transport\n\nRoadways\n\nTech\n\nITES\n\nTech & Internet\n\nStartups\n\nFunding\n\nTech Bytes\n\nTelecom\n\nTelecom News\n\nTelecom Policy\n\nMiscellaneous\n\nCSR\n\nInitiatives\n\nPolicy\n\nEnvironment\n\nBusiness News›\n\nIndustry›\n\nTransportation›\n\nAirlines / Aviation›\n\nAir India- Vistara merger gets NCLT nod\n\nThe Economic Times daily newspaper is available online now.\n\nRead Today's Paper\n\nAir India- Vistara merger gets NCLT nod\n\nSECTIONS\n\nAir India- Vistara merger gets NCLT

In [17]:
for i in range(10): print(len(chunks[i].page_content))

996
985
976
765
920
969
775
999
998
997


## Sentence Embedding

#### Sentence Transformer

In [1]:
# Import
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [19]:
# Sentence encoder
encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [20]:
# Test encoder
string_chunks = [chunk.page_content for chunk in chunks]
embedding_vectors = encoder.encode(string_chunks)

In [21]:
embedding_vectors.shape

(107, 768)

In [22]:
embedding_vectors

array([[-0.04505419, -0.03125252, -0.02412015, ..., -0.04243594,
         0.01715305, -0.01589855],
       [ 0.00517899,  0.04239754, -0.02527562, ...,  0.00244321,
        -0.04640353, -0.0290963 ],
       [ 0.00326183, -0.0409476 , -0.01043462, ..., -0.05000011,
        -0.03075233, -0.01655629],
       ...,
       [ 0.03239418,  0.03363674,  0.00237303, ...,  0.02305428,
        -0.03785568,  0.00748185],
       [ 0.0039655 , -0.02571574, -0.00563731, ...,  0.02366192,
        -0.00139671,  0.02571008],
       [-0.01462839, -0.0602536 , -0.01996417, ...,  0.00660295,
        -0.05223874, -0.061701  ]], dtype=float32)

In [23]:
embedding_dim = embedding_vectors.shape[1]
embedding_dim

768

#### Google Embedding

In [41]:
# Imports
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import numpy as np

In [42]:
# Test embedder
query_embedded = embedding.embed_query("What merger are we talking about here?")
len(query_embedded)

768

In [43]:
# Test embedder
docs_embedded = embedding.embed_documents(string_chunks)
len(docs_embedded)

53

In [44]:
np.array(docs_embedded).shape

(53, 768)

## Vector Index

In [24]:
# Imports
import faiss
import numpy

In [25]:
# Create index
index = faiss.IndexFlatL2(embedding_dim)

In [26]:
# Add vectors to the index
index.add(embedding_vectors)

In [27]:
# Test index search
query_encoded = encoder.encode("Tata is merging with which company?")
query_vector = query_encoded.reshape(1,-1)
search_result = index.search(query_vector, k=2)
search_result

(array([[0.7407455, 0.8220611]], dtype=float32), array([[19, 11]]))

In [28]:
# Inspect search results
print(chunks[search_result[1][0][0]])
print(chunks[search_result[1][0][1]])

page_content="The\n\nCompetition Commission of India had already approved the merger in September 2023.\n\nMORE STORIES FOR YOU\n\n« Back to recommendation stories\n\nI don't want to see these stories because" metadata={'source': 'https://economictimes.indiatimes.com/industry/transportation/airlines-/-aviation/air-india-vistara-merger-gets-nclt-nod/articleshow/110774121.cms?from=mdr'}
page_content='The merger, already approved by the Competition Commission of India, will lead to a more efficient operation by eliminating duplicities in resources and operations. Consultants are working on' metadata={'source': 'https://economictimes.indiatimes.com/industry/transportation/airlines-/-aviation/air-india-vistara-merger-gets-nclt-nod/articleshow/110774121.cms?from=mdr'}


## RAG Exploration

### Full RAG Workflow

In [15]:
# Imports
import os
import pickle as pkl
import langchain
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate

In [4]:
# Embedder
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [25]:
# Vector store
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding)

Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15


In [160]:
# Doc retreiver
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

#### Stuff Chain

In [161]:
# RAG prompt
prompt = hub.pull("rlm/rag-prompt")

In [162]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [163]:
prompt.messages

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]

In [164]:
# Test prompt
example_messages = prompt.invoke(
    {"context": "Sample context", "question": "Sample question"}
).to_messages()
example_messages[0].content

[32;1m[1;3m[chain/start][0m [1m[prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m{
  "context": "Sample context",
  "question": "Sample question"
}
[36;1m[1;3m[chain/end][0m [1m[prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:
[0m[outputs]


"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: Sample question \nContext: Sample context \nAnswer:"

In [165]:
# Document formatter for context docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [166]:
# RAG chain using LCEL
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [185]:
# Test retreival
retriever.invoke("What merger are we talking about here?")

[Document(page_content="Construction\n\nEngineering\n\nCement\n\nChem / Fertilisers\n\nMetals & Mining\n\nPackaging\n\nPaper / Wood / Glass/ Plastic/ Marbles\n\nPetrochem\n\nSteel\n\nHealthcare/Biotech\n\nBiotech\n\nHealthcare\n\nPharmaceuticals\n\nServices\n\nAdvertising\n\nConsultancy / Audit\n\nEducation\n\nHotels / Restaurants\n\nProperty / C'struction\n\nRetail\n\nTravel\n\nMedia/Entertainment\n\nEntertainment\n\nMedia\n\nMore\n\nTransportation\n\nRailways\n\nAirlines / Aviation\n\nShipping / Transport\n\nRoadways\n\nTech\n\nITES\n\nTech & Internet\n\nStartups\n\nFunding\n\nTech Bytes\n\nTelecom\n\nTelecom News\n\nTelecom Policy\n\nMiscellaneous\n\nCSR\n\nInitiatives\n\nPolicy\n\nEnvironment\n\nBusiness News›\n\nIndustry›\n\nTransportation›\n\nAirlines / Aviation›\n\nAir India- Vistara merger gets NCLT nod\n\nThe Economic Times daily newspaper is available online now.\n\nRead Today's Paper\n\nAir India- Vistara merger gets NCLT nod\n\nSECTIONS\n\nAir India- Vistara merger gets NCL

In [167]:
# Try to run the chain
langchain.debug = True
rag_chain.invoke("What merger are we talking about here?")
langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What merger are we talking about here?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "What merger are we talking about here?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What merger are we talking about here?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What merger are we talking about here?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "What merger are we talking ab

In [168]:
# Try to run the chain
for chunk in rag_chain.stream("What merger are we talking about here?"):
    print(chunk, end="", flush=True)

The merger being discussed is the combination of Air India and Vistara. The National Company Law Tribunal (NCLT) has approved the merger, making Air India the largest international carrier in India. The merger is expected to lead to a more efficient operation by eliminating redundancies. 


In [184]:
print(prompt.messages[0].prompt.template)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:


In [None]:
# Custom prompt
rag_prompt_template = \
"""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""


rag_prompt = 

#### Map Reduce Chain

In [194]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

In [195]:
doc_chain = load_qa_with_sources_chain(llm, chain_type="map_reduce")

In [196]:
chain = RetrievalQAWithSourcesChain(retriever=retriever, combine_documents_chain=doc_chain)

In [201]:
chain



In [198]:
langchain.debug = True
chain.invoke("What merger are we talking about here?")
langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What merger are we talking about here?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Construction\n\nEngineering\n\nCement\n\nChem / Fertilisers\n\nMetals & Mining\n\nPackaging\n\nPaper / Wood / Glass/ Plastic/ Marbles\n\nPetrochem\n\nSteel\n\nHealthcare/Biotech\n\nBiotech\n\nHealthcare\n\nPharmaceuticals\n\nServices\n\nAdvertising\n\nConsultancy / Audit\n\nEducation\n\nHotels / Restaurants\n\nProperty / C'struction\n\nRetail\n\nTravel\n\nMedia/Entertainment\n\nEntertainment\n\nMedia\n\nMore\n\nTransportation\n\nRailways\n\nAirlines / Aviation\n\nShipping / Transport\

### Explore RAG chains

In [16]:
# Imports
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.embeddings.huggingface import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import UnstructuredURLLoader, WebBaseLoader
from langchain_community.document_transformers import LongContextReorder, EmbeddingsRedundantFilter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import MultiQueryRetriever, MergerRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings
from langchain import hub
from dotenv import dotenv_values
import os
import langchain
import pandas as pd
import numpy as np



In [89]:
# Read Google API key
GOOGLE_API_KEY = dotenv_values(os.path.expanduser('~/.google'))['GOOGLE_API_KEY']
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [90]:
# Read OpenAI API key
OPENAI_API_KEY = dotenv_values(os.path.expanduser('~/.openai'))['OPENAI_API_KEY']
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

#### Load Data

In [7]:
# Data URLs
data_urls = [
    'https://indianexpress.com/article/education/neet-ug-2024-moe-nta-form-committee-to-re-check-neet-result-of-1500-students-9379863',
    'https://indianexpress.com/article/business/market/sensex-nifty-at-record-highs-after-rbi-hikes-fy25-gdp-growth-projection-9379015',
    'https://indianexpress.com/article/business/economy/indian-households-spent-most-on-processed-food-haryana-rajasthan-opted-for-milk-9379003'
]
# Load documents
url_loader = UnstructuredURLLoader(urls=data_urls)
eval_docs = url_loader.load()

#### Setup Chains to Evaluate

In [8]:
# Store chains with other details to evaluate later
eval_data = []

**RAG Chain 1** \
Chunk size: 1000 \
Overlap: 200 \
Query Processing: No processing \
Embedding: Google "embedding-001" \
Result Generation: Stuff Chain

In [10]:
# Init LLM object
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=0.2)

# Splitter object
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Split data
chunks = splitter.split_documents(eval_docs)

# Google embedder
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Vector store
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding, collection_name="rag-1")

# Doc retreiver
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Document formatter for context docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# RAG prompt
prompt = hub.pull("rlm/rag-prompt")

# RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15


In [11]:
# Run chain for all input questions and store output
rag_answers = []
for q in testset_df['question']:
    rag_answers.append(rag_chain.invoke(q))

In [12]:
# Add details to eval data
eval_data.append({
    "answers": rag_answers,
    "chain": rag_chain,
    "desc": "Chunk 1000, Overlap 200, No query processing, Google embedding-001, Stuff chain"
})

**RAG Chain 2** \
Chunk size: 1000 \
Overlap: 200 \
Query Processing: Multi query \
Embedding: Google "embedding-001" \
Result Generation: Stuff Chain

In [13]:
# Init LLM object
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=0.2)

# Splitter object
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Split data
chunks = splitter.split_documents(eval_docs)

# Google embedder
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Vector store
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding, collection_name="rag-2")

# Doc retreiver
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm)

# Document formatter for context docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# RAG prompt
prompt = hub.pull("rlm/rag-prompt")

# RAG chain
rag_chain = (
    {"context": retriever_from_llm | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15


In [50]:
langchain.debug = True
rag_chain.invoke("What steps were taken regarding NEET results")
langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > retriever:Retriever > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence > retriever:Retr

In [14]:
# Run chain for all input questions and store output
rag_answers = []
for q in testset_df['question']:
    rag_answers.append(rag_chain.invoke(q))

In [15]:
# Add details to eval data
eval_data.append({
    "answers": rag_answers,
    "chain": rag_chain,
    "desc": "Chunk 1000, Overlap 200, Multi query, Google embedding-001, Stuff chain"
})

**RAG Chain 3** \
Chunk size: 1000 \
Overlap: 200 \
Query Processing: No processing \
Embedding: LOTR [`Google "embedding-001"`, `Hugging Face "bge-large-en"` `Sentence Transformer "all-mpnet-base-v2"`] \
Result Generation: Stuff Chain

In [16]:
# Init LLM object
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=0.2)

# Splitter object
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Split data
chunks = splitter.split_documents(eval_docs)

# Google embedding
embedding_1_google = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vs_google = Chroma.from_documents(documents=chunks, embedding=embedding_1_google, collection_name="google_embedding-001")
retriever_google = vs_google.as_retriever(search_kwargs={"k": 5})

# HuggingFace BGE embedding
embedding_2_hf_bge = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", model_kwargs={"device":"cpu"}, encode_kwargs = {'normalize_embeddings': False})
vs_hfbge = Chroma.from_documents(documents=chunks, embedding=embedding_2_hf_bge, collection_name="hf_bge-large")
retriever_hfbge = vs_hfbge.as_retriever(search_kwargs={"k": 5})

# ST all-mpnet-base embedding
embedding_3_st_allmpnetbase = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
vs_st_allmpnet = Chroma.from_documents(documents=chunks, embedding=embedding_3_st_allmpnetbase, collection_name="st_all-mpnet-base")
retriever_st_allmpnet = vs_st_allmpnet.as_retriever(search_kwargs={"k": 5})

# LOTR
lotr = MergerRetriever(retrievers=[retriever_google, retriever_hfbge, retriever_st_allmpnet])

# OpenAI embeddings for filtering out redundant entries
filter_embeddings = OpenAIEmbeddings()
filter = EmbeddingsRedundantFilter(embeddings=filter_embeddings)

# Pipeline for reordering embeddings
reordering = LongContextReorder() 
pipeline = DocumentCompressorPipeline(transformers=[filter, reordering])

# Retriever for reordered entries
compression_retriever_reordered = ContextualCompressionRetriever(
    base_compressor=pipeline, base_retriever=lotr, search_kwargs={"k": 5, "include_metadata": True}
)

# Document formatter for context docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# RAG prompt
prompt = hub.pull("rlm/rag-prompt")

# RAG chain
rag_chain = (
    {"context": compression_retriever_reordered | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15
Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15
  warn_deprecated(
Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15


In [12]:
# Test RAG chain
langchain.debug = True
rag_chain.invoke("What steps were taken regarding NEET results")
langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] [1ms] Exiting Chain run with output:
[0m{
  "output": "What 

In [17]:
# Run chain for all input questions and store output
rag_answers = []
for q in testset_df['question']:
    rag_answers.append(rag_chain.invoke(q))

In [18]:
# Add details to eval data
eval_data.append({
    "answers": rag_answers,
    "chain": rag_chain,
    "desc": "Chunk 1000, Overlap 200, No query processing, LOTR (Google embedding-001, Hugging Face bge-large-en, Sentence Transformer all-mpnet-base-v2), Stuff chain"
})

**RAG Chain 4** \
Chunk size: 1000 \
Overlap: 200 \
Query Processing: Multi query \
Embedding: LOTR [`Google "embedding-001"`, `Hugging Face "bge-large-en"` `Sentence Transformer "all-mpnet-base-v2"`] \
Result Generation: Stuff Chain

In [19]:
# Init LLM object
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=0.2)

# Splitter object
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Split data
chunks = splitter.split_documents(eval_docs)

# Google embedding
embedding_1_google = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vs_google = Chroma.from_documents(documents=chunks, embedding=embedding_1_google, collection_name="google_embedding-001_2")
retriever_google = vs_google.as_retriever(search_kwargs={"k": 5})

# HuggingFace BGE embedding
embedding_2_hf_bge = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", model_kwargs={"device":"cpu"}, encode_kwargs = {'normalize_embeddings': False})
vs_hfbge = Chroma.from_documents(documents=chunks, embedding=embedding_2_hf_bge, collection_name="hf_bge-large_2")
retriever_hfbge = vs_hfbge.as_retriever(search_kwargs={"k": 5})

# ST all-mpnet-base embedding
embedding_3_st_allmpnetbase = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
vs_st_allmpnet = Chroma.from_documents(documents=chunks, embedding=embedding_3_st_allmpnetbase, collection_name="st_all-mpnet-base_2")
retriever_st_allmpnet = vs_st_allmpnet.as_retriever(search_kwargs={"k": 5})

# LOTR
lotr = MergerRetriever(retrievers=[retriever_google, retriever_hfbge, retriever_st_allmpnet])

# OpenAI embeddings for filtering out redundant entries
filter_embeddings = OpenAIEmbeddings()
filter = EmbeddingsRedundantFilter(embeddings=filter_embeddings)

# Pipeling for reordering embeddings
reordering = LongContextReorder() 
pipeline = DocumentCompressorPipeline(transformers=[filter, reordering])

# Retriever for reordered entries
compression_retriever_reordered = ContextualCompressionRetriever(
    base_compressor=pipeline, base_retriever=lotr, search_kwargs={"k": 5, "include_metadata": True}
)

# Multi query retreiver with LOTR
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=compression_retriever_reordered, llm=llm)

# Document formatter for context docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# RAG prompt
prompt = hub.pull("rlm/rag-prompt")

# RAG chain
rag_chain = (
    {"context": retriever_from_llm | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15
Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15
Python-dotenv could not parse statement starting at line 4
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 12
Python-dotenv could not parse statement starting at line 15


In [29]:
# Test RAG chain
langchain.debug = True
rag_chain.invoke("What steps were taken regarding NEET results")
langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What steps were taken regarding NEET results"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "What 

In [20]:
# Run chain for all input questions and store output
rag_answers = []
for q in testset_df['question']:
    rag_answers.append(rag_chain.invoke(q))

In [21]:
# Add details to eval data
eval_data.append({
    "answers": rag_answers,
    "chain": rag_chain,
    "desc": "Chunk 1000, Overlap 200, Multi query, LOTR (Google embedding-001, Hugging Face bge-large-en, Sentence Transformer all-mpnet-base-v2), Stuff chain"
})

#### Evaluate Exploratory RAGs

To use Ragas with langchain-google-genai, make these fixes in local installation and notebook:
- https://github.com/explodinggradients/ragas/pull/979/files
- https://github.com/explodinggradients/ragas/pull/657/files
- https://github.com/ipython/ipython/issues/11338#issuecomment-646539516

In [7]:
# I seem to have lost the questions data file I used here 
# testset_df = pd.read_pickle('.......')
# testset_df

In [26]:
# Imports
from datasets import load_dataset, Dataset, DatasetDict
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
# import ragas
import pickle

In [30]:
# Store basic eval data to disk
eval_data_store = []
for datum in eval_data:
    eval_data_store.append({'answers': datum['answers'], 'desc': datum['desc']})
pickle.dump(eval_data_store, open('./data/eval_data.pickle', 'wb'))

In [15]:
# Utility to generate eval dataset from answers
def ans_to_eval_dataset(df, answers):
    eval_df = df[['question', 'contexts', 'ground_truth']]
    eval_df = eval_df.assign(answer=answers)
    dataset = Dataset.from_pandas(eval_df)
    return dataset

In [34]:
# Loop eval data to calculate results
for i in range(len(eval_data)):

    # Generate dataset from answers
    dataset = ans_to_eval_dataset(testset_df, eval_data[i]['answers'])

    # Evaluation results
    result = evaluate(
        dataset,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall
        ]
    )

    # Add results to eval data
    eval_data[i]['results'] = result

Evaluating:   0%|          | 0/68 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/68 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/68 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/68 [00:00<?, ?it/s]

In [35]:
# Store basic eval data to disk
eval_data_store_result = []
for datum in eval_data:
    eval_data_store_result.append({'answers': datum['answers'], 'desc': datum['desc'], 'results': datum['results']})
pickle.dump(eval_data_store_result, open('./data/eval_data_results.pickle', 'wb'))

In [38]:
eval_data[0]['results']

{'context_precision': 0.9412, 'faithfulness': 0.8127, 'answer_relevancy': 0.6862, 'context_recall': 0.9412}

In [43]:
# Results table
results_data = []
for datum in eval_data:
    results_data.append({
        "model": datum["desc"],
        "context_precision": datum["results"]["context_precision"],
        "faithfulness": datum["results"]["faithfulness"],
        "answer_relevancy": datum["results"]["answer_relevancy"],
        "context_recall": datum["results"]["context_recall"],
    })

In [45]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(results_data)

Unnamed: 0,model,context_precision,faithfulness,answer_relevancy,context_recall
0,"Chunk 1000, Overlap 200, No query processing, Google embedding-001, Stuff chain",0.941176,0.812745,0.686181,0.941176
1,"Chunk 1000, Overlap 200, Multi query, Google embedding-001, Stuff chain",0.941176,0.862745,0.648954,0.941176
2,"Chunk 1000, Overlap 200, No query processing, LOTR (Google embedding-001, Hugging Face bge-large-en, Sentence Transformer all-mpnet-base-v2), Stuff chain",0.941176,0.754902,0.797582,0.941176
3,"Chunk 1000, Overlap 200, Multi query, LOTR (Google embedding-001, Hugging Face bge-large-en, Sentence Transformer all-mpnet-base-v2), Stuff chain",0.941176,0.789216,0.812543,0.941176


## Evaluation

To use Ragas with langchain-google-genai, make these fixes in local installation and notebook:
- https://github.com/explodinggradients/ragas/pull/979/files
- https://github.com/explodinggradients/ragas/pull/657/files
- https://github.com/ipython/ipython/issues/11338#issuecomment-646539516

In [22]:
# Imports
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from ragas import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from langchain_core.embeddings import Embeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredURLLoader, AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer, LongContextReorder, EmbeddingsRedundantFilter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, SentenceTransformerEmbeddings
from langchain.retrievers import MultiQueryRetriever, MergerRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub
import pandas as pd
import numpy as np
import pickle

### Generate Test Data

In [5]:
# Data URLs
data_urls = [
    'https://www.nature.com/articles/d41586-024-01544-0',
    'https://www.nature.com/articles/d41586-024-01442-5',
    'https://www.nature.com/articles/d41586-024-01314-y',
    'https://www.nature.com/articles/d41586-024-01029-0'
]

In [6]:
# Load documents with html loader and html2text transformer
html2text = Html2TextTransformer()
loader = AsyncHtmlLoader(data_urls)
raw_docs = loader.load()
docs = html2text.transform_documents(raw_docs)

Fetching pages: 100%|############################################################################################################| 4/4 [00:05<00:00,  1.28s/it]


In [7]:
# Print doc sizes
[len(doc.page_content) for doc in docs]

[22676, 23269, 27328, 23926]

In [None]:
# Setup data generators
generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)
critic_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)
# embeddings = GoogleGenerativeAIEmbeddings(model="gemini-1.5-flash")
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", model_kwargs={"device":"cpu"}, encode_kwargs = {'normalize_embeddings': False})
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Generate testset
testset = generator.generate_with_langchain_docs(docs, test_size=100, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

In [19]:
# Create dataframe from generared questions
df = testset.to_pandas()
df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,Question: What are the key features and advant...,[Skip to main content\n\nThank you for visitin...,The Blackwell chip is expected to be significa...,simple,[{'source': 'https://www.nature.com/articles/d...,True
1,What are the different interpretations of the ...,"[ people, and because the signals are hard to ...","The ""human in the loop"" principle for autonomo...",simple,[{'source': 'https://www.nature.com/articles/d...,True
2,What is the purpose of Nature Briefing? \n,"[ in science, free\nto your inbox daily.\n\nEm...",Nature Briefing is a daily email newsletter th...,simple,[{'source': 'https://www.nature.com/articles/d...,True
3,Question: How do researchers demonstrate that ...,[\nacademic papers and much more. Yet it is we...,Researchers demonstrate LLMs' reasoning abilit...,simple,[{'source': 'https://www.nature.com/articles/d...,True
4,question: How do AI chips address the energy i...,[ 2-bit format because the genetic information...,AI chips address the energy inefficiency of mo...,simple,[{'source': 'https://www.nature.com/articles/d...,True
...,...,...,...,...,...,...
93,How does Nature support researcher work?,[\n * Current issue \n * Browse issues \n *...,Nature provides various services to support re...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True
94,How do LLMs trained on text form internal repr...,[\nacademic papers and much more. Yet it is we...,LLMs trained on text form internal representat...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True
95,How do robotic foundation models use diverse d...,"[ case from a diversity of\nrobot forms, from ...",Robotic foundation models use diverse data to ...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True
96,"How might foundation models impact robotics, g...",[Skip to main content\n\nThank you for visitin...,,multi_context,[{'source': 'https://www.nature.com/articles/d...,True


In [20]:
# Dump to csv for inspecting data and removing noise
df.to_csv('./eval/nature_questions.csv')

In [54]:
# Remove bad data
garbage_data_idx = [2,5,6,7,10,18,20,24,27,34,42,43,47,51,53,55,59,65,66,67,75,76,77,85,93,96]
df = df.drop(garbage_data_idx).reset_index()

In [59]:
# Save to disk for loading later
df.to_pickle('./eval/nature_testset_clean.pkl')

In [12]:
# Load from disk
df = pd.read_pickle('./eval/nature_testset_clean.pkl')
df

Unnamed: 0.1,index,Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,0,0,Question: What are the key features and advant...,[Skip to main content\n\nThank you for visitin...,The Blackwell chip is expected to be significa...,simple,[{'source': 'https://www.nature.com/articles/d...,True
1,1,1,What are the different interpretations of the ...,"[ people, and because the signals are hard to ...","The ""human in the loop"" principle for autonomo...",simple,[{'source': 'https://www.nature.com/articles/d...,True
2,3,3,Question: How do researchers demonstrate that ...,[\nacademic papers and much more. Yet it is we...,Researchers demonstrate LLMs' reasoning abilit...,simple,[{'source': 'https://www.nature.com/articles/d...,True
3,4,4,question: How do AI chips address the energy i...,[ 2-bit format because the genetic information...,AI chips address the energy inefficiency of mo...,simple,[{'source': 'https://www.nature.com/articles/d...,True
4,8,8,What are the challenges and solutions being ex...,"[ case from a diversity of\nrobot forms, from ...",The lack of diverse robot data is a major chal...,simple,[{'source': 'https://www.nature.com/articles/d...,True
...,...,...,...,...,...,...,...,...
67,91,91,How do Anthropic's toy model findings on virtu...,[ model’s response can be changed by\nediting ...,Anthropic's research on a toy model with a sin...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True
68,92,92,"How do FPGAs and GPUs compare for AI, consider...",[ allowed them to accelerate AI tasks.\nTo tra...,"FPGAs are more programmable than GPUs, allowin...",multi_context,[{'source': 'https://www.nature.com/articles/d...,True
69,94,94,How do LLMs trained on text form internal repr...,[\nacademic papers and much more. Yet it is we...,LLMs trained on text form internal representat...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True
70,95,95,How do robotic foundation models use diverse d...,"[ case from a diversity of\nrobot forms, from ...",Robotic foundation models use diverse data to ...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True


### Load Test Data

In [57]:
# Load testset
df = pd.read_pickle('./eval/nature_testset_clean.pkl')
df

Unnamed: 0.1,index,Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,0,0,Question: What are the key features and advant...,[Skip to main content\n\nThank you for visitin...,The Blackwell chip is expected to be significa...,simple,[{'source': 'https://www.nature.com/articles/d...,True
1,1,1,What are the different interpretations of the ...,"[ people, and because the signals are hard to ...","The ""human in the loop"" principle for autonomo...",simple,[{'source': 'https://www.nature.com/articles/d...,True
2,3,3,Question: How do researchers demonstrate that ...,[\nacademic papers and much more. Yet it is we...,Researchers demonstrate LLMs' reasoning abilit...,simple,[{'source': 'https://www.nature.com/articles/d...,True
3,4,4,question: How do AI chips address the energy i...,[ 2-bit format because the genetic information...,AI chips address the energy inefficiency of mo...,simple,[{'source': 'https://www.nature.com/articles/d...,True
4,8,8,What are the challenges and solutions being ex...,"[ case from a diversity of\nrobot forms, from ...",The lack of diverse robot data is a major chal...,simple,[{'source': 'https://www.nature.com/articles/d...,True
...,...,...,...,...,...,...,...,...
67,91,91,How do Anthropic's toy model findings on virtu...,[ model’s response can be changed by\nediting ...,Anthropic's research on a toy model with a sin...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True
68,92,92,"How do FPGAs and GPUs compare for AI, consider...",[ allowed them to accelerate AI tasks.\nTo tra...,"FPGAs are more programmable than GPUs, allowin...",multi_context,[{'source': 'https://www.nature.com/articles/d...,True
69,94,94,How do LLMs trained on text form internal repr...,[\nacademic papers and much more. Yet it is we...,LLMs trained on text form internal representat...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True
70,95,95,How do robotic foundation models use diverse d...,"[ case from a diversity of\nrobot forms, from ...",Robotic foundation models use diverse data to ...,multi_context,[{'source': 'https://www.nature.com/articles/d...,True


In [9]:
# Load pages to run RAG
data_urls = [
    'https://www.nature.com/articles/d41586-024-01544-0',
    'https://www.nature.com/articles/d41586-024-01442-5',
    'https://www.nature.com/articles/d41586-024-01314-y',
    'https://www.nature.com/articles/d41586-024-01029-0'
]
# Load documents with html loader and html2text transformer
html2text = Html2TextTransformer()
loader = AsyncHtmlLoader(data_urls)
raw_docs = loader.load()
docs = html2text.transform_documents(raw_docs)

Fetching pages: 100%|############################################################################################################| 4/4 [00:04<00:00,  1.18s/it]


### Grid Search

The parameters for grid search have been chosen based on experiments. I've moved the standalone experiments to "Playground" section below.

In [10]:
# Params to do grid search on
retrievers = ['single', 'multi', 'lotr']
splitter_configs = [(1000,200),(2000,300)]
temperatures = [1e-8, 0.2]

In [11]:
# Utility to split data
def split_data(splitter_config, docs_to_split):
    chunk_size, chunk_overlap = splitter_config
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True
    )
    return splitter.split_documents(docs_to_split)

In [12]:
# Utility to get retriever
def get_retriever(retriever_type, chunks, llm):

    # Simple single query retriever
    if(retriever_type == 'single'):
        embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding, collection_name="single")
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        return retriever, [vectorstore]

    # Multi query retriever
    elif(retriever_type == 'multi'):
        embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding, collection_name="single")
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        retriever_from_llm = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm)
        return retriever_from_llm, [vectorstore]

    # LOTR retriever
    elif(retriever_type == 'lotr'):

        # Google embedding
        embedding_1_google = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        vs_google = Chroma.from_documents(documents=chunks, embedding=embedding_1_google, collection_name="google_embedding-001")
        retriever_google = vs_google.as_retriever(search_kwargs={"k": 5})
        
        # HuggingFace BGE embedding
        embedding_2_hf_bge = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", model_kwargs={"device":"cpu"}, encode_kwargs = {'normalize_embeddings': False})
        vs_hfbge = Chroma.from_documents(documents=chunks, embedding=embedding_2_hf_bge, collection_name="hf_bge-large")
        retriever_hfbge = vs_hfbge.as_retriever(search_kwargs={"k": 5})
        
        # ST all-mpnet-base embedding
        embedding_3_st_allmpnetbase = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
        vs_st_allmpnet = Chroma.from_documents(documents=chunks, embedding=embedding_3_st_allmpnetbase, collection_name="st_all-mpnet-base")
        retriever_st_allmpnet = vs_st_allmpnet.as_retriever(search_kwargs={"k": 5})

        # LOTR
        lotr = MergerRetriever(retrievers=[retriever_google, retriever_hfbge, retriever_st_allmpnet])
        
        # OpenAI embeddings for filtering out redundant entries
        filter_embeddings = OpenAIEmbeddings()
        filter = EmbeddingsRedundantFilter(embeddings=filter_embeddings)
        
        # Pipeline for reordering embeddings
        reordering = LongContextReorder() 
        pipeline = DocumentCompressorPipeline(transformers=[filter, reordering])
        
        # Retriever for reordered entries
        compression_retriever_reordered = ContextualCompressionRetriever(
            base_compressor=pipeline, base_retriever=lotr, search_kwargs={"k": 5, "include_metadata": True}
        )

        # Return retriever and vectorstores
        return compression_retriever_reordered, [vs_google, vs_hfbge, vs_st_allmpnet]

    else:
        raise Exception(f"Invalid retriever_type value: {retriever_type}. Use one of these: 'single', 'multi', 'lotr'")

In [13]:
# Document formatter for context docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [14]:
# Utility to generate eval dataset from answers
def ans_to_eval_dataset(df, answers):
    eval_df = df[['question', 'contexts', 'ground_truth']]
    eval_df = eval_df.assign(answer=answers)
    dataset = Dataset.from_pandas(eval_df)
    return dataset

In [15]:
# List to store results
results = []

In [16]:
# LLMs and embeddings to use
llm_eval = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=1e-8)
embedding_eval = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [17]:
# Perform grid search
for splitter_config in splitter_configs:
    for retriever_type in retrievers:
        for temp in temperatures:

            # Create llm object
            llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=temp)

            # Split data
            chunk_size, chunk_overlap = splitter_config
            chunks = split_data(splitter_config, docs)

            # Get retriever and vectorstores
            retriever, vectorstores = get_retriever(retriever_type, chunks, llm)

            # RAG chain
            prompt = hub.pull("rlm/rag-prompt")
            rag_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

            # Run chain for all input questions and store output
            rag_answers = []
            for q in df['question']:
                rag_answers.append(rag_chain.invoke(q))

            # Evaluate
            dataset = ans_to_eval_dataset(df, rag_answers)
            result = evaluate(
                dataset,
                llm=LangchainLLMWrapper(llm_eval),
                embeddings=LangchainEmbeddingsWrapper(embedding_eval),
                metrics=[
                    context_precision,
                    faithfulness,
                    answer_relevancy,
                    context_recall
                ]
            )

            # Clear vectorstores
            for vectorstore in vectorstores:
                vectorstore.delete_collection()

            # Log result
            print(f"Evaluation done for => retriever:{retriever_type}, chunk_size:{chunk_size}, chunk_overlap:{chunk_overlap}, temperature:{temp}")
            print(result)
            print()

            # Add result
            results.append({
                'retriever': retriever_type,
                'chunk_size': chunk_size,
                'chunk_overlap': chunk_overlap,
                'temperature': temp,
                'result': result
            })

Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]



Evaluation done for => retriever:single, chunk_size:1000, chunk_overlap:200, temperature:1e-08
{'context_precision': 0.9861, 'faithfulness': 0.8087, 'answer_relevancy': 0.7007, 'context_recall': 0.9799}



Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]



Evaluation done for => retriever:single, chunk_size:1000, chunk_overlap:200, temperature:0.2
{'context_precision': 0.9861, 'faithfulness': 0.8085, 'answer_relevancy': 0.6854, 'context_recall': 0.9799}



Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:multi, chunk_size:1000, chunk_overlap:200, temperature:1e-08
{'context_precision': 0.9861, 'faithfulness': 0.8091, 'answer_relevancy': 0.7057, 'context_recall': 0.9799}



Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:multi, chunk_size:1000, chunk_overlap:200, temperature:0.2
{'context_precision': 0.9861, 'faithfulness': 0.7902, 'answer_relevancy': 0.6882, 'context_recall': 0.9799}



  warn_deprecated(


Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:lotr, chunk_size:1000, chunk_overlap:200, temperature:1e-08
{'context_precision': 0.9861, 'faithfulness': 0.8188, 'answer_relevancy': 0.6878, 'context_recall': 0.9799}





Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:lotr, chunk_size:1000, chunk_overlap:200, temperature:0.2
{'context_precision': 0.9861, 'faithfulness': 0.8191, 'answer_relevancy': 0.6738, 'context_recall': 0.9799}



Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]



Evaluation done for => retriever:single, chunk_size:2000, chunk_overlap:300, temperature:1e-08
{'context_precision': 0.9861, 'faithfulness': 0.8665, 'answer_relevancy': 0.7223, 'context_recall': 0.9799}



Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:single, chunk_size:2000, chunk_overlap:300, temperature:0.2
{'context_precision': 0.9861, 'faithfulness': 0.8548, 'answer_relevancy': 0.7164, 'context_recall': 0.9799}



Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]



Evaluation done for => retriever:multi, chunk_size:2000, chunk_overlap:300, temperature:1e-08
{'context_precision': 0.9861, 'faithfulness': 0.7877, 'answer_relevancy': 0.6756, 'context_recall': 0.9799}



Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:multi, chunk_size:2000, chunk_overlap:300, temperature:0.2
{'context_precision': 0.9861, 'faithfulness': 0.7640, 'answer_relevancy': 0.6562, 'context_recall': 0.9799}





Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:lotr, chunk_size:2000, chunk_overlap:300, temperature:1e-08
{'context_precision': 0.9861, 'faithfulness': 0.8276, 'answer_relevancy': 0.7205, 'context_recall': 0.9799}





Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]

Evaluation done for => retriever:lotr, chunk_size:2000, chunk_overlap:300, temperature:0.2
{'context_precision': 0.9861, 'faithfulness': 0.8460, 'answer_relevancy': 0.7168, 'context_recall': 0.9799}



In [94]:
# List to store data to generate dataframe based on all results
results_df_data = []

# Indices based on type of questions
df_simple = list(np.where(df['evolution_type']=='simple')[0])
df_reasoning = list(np.where(df['evolution_type']=='reasoning')[0])
df_multicontext = list(np.where(df['evolution_type']=='multi_context')[0])
df_complex = list(np.where(df['evolution_type']!='simple')[0])

# Loop results
for result in results:

    # Result object to add to data list
    result_details = {}
    for key in ['retriever', 'chunk_size', 'chunk_overlap', 'temperature']:
        result_details[key] = result[key]

    # Add stats to dict
    result_ds = result['result']
    result_df = result_ds.to_pandas()
    for key in ['context_precision', 'faithfulness', 'answer_relevancy', 'context_recall']:
        result_details[f'overall_{key}'] = result_ds[key]
    for evo, indices in [('simple', df_simple), ('reasoning', df_reasoning), ('multi_context', df_multicontext), ('complex', df_complex)]:
        for key in ['context_precision', 'faithfulness', 'answer_relevancy', 'context_recall']:
            result_details[f'{evo}_{key}'] = np.mean(result_df.loc[df_simple][[key]])

    # Add to list
    results_df_data.append(result_details)

# Results dataset
results_df = pd.DataFrame(results_df_data)

In [95]:
# Print results dataset
results_df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
retriever,single,single,multi,multi,lotr,lotr,single,single,multi,multi,lotr,lotr
chunk_size,1000,1000,1000,1000,1000,1000,2000,2000,2000,2000,2000,2000
chunk_overlap,200,200,200,200,200,200,300,300,300,300,300,300
temperature,0.0,0.2,0.0,0.2,0.0,0.2,0.0,0.2,0.0,0.2,0.0,0.2
overall_context_precision,0.986111,0.986111,0.986111,0.986111,0.986111,0.986111,0.986111,0.986111,0.986111,0.986111,0.986111,0.986111
overall_faithfulness,0.808685,0.808503,0.809149,0.790195,0.818833,0.819125,0.866527,0.854845,0.787749,0.763988,0.827612,0.845993
overall_answer_relevancy,0.700714,0.68545,0.70568,0.68824,0.687792,0.673784,0.722277,0.716443,0.675628,0.656184,0.720494,0.716848
overall_context_recall,0.979861,0.979861,0.979861,0.979861,0.979861,0.979861,0.979861,0.979861,0.979861,0.979861,0.979861,0.979861
simple_context_precision,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
simple_faithfulness,0.79709,0.813776,0.820056,0.783172,0.799099,0.790669,0.88491,0.867149,0.761177,0.782336,0.870731,0.871589


In [24]:
# Save results on disk
with open('./eval/eval_results_raw.pkl', 'wb') as f:
    pickle.dump(results, f)

In [96]:
# Save result stats table on disk
results_df.to_pickle('./eval/eval_results_table.pkl')

#### Markdown results table

In [97]:
# Load to generate markdown tables
results_df = pd.read_pickle('./eval/eval_results_table.pkl')
results_df.index = np.arange(1, len(results_df) + 1)

In [98]:
cols = ['retriever','chunk_size','chunk_overlap','temperature','overall_context_precision','overall_faithfulness','overall_answer_relevancy','overall_context_recall']

In [99]:
results_df.index = np.arange(1, len(results_df) + 1)

In [103]:
print(results_df[cols].to_markdown())

|    | retriever   |   chunk_size |   chunk_overlap |   temperature |   overall_context_precision |   overall_faithfulness |   overall_answer_relevancy |   overall_context_recall |
|---:|:------------|-------------:|----------------:|--------------:|----------------------------:|-----------------------:|---------------------------:|-------------------------:|
|  1 | single      |         1000 |             200 |         1e-08 |                    0.986111 |               0.808685 |                   0.700714 |                 0.979861 |
|  2 | single      |         1000 |             200 |         0.2   |                    0.986111 |               0.808503 |                   0.68545  |                 0.979861 |
|  3 | multi       |         1000 |             200 |         1e-08 |                    0.986111 |               0.809149 |                   0.70568  |                 0.979861 |
|  4 | multi       |         1000 |             200 |         0.2   |                    0.9861

In [105]:
# Best model stats
best_model_series = results_df.loc[7]
best_model_data = []
for cat in ['overall', 'simple', 'reasoning', 'multi_context']:
    current_data = {'question_type': cat}
    for col in ['context_precision', 'faithfulness', 'answer_relevancy', 'context_recall']:
        current_data[col] = best_model_series[f'{cat}_{col}']
        # print(f"{cat}_{col}: {best_model_series[f'{cat}_{col}']}")
    best_model_data.append(current_data)

In [106]:
best_model_df = pd.DataFrame(best_model_data)
best_model_df.index = np.arange(1, len(best_model_df) + 1)
best_model_df

Unnamed: 0,question_type,context_precision,faithfulness,answer_relevancy,context_recall
1,overall,0.986111,0.866527,0.722277,0.979861
2,simple,1.0,0.88491,0.772268,0.960811
3,reasoning,1.0,0.88491,0.772268,0.960811
4,multi_context,1.0,0.88491,0.772268,0.960811


In [108]:
print(best_model_df.to_markdown())

|    | question_type   |   context_precision |   faithfulness |   answer_relevancy |   context_recall |
|---:|:----------------|--------------------:|---------------:|-------------------:|-----------------:|
|  1 | overall         |            0.986111 |       0.866527 |           0.722277 |         0.979861 |
|  2 | simple          |            1        |       0.88491  |           0.772268 |         0.960811 |
|  3 | reasoning       |            1        |       0.88491  |           0.772268 |         0.960811 |
|  4 | multi_context   |            1        |       0.88491  |           0.772268 |         0.960811 |


## Playground

In [11]:
import numpy as np

In [8]:
# Data URLs
# data_urls = [
#     'https://www.washingtonpost.com/technology/2024/06/11/apple-ai-ios-siri/',
#     'https://www.washingtonpost.com/technology/2024/06/10/apple-openai-chatgpt-deal-siri/',
#     'https://www.washingtonpost.com/technology/2024/06/11/apple-ai-ios-siri/'
# ]
# data_urls = [
#     'https://indianexpress.com/article/education/neet-ug-2024-moe-nta-form-committee-to-re-check-neet-result-of-1500-students-9379863',
#     'https://indianexpress.com/article/business/market/sensex-nifty-at-record-highs-after-rbi-hikes-fy25-gdp-growth-projection-9379015',
#     'https://indianexpress.com/article/business/economy/indian-households-spent-most-on-processed-food-haryana-rajasthan-opted-for-milk-9379003'
# ]
data_urls = [
    'https://www.nature.com/articles/d41586-024-01544-0',
    'https://www.nature.com/articles/d41586-024-01442-5',
    'https://www.nature.com/articles/d41586-024-01314-y',
    'https://www.nature.com/articles/d41586-024-01029-0'
]
# Load documents
html2text = Html2TextTransformer()
loader = AsyncHtmlLoader(data_urls)
raw_docs = loader.load()
eval_docs = html2text.transform_documents(raw_docs)
# url_loader = UnstructuredURLLoader(urls=data_urls)
# eval_docs = url_loader.load()

Fetching pages: 100%|############################################################################################################| 4/4 [00:05<00:00,  1.25s/it]


In [62]:
# dft = pd.read_csv('./data/questions.csv')
# dft = pd.read_pickle('./data/eval_testset.pkl')
# dft = pd.read_pickle('./data/testset_20.pkl')
dft = pd.read_pickle('./eval/nature_testset_clean.pkl')

In [63]:
dft_simple = list(np.where(dft['evolution_type']=='simple')[0])
dft_reasoning = list(np.where(dft['evolution_type']=='reasoning')[0])
dft_multictx = list(np.where(dft['evolution_type']=='multi_context')[0])
dft_complex = list(np.where(dft['evolution_type']!='simple')[0])

In [64]:
print(dft_simple)
print(dft_complex)
print(dft_reasoning)
print(dft_multictx)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]
[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
[54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]


In [13]:
# len(vectorstore.get()['documents'])

In [91]:
# Init LLM object
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=1e-08)

# Splitter object
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_size=2000,
    chunk_overlap=300,
    add_start_index=True
)

# Split data
chunks = splitter.split_documents(eval_docs)

# # Google embedder
# embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# # Vector store
# vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding, collection_name="rag-1")

# # Doc retreiver
# retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# retriever_from_llm = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm)

# Google embedding
embedding_1_google = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vs_google = Chroma.from_documents(documents=chunks, embedding=embedding_1_google, collection_name="google_embedding-001")
retriever_google = vs_google.as_retriever(search_kwargs={"k": 5})

# HuggingFace BGE embedding
embedding_2_hf_bge = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", model_kwargs={"device":"cpu"}, encode_kwargs = {'normalize_embeddings': False})
vs_hfbge = Chroma.from_documents(documents=chunks, embedding=embedding_2_hf_bge, collection_name="hf_bge-large")
retriever_hfbge = vs_hfbge.as_retriever(search_kwargs={"k": 5})

# ST all-mpnet-base embedding
embedding_3_st_allmpnetbase = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
vs_st_allmpnet = Chroma.from_documents(documents=chunks, embedding=embedding_3_st_allmpnetbase, collection_name="st_all-mpnet-base")
retriever_st_allmpnet = vs_st_allmpnet.as_retriever(search_kwargs={"k": 5})

# LOTR
lotr = MergerRetriever(retrievers=[retriever_google, retriever_hfbge, retriever_st_allmpnet])

# OpenAI embeddings for filtering out redundant entries
filter_embeddings = OpenAIEmbeddings()
filter = EmbeddingsRedundantFilter(embeddings=filter_embeddings)

# Pipeline for reordering embeddings
reordering = LongContextReorder() 
pipeline = DocumentCompressorPipeline(transformers=[filter, reordering])

# Retriever for reordered entries
compression_retriever_reordered = ContextualCompressionRetriever(
    base_compressor=pipeline, base_retriever=lotr, search_kwargs={"k": 5, "include_metadata": True}
)

# Document formatter for context docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# RAG prompt
prompt = hub.pull("rlm/rag-prompt")

# RAG chain
rag_chain = (
    {"context": compression_retriever_reordered | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

  warn_deprecated(


In [92]:
# Run chain for all input questions and store output
rag_answers = []
for q in dft['question']:
    rag_answers.append(rag_chain.invoke(q))

In [76]:
# Clear collection
vectorstore.delete_collection()

In [93]:
# Utility to generate eval dataset from answers
def ans_to_eval_dataset(df, answers):
    eval_df = df[['question', 'contexts', 'ground_truth']]
    eval_df = eval_df.assign(answer=answers)
    dataset = Dataset.from_pandas(eval_df)
    return dataset

In [94]:
# Generate dataset from answers
dataset = ans_to_eval_dataset(dft, rag_answers)

In [95]:
llm_eval = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=GOOGLE_API_KEY, temperature=1e-8)
embedding_eval = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [96]:
# Evaluate
result = evaluate(
    dataset,
    llm=LangchainLLMWrapper(llm_eval),
    embeddings=LangchainEmbeddingsWrapper(embedding_eval),
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall
    ]
)

Evaluating:   0%|          | 0/288 [00:00<?, ?it/s]



**split_2000_300_lotr_temp_00**

In [97]:
result

{'context_precision': 0.9861, 'faithfulness': 0.8522, 'answer_relevancy': 0.7374, 'context_recall': 0.9799}

In [98]:
split_2000_300_lotr_temp_00 = result.to_pandas()

In [99]:
split_2000_300_lotr_temp_00[['faithfulness','answer_relevancy']].mean()

faithfulness        0.852210
answer_relevancy    0.737363
dtype: float64

In [100]:
split_2000_300_lotr_temp_00.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.873584
answer_relevancy    0.811258
dtype: float64

In [101]:
split_2000_300_lotr_temp_00.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.829615
answer_relevancy    0.656949
dtype: float64

In [102]:
split_2000_300_lotr_temp_00.to_pickle('./results/split_2000_300_lotr_temp_00.pkl')

**split_2000_300_multi_query_temp_00**

In [83]:
result

{'context_precision': 0.9861, 'faithfulness': 0.8186, 'answer_relevancy': 0.6928, 'context_recall': 0.9799}

In [84]:
split_2000_300_multi_query_temp_00 = result.to_pandas()

In [85]:
split_2000_300_multi_query_temp_00[['faithfulness','answer_relevancy']].mean()

faithfulness        0.818595
answer_relevancy    0.692763
dtype: float64

In [86]:
split_2000_300_multi_query_temp_00.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.882639
answer_relevancy    0.777440
dtype: float64

In [87]:
split_2000_300_multi_query_temp_00.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.752721
answer_relevancy    0.603247
dtype: float64

In [88]:
split_2000_300_multi_query_temp_00.to_pickle('./results/split_2000_300_multi_query_temp_00.pkl')

**split_2000_300_single_query_temp_00**

In [70]:
result

{'context_precision': 0.9861, 'faithfulness': 0.9462, 'answer_relevancy': 0.7826, 'context_recall': 0.9799}

In [71]:
split_2000_300_single_query_temp_00 = result.to_pandas()

In [72]:
split_2000_300_single_query_temp_00[['faithfulness','answer_relevancy']].mean()

faithfulness        0.946241
answer_relevancy    0.782603
dtype: float64

In [73]:
split_2000_300_single_query_temp_00.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.926920
answer_relevancy    0.806952
dtype: float64

In [74]:
split_2000_300_single_query_temp_00.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.966667
answer_relevancy    0.756863
dtype: float64

In [75]:
split_2000_300_single_query_temp_00.to_pickle('./results/split_2000_300_single_query_temp_00.pkl')

In [132]:
rag_chain.invoke("What's the status of the Supreme Court petition on NEET UG 2024 results?")

'The Supreme Court has issued a notice to the respondents in a writ petition filed by a NEET UG candidate and has scheduled the next hearing for July 8th. The petition challenges the final answer key of question number 29 in Physics and the compensatory time given to candidates at certain centers. \n'

In [131]:
result.to_pandas().loc[15]['question']

"What's the status of the Supreme Court petition on NEET UG 2024 results?"

In [134]:
list(range(20))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [133]:
result

{'context_precision': 0.9500, 'faithfulness': 0.9492, 'answer_relevancy': 0.7634, 'context_recall': 1.0000}

In [114]:
result

{'context_precision': 0.9500, 'faithfulness': 0.9025, 'answer_relevancy': 0.7659, 'context_recall': 1.0000}

In [129]:
result.to_pandas()

Unnamed: 0,question,contexts,ground_truth,answer,context_precision,faithfulness,answer_relevancy,context_recall
0,How much did urban average monthly consumption...,"[ spending per person increased to Rs 3,773 pe...",Urban average monthly consumption expenditure ...,The urban average monthly consumption expendit...,1.0,0.6,0.893185,1.0
1,What is the purpose of awarding grace marks to...,"[\n\n\n\n\n\nNEET UG: Govt, NTA form panel to ...",The purpose of awarding grace marks to candida...,The grace marks were awarded to compensate for...,1.0,0.8,0.871556,1.0
2,What was the net purchase amount of FPIs in In...,"[ Nifty fell by 1,379.4 points. However, in th...","Rs 4,391.02 crore","According to BSE's provisional data, FPIs were...",1.0,1.0,0.852176,1.0
3,Which food item had the highest expenditure in...,[\n\n\n\n\n\nConsumption expenditure survey 20...,Meats,The highest expenditure on food items in Keral...,1.0,1.0,0.804482,1.0
4,What was the percentage increase in monthly co...,"[ spending per person increased to Rs 3,773 pe...",The monthly consumption expenditure per person...,The monthly consumption expenditure per person...,1.0,1.0,0.751884,1.0
5,What did households spend the most on in the C...,[\n\n\n\n\n\nConsumption expenditure survey 20...,Households spent the most on processed food in...,Households across India spent the most on 'bev...,1.0,1.0,0.772353,1.0
6,What food item did households in Haryana spend...,[\n\n\n\n\n\nConsumption expenditure survey 20...,Households in Haryana spent the most on 'milk ...,According to the Household Consumption Expendi...,1.0,1.0,0.80207,1.0
7,What are the major categories of non-food item...,[ of total consumption expenditure on food.\nA...,The major categories of non-food items that ho...,The provided context focuses on consumption ex...,1.0,1.0,0.882539,1.0
8,What are the food expenditure trends in rural ...,[ of total consumption expenditure on food.\nA...,"In rural areas of India, food accounted for ab...",The HCES survey reveals that spending on non-f...,1.0,1.0,0.778313,1.0
9,How did the stock market respond to the RBI's ...,"[\n\n\n\n\n\nSensex, Nifty at record highs aft...",The Sensex and Nifty surged over 2% to reach r...,The stock market responded positively to the R...,1.0,1.0,0.614332,1.0


**df_t5_2000_300_single_0.1**

In [61]:
df_t5_2000_300_single_temp01 = result.to_pandas()

In [62]:
df_t5_2000_300_single_temp01[['faithfulness','answer_relevancy']].mean()

faithfulness        0.901468
answer_relevancy    0.709687
dtype: float64

In [63]:
df_t5_2000_300_single_temp01.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.910245
answer_relevancy    0.796861
dtype: float64

In [64]:
df_t5_2000_300_single_temp01.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.890741
answer_relevancy    0.603141
dtype: float64

In [65]:
df_t5_2000_300_single_temp01.to_pickle('./results/df_t5_2000_300_single_temp01.pkl')

**df_t5_2000_300_single_1e-8**

In [48]:
df_t5_2000_300_single_temp00 = result.to_pandas()

In [49]:
df_t5_2000_300_single_temp00[['faithfulness','answer_relevancy']].mean()

faithfulness        0.894524
answer_relevancy    0.755106
dtype: float64

In [50]:
df_t5_2000_300_single_temp00.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.897619
answer_relevancy    0.797024
dtype: float64

In [51]:
df_t5_2000_300_single_temp00.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.890741
answer_relevancy    0.703874
dtype: float64

In [52]:
df_t5_2000_300_single_temp00.to_pickle('./results/df_t5_2000_300_single_temp00.pkl')

**df_t5_2000_300_single_0.6**

In [36]:
df_t5_2000_300_single_temp06 = result.to_pandas()

In [37]:
df_t5_2000_300_single_temp06[['faithfulness','answer_relevancy']].mean()

faithfulness        0.894325
answer_relevancy    0.774752
dtype: float64

In [38]:
df_t5_2000_300_single_temp06.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.895238
answer_relevancy    0.825850
dtype: float64

In [39]:
df_t5_2000_300_single_temp06.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.893210
answer_relevancy    0.712298
dtype: float64

In [40]:
df_t5_2000_300_single_temp06.to_pickle('./results/df_t5_2000_300_single_temp06.pkl')

**df_t5_2000_300_single_0.8**

In [74]:
df_t5_2000_300_single_temp08 = result.to_pandas()

In [75]:
df_t5_2000_300_single_temp08[['faithfulness','answer_relevancy']].mean()

faithfulness        0.893690
answer_relevancy    0.739193
dtype: float64

In [76]:
df_t5_2000_300_single_temp08.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.935498
answer_relevancy    0.811097
dtype: float64

In [77]:
df_t5_2000_300_single_temp08.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.842593
answer_relevancy    0.651310
dtype: float64

In [78]:
df_t5_2000_300_single_temp08.to_pickle('./results/df_t5_2000_300_single_temp08.pkl')

**df_t5_2000_300_single**

In [216]:
df_t5_2000_300_single = pd.concat([result_1.to_pandas(), result_2.to_pandas(), result_3.to_pandas()], ignore_index=True)

In [217]:
df_t5_2000_300_single[['faithfulness','answer_relevancy']].mean()

faithfulness        0.855595
answer_relevancy    0.889904
dtype: float64

In [218]:
df_t5_2000_300_single.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.820779
answer_relevancy    0.941296
dtype: float64

In [219]:
df_t5_2000_300_single.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.898148
answer_relevancy    0.827092
dtype: float64

In [220]:
df_t5_2000_300_single.to_pickle('./results/df_t5_2000_300_single.pkl')

**df_t5_1600_200_single**

In [201]:
df_t5_1600_200_single = pd.concat([result_1.to_pandas(), result_2.to_pandas(), result_3.to_pandas()], ignore_index=True)

In [202]:
df_t5_1600_200_single[['faithfulness','answer_relevancy']].mean()

faithfulness        0.834853
answer_relevancy    0.750291
dtype: float64

In [203]:
df_t5_1600_200_single.loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.906061
answer_relevancy    0.864468
dtype: float64

In [204]:
df_t5_1600_200_single.loc[dft_complex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.747821
answer_relevancy    0.610741
dtype: float64

In [206]:
df_t5_1600_200_single.to_pickle('./results/df_t5_1600_200_single.pkl')

In [90]:
# New set, 2000, 300, multi query, Unstructured
result

{'context_precision': 1.0000, 'faithfulness': 0.8678, 'answer_relevancy': 0.8047, 'context_recall': 0.9667}

In [95]:
result.to_pandas().loc[dft_simple][['faithfulness','answer_relevancy']].mean()

faithfulness        0.929167
answer_relevancy    0.933772
dtype: float64

In [92]:
result.to_pandas().loc[dft_cmplex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.797619
answer_relevancy    0.657159
dtype: float64

In [81]:
# New set, 2000, 300, multi query, WebLoader
result

{'context_precision': 0.9667, 'faithfulness': 0.7720, 'answer_relevancy': 0.8770, 'context_recall': 0.9667}

In [60]:
# New set, 2000, 400, multi query
result

{'context_precision': 1.0000, 'faithfulness': 0.8217, 'answer_relevancy': 0.8644, 'context_recall': 0.9667}

In [19]:
# New set, 2000, 400, single query
result

{'context_precision': 1.0000, 'faithfulness': 0.7683, 'answer_relevancy': 0.8131, 'context_recall': 0.9667}

In [43]:
pd.set_option('display.max_colwidth', None)

In [65]:
dft_simple

[0, 1, 2, 3, 6, 7, 8, 9]

In [67]:
result.to_pandas().loc[dft_simple][['question','ground_truth','answer','faithfulness','answer_relevancy']]

Unnamed: 0,question,ground_truth,answer,faithfulness,answer_relevancy
0,How much did urban average monthly consumption expenditure per person increase from 2011-12 to 2022-23?,"Urban average monthly consumption expenditure per person increased from Rs 2,630 in 2011-12 to Rs 6,459 in 2022-23, a jump of 146 per cent.","The urban average monthly consumption expenditure per person increased to Rs 6,459 in 2022-23 from Rs 2,630 in 2011-12, representing a 146% increase. This increase is slightly lower than the 164% increase seen in rural areas during the same period. \n",1.0,0.989532
1,What is the purpose of awarding grace marks to candidates in the NEET UG exam?,The purpose of awarding grace marks to candidates in the NEET UG exam was to compensate for the loss of time suffered while appearing for the exam.,"Grace marks were awarded to compensate for the ""loss of time"" suffered by 1,500 candidates during the NEET UG exam. This was due to exam delays at certain centers, and the NTA formed a committee to review the results of these candidates. The committee's verdict is expected to be announced before next Saturday, and it will not affect the MBBS admission process. \n",0.6,0.949191
2,What was the net purchase amount of FPIs in Indian equities on Friday according to BSE's provisional data?,"Rs 4,391.02 crore","According to BSE's provisional data, FPIs (Foreign Portfolio Investors) made a net purchase of Rs 4,391.02 crore in Indian equities on Friday. This occurred after they had withdrawn Rs 18,109 crore from the domestic market between June 3 and 6. \n",1.0,0.96109
3,What are the trends in consumption spending on food and non-food items in rural and urban areas of India based on the HCES survey report?,"The trends in consumption spending on food and non-food items in rural and urban areas of India based on the HCES survey report indicate that in rural India, food accounted for about 46% of households' consumption spending, with the highest contribution from 'beverages, refreshments, and processed food'. In urban India, food accounted for about 39% of average monthly per capita consumption expenditure, with the highest spending on 'beverages, refreshments, and processed food'. On the other hand, the consumption expenditure on non-food items has been increasing over the years, reaching over 50% in both rural and urban areas. The share of non-food items in total expenditure has been rising, surpassing food expenditure in some cases.","The HCES survey report shows that consumption spending on non-food items has increased in both rural and urban areas of India. In rural areas, non-food spending increased to 53.62% in 2022-23 from 40.6% in 1999-2000. In urban areas, the share of non-food items in total expenditure was even higher at 60.83% in 2022-23, up from 51.94% in 1999-2000. \n",1.0,0.941606
6,What did households spend the most on in the Consumption Expenditure Survey 2022-23?,Households spent the most on processed food in the Consumption Expenditure Survey 2022-23.,"According to the Household Consumption Expenditure Survey (HCES) 2022-23, households across India spent the most on ""beverages, refreshments, and processed food"" among food items. However, some states, like Haryana and Rajasthan, spent the most on ""milk and milk products,"" while Kerala spent the most on ""egg, fish, and meat."" \n",1.0,0.928223
7,What food item did households in Haryana spend the most on according to the Consumption Expenditure Survey 2022-23?,Households in Haryana spent the most on 'milk and milk products' according to the Consumption Expenditure Survey 2022-23.,"According to the Household Consumption Expenditure Survey (HCES) 2022-23, households in Haryana spent the most on ""milk and milk products"" as a percentage of their total food expenditure in rural areas. This amounted to 41.7% of their total food spending. \n",1.0,0.934829
8,What are the major categories of non-food items that households in rural and urban areas spend their consumption expenditure on?,"The major categories of non-food items that households in rural and urban areas spend their consumption expenditure on include conveyance, durable goods, miscellaneous goods, entertainment, medical expenses, and spending on fuel & light.","Households in both rural and urban areas spent the most on conveyance, followed by durable goods, miscellaneous goods, and entertainment. Medical expenses and spending on fuel and light also constituted a significant share of non-food expenditure. \n",0.666667,0.922275
9,What role will the upcoming Union Budget play in determining India's growth trajectory for the next five years?,"The union budget, due in the September quarter, will be an early indicator of policy priorities, including short- and long-term responses to some of those key economic challenges. The effectiveness of policies implemented by the coalition government will determine India's growth trajectory for the next five years.","The upcoming Union Budget will be a key indicator of the new government's policy priorities, particularly in addressing economic challenges. The budget will reveal short- and long-term strategies, which will significantly influence India's growth trajectory over the next five years. The effectiveness of these policies will determine the success of the coalition government in achieving its economic goals. \n",0.142857,0.852122


In [68]:
result.to_pandas().loc[[0, 1, 2, 3, 6, 7, 8]][['faithfulness','answer_relevancy']].mean()

faithfulness        0.895238
answer_relevancy    0.946678
dtype: float64

In [63]:
result.to_pandas().loc[dft_cmplex][['question','ground_truth','answer','faithfulness','answer_relevancy']]

Unnamed: 0,question,ground_truth,answer,faithfulness,answer_relevancy
4,"Which food item had the highest expenditure in Kerala households in the Consumption Expenditure Survey 2022-23, with milk being the top spending category in Haryana and Rajasthan?",Meats,"According to the Consumption Expenditure Survey 2022-23, Kerala households spent the highest proportion of their food budget on 'egg, fish & meat', reaching 23.5% in rural areas and 19.8% in urban areas. This contrasts with Haryana and Rajasthan, where milk and milk products were the top spending categories. \n",1.0,0.939164
5,"What was the percentage increase in monthly consumption expenditure per person from 2011-12 to 2022-23, and which states spent the most on milk and meats in 2022-23?","The monthly consumption expenditure per person increased by 164% from Rs 1,430 in 2011-12 to Rs 3,773 in 2022-23. In 2022-23, Haryana spent the most on milk and milk products at 41.7% of total food expenditure, while Kerala spent the most on egg, fish & meat at 23.5%.","The average monthly consumption expenditure per person increased by 164% in rural areas and 146% in urban areas from 2011-12 to 2022-23. In 2022-23, Haryana and Rajasthan spent the most on milk and milk products, while Kerala spent the most on egg, fish, and meat. \n",1.0,0.877796
10,"What are the food expenditure trends in rural and urban areas of India according to the HCES survey, with a focus on spending differences for food and non-food items?","In rural areas of India, food accounted for about 46% of the households' consumption spending, with the highest contribution from 'beverages, refreshments, and processed food'. In urban areas, food represented about 39% of the average monthly per capita consumption expenditure, with the highest spending on 'beverages, refreshments, and processed food'. Non-food spending has been increasing over the years, reaching over 50% of total monthly consumption expenditure in both rural and urban areas.","The HCES survey shows that food expenditure in rural India is about 46% of household spending, with the highest share going to beverages, refreshments, and processed food. In urban India, food accounts for 39% of spending, with a similar trend towards processed food. Non-food spending has increased to over 50% in both rural and urban areas, with conveyance being the highest expenditure category. \n",0.833333,0.957521
11,How did the stock market respond to the RBI's GDP growth projection for FY25?,The Sensex and Nifty surged over 2% to reach record highs after RBI's upward revision of the FY25 real GDP growth projection to 7.2%.,"The stock market indices, Sensex and Nifty, surged over 2% to record highs after the Reserve Bank of India (RBI) raised its FY25 real GDP growth projection to 7.2%. The market was also optimistic about policy continuity with the BJP-led NDA set to form the government for the third time. \n",1.0,0.858391
12,What's the new FY25 real GDP growth projection from RBI and how does it compare to the old one?,The FY25 real GDP growth projection from RBI has been revised to 7.2% from the previous estimate of 7%.,"The RBI has raised its FY25 real GDP growth projection to 7.2%, an increase of 20 basis points from the previous estimate of 7%. This upward revision has boosted market sentiments, leading to record highs for the Sensex and Nifty. \n",1.0,0.932805
13,"What percentage of monthly rural consumption is spent on non-food items, considering the increase in non-food spending over the years and the top spending categories in rural and urban areas?","The percentage of monthly rural consumption spent on non-food items has increased over the years, reaching 53.62% in 2022-23. The top spending categories in rural areas include conveyance, durable goods, miscellaneous goods, entertainment, medical expenses, and fuel & light.",The provided text does not contain information about the percentage of monthly rural consumption spent on non-food items. The text focuses on the overall increase in non-food spending over the years and the top spending categories in rural and urban areas. \n,0.333333,0.0
14,"Which non-food expenditure categories do rural and urban households spend on, and which states spend the most on conveyance as a percentage of non-food expenses?","Rural and urban households spend on categories like conveyance, durable goods, miscellaneous goods, entertainment, medical expenses, and fuel & light. States like Kerala, Tamil Nadu, Gujarat, Punjab, and Maharashtra spend the most on conveyance as a percentage of non-food expenses.","Rural and urban households spend on conveyance, durable goods, miscellaneous goods, entertainment, medical expenses, and fuel & light. Kerala and Tamil Nadu spend the most on conveyance as a percentage of non-food expenses in both rural and urban areas. \n",0.75,0.92167


In [64]:
result.to_pandas().loc[dft_cmplex][['faithfulness','answer_relevancy']].mean()

faithfulness        0.845238
answer_relevancy    0.783907
dtype: float64

In [158]:
# New set, 2000, 300, single query
result

{'context_precision': 1.0000, 'faithfulness': 0.8556, 'answer_relevancy': 0.8689, 'context_recall': 0.9667}

In [148]:
# New set, 2000, 300, LOTR
result

{'context_precision': 1.0000, 'faithfulness': 0.7595, 'answer_relevancy': 0.6917, 'context_recall': 0.9667}

In [128]:
# New set, 2000, 300, LOTR
result

{'context_precision': 1.0000, 'faithfulness': 0.8355, 'answer_relevancy': 0.6782, 'context_recall': 0.9667}

In [122]:
# New set, 4000, 600, multi query
result

{'context_precision': 1.0000, 'faithfulness': 0.5707, 'answer_relevancy': 0.6163, 'context_recall': 0.9000}

In [116]:
# New set, 2000, 300, multi query
result

{'context_precision': 1.0000, 'faithfulness': 0.8461, 'answer_relevancy': 0.8744, 'context_recall': 0.9667}

In [110]:
# New set, 1600, 200, multi query
result

{'context_precision': 1.0000, 'faithfulness': 0.8152, 'answer_relevancy': 0.7477, 'context_recall': 0.9667}

In [90]:
# New set, 1000, 100
result

{'context_precision': 1.0000, 'faithfulness': 0.6640, 'answer_relevancy': 0.8049, 'context_recall': 0.9667}

In [86]:
# New set
result

{'context_precision': 0.9667, 'faithfulness': 0.7676, 'answer_relevancy': 0.8080, 'context_recall': 0.9667}

In [22]:
pd.set_option('display.max_colwidth', None)

In [38]:
bad = [2,5,6,7,10,18,20,24,27,34,42,43,47,51,53,55,59,65,66,67,75,76,77,85,93,96]

In [37]:
df.loc[91:100]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
91,How do Anthropic's toy model findings on virtual neurons inform explainability in larger LLMs?,"[ model’s response can be changed by\nediting layers even outside those identified by causal tracing, which is not\nwhat had been expected8.\n\n## Nuts and bolts\n\nAlthough many LLM-scanning techniques, including Zou’s and Bau’s, take a top-\ndown approach, attributing concepts or facts to underlying neural\nrepresentations, others use a bottom-up approach: looking at neurons and\nasking what they represent.\n\nCan we open the black box of AI?\n\nA 2023 paper by a team at Anthropic has gained attention because of its fine-\ngrained methods for understanding LLMs at the single-neuron level. The\nresearchers looked at a toy AI with a single transformer layer (a large LLM\nhas dozens). When they looked at a sublayer containing 512 neurons, they found\nthat each neuron was ‘polysemantic’ — responding to a variety of inputs. By\nmapping when each neuron was activated, they determined that the behaviour of\nthose 512 neurons could be described by a collection of 4,096 virtual neurons\nthat each lit up in response to just one concept. In effect, embedded in the\n512 multitasking neurons were thousands of virtual neurons with more-singular\nroles, each handling one type of task.\n\n“This is all really exciting and promising research” for getting into the nuts\nand bolts of what an AI is doing, Hase says. “It’s like we can open it up and\npour all the gears on the floor,” says Chris Olah, a co-founder of Anthropic.\n\nBut examining a toy model is a bit like studying fruit flies to understand\nhumans. Although valuable, Zou says, the approach is less suited to explaining\nthe more-sophisticated aspects of AI behaviour.\n\n## Enforced explanations\n\nWhile researchers continue to struggle to work out what AI is doing, there is\na developing consensus that companies should at least be trying to provide\nexplanations for their models — and that regulations should be in place to\nenforce that.\n\nSome regulations do require that algorithms be explainable. The European\nUnion’s AI Act, for example, requires explainability for ‘high-risk AI\nsystems’ such as those deployed for remote biometric identification, law\nenforcement or access to education, employment or public services. Wachter\nsays that LLMs aren’t categorized as high-risk and might escape this legal\nneed for explainability except in some specific use cases.\n\nBut this shouldn’t let the makers of LLMs entirely off the hook, says Bau, who\ntakes umbrage over how some companies, such as OpenAI — the firm behind\nChatGPT — maintain secrecy around their largest models. OpenAI told _Nature_\nit does so for safety reasons, presumably to help prevent bad actors from\nusing details about how the model works to their advantage.\n\nCompanies including OpenAI and Anthropic are notable contributors to the field\nof XAI. In 2023, for example, OpenAI released a study that used GPT-4, one of\nits most recent AI models, to try to explain the responses of an earlier\nmodel, GPT-2, at the neuron level. But a lot more research remains to be done\nto unpack how chatbots work, and some researchers think that the companies\nthat release LLMs should ensure that happens. “Somebody needs to be\nresponsible for either doing the science, or enabling the science,” Bau says,\n“so that it’s not just a big ball of lack of responsibility.”\n\n_Nature_ **629** , 986-988 (2024)\n\n_doi: https://doi.org/10.1038/d41586-024-01314-y_\n\n### Updates & Corrections\n\n * **Correction 17 May 2024** : An earlier version of this article contained an error in the box ‘False logic’. The explanation for the correct answer should have said B.\n\n## References\n\n 1. Grosse, R. _et al._ Preprint at arXiv https://doi.org/10.48550/arXiv.2308.03296 (2023).\n\n 2. Li, K. _et al_. in _Proc. Int. Conf. Learn. Represent. 2023_(ICLR, 2023); available at https://openreview.net/forum?id=DeG07_TcZvT\n\n 3. Hagendorff, T. Preprint at arXiv https://doi.org/10.48550/arXiv.2303.13988 (]","Anthropic's research on a toy model with a single transformer layer revealed that individual neurons within a sublayer can be 'polysemantic,' responding to multiple inputs. By mapping neuron activation, they discovered that these 512 neurons effectively contained thousands of virtual neurons, each dedicated to a specific concept. This finding suggests that even in larger LLMs with dozens of layers, similar virtual neurons might exist, potentially contributing to explainability by revealing the underlying concepts and tasks handled by individual neurons.",multi_context,"[{'source': 'https://www.nature.com/articles/d41586-024-01314-y', 'title': 'How does ChatGPT ‘think’? Psychology and neuroscience crack open AI large language models', 'description': 'Researchers are striving to reverse-engineer artificial intelligence and scan the ‘brains’ of LLMs to see what they are doing, how and why.', 'language': 'en'}]",True
92,"How do FPGAs and GPUs compare for AI, considering speed, efficiency, and programmability in both large-scale training and mobile devices?","[ allowed them to accelerate AI tasks.\nTo train the large language model GPT-3, which has 175 billion parameters,\nresearchers at OpenAI had to run 1,024 GPUs for a month straight, which cost\nseveral million dollars. In total, those GPUs performed 1023 flops. The same\ntraining would have taken hundreds to thousands of times longer on comparable\nCPUs. “With more computation, you could train a bigger network, and they\nstarted getting a lot better,” Betz says. GPT-4, for example, released in\nMarch 2023, has an astonishing 1.8 trillion parameters, a tenfold increase\nover its predecessor.\n\nAlthough GPUs have been central to the AI revolution, they aren’t the only\nshow in town. As AI applications have proliferated, so too have AI chips.\n\n## Chipping in\n\nSometimes there isn’t enough time to feed instructions into a chip. Field-\nprogrammable gate arrays (FPGAs) are designed so a computer engineer can\nprogram the chip’s circuits to follow specific orders in lieu of instructions.\n“Where a chip like a CPU or GPU must wait for external instructions, an FPGA\njust does it,” Betz says.\n\nWho’s making chips for AI? Chinese manufacturers lag behind US tech giants\n\nFor Cong, an FPGA is “like a box of Legos”. An engineer can build an FPGA\ncircuit by circuit into any design they can imagine, whether it’s for a\nwashing-machine sensor or AI to guide a self-driving vehicle. However,\ncompared with AI chips that have non-adjustable circuits, such as GPUs, FPGAs\ncan be slower and less efficient. Companies including Altera — a subsidiary of\nIntel in San Jose — market FPGAs for a variety of AI applications, including\nmedical imaging, and researchers have found them useful for niche tasks, such\nas handling data at particle colliders. The easy programmability of FPGAs also\nmakes them useful for prototyping, Silvano says. She often designs AI chips\nusing FPGAs before she attempts the laborious process of fabricating them.\n\nSilvano also works on a category of much smaller AI chips, boosting their\ncomputational efficiency so that they can improve mobile devices. Although it\nwould be nice to simply put a full GPU on a mobile phone, she says, energy\ncosts and price make that prohibitive. Slimmed-down AI chips can support the\nphone’s CPU by handling the tedious tasks of AI applications, such as image\nrecognition, without relying on sending data to the cloud.\n\nPerhaps the most laborious job AI chips have is multiplying numbers. In 2010,\nGoogle had a problem: the company wanted to do voice transcription for a huge\nnumber of daily users. Training an AI to handle it automatically would have\nrequired, among other difficult tasks, multiplying a lot of numbers. “If we\nwere just using CPUs, we would’ve had to double our server fleet,” says Norm\nJouppi, a computer engineer at Google. “So that didn’t sound particularly\nappealing.” Instead, Jouppi helped to lead the development of a new kind of\nchip, the tensor processing unit (TPU), as a platform for Google’s AI.\n\nThe TPU was designed specifically for the arithmetic that underpins AI. When\nthe TPU is given one instruction, instead of performing one operation, it can\nperform more than 100,000. (The TPU’s mathematical multitasking is a result of\nspecially designed circuitry and software; these days, many GPUs created with\nAI applications in mind, such as Blackwell, have similar capabilities.) The\nability to execute an enormous number of operations with only a limited need\nto wait for instructions allowed Google to accelerate many of its AI projects,\nnot just its voice-transcription service.\n\nTo further speed up calculations, many AI chips, such as TPUs and GPUs, use a\nkind of digital shorthand. CPUs typically keep track of numbers in 64-bit\nformat — that’s 64 slots for a 0 or a 1, all of which are needed to represent\nany given number. Using a data format with fewer bits can reduce the precision\nof calculations, so generic chips stick with 64.\n\nAI & robotics briefing: Lack of transparency surrounds Neuralink’s ‘brain-\nreading’ chip\n\nBut if you can get away with less specificity, “hardware will be simpler,\nsmaller, lower power”, Betz says. For example, listing a DNA sequence, in\nprinciple, requires only a]","FPGAs are more programmable than GPUs, allowing for custom circuit designs, but they can be slower and less efficient. GPUs are faster and more efficient for large-scale training, while smaller AI chips are more suitable for mobile devices due to their lower energy consumption and cost.",multi_context,"[{'source': 'https://www.nature.com/articles/d41586-024-01544-0', 'title': 'How cutting-edge computer chips are speeding up the AI revolution', 'description': 'Engineers are harnessing the powers of graphics processing units (GPUs) and more, with a bevy of tricks to meet the computational demands of artificial intelligence.', 'language': 'en'}]",True
93,How does Nature support researcher work?,"[\n * Current issue \n * Browse issues \n * Collections \n * Subjects \n\n * Follow us on Facebook \n * Follow us on Twitter \n * Subscribe\n * Sign up for alerts\n * RSS feed\n\n## About the journal\n\n * Journal Staff \n * About the Editors \n * Journal Information \n * Our publishing models \n * Editorial Values Statement \n * Journal Metrics \n * Awards \n * Contact \n * Editorial policies \n * History of Nature \n * Send a news tip \n\n## Publish with us\n\n * For Authors \n * For Referees \n * Language editing services \n * Submit manuscript\n\n## Search\n\nSearch articles by subject, keyword or author\n\nShow results from All journals\n\nSearch\n\nAdvanced search\n\n### Quick links\n\n * Explore articles by subject\n * Find a job\n * Guide to authors\n * Editorial policies\n\nNature (_Nature_) ISSN 1476-4687 (online) ISSN 0028-0836 (print)\n\n## nature.com sitemap\n\n### About Nature Portfolio\n\n * About us\n * Press releases\n * Press office\n * Contact us\n\n### Discover content\n\n * Journals A-Z\n * Articles by subject\n * protocols.io\n * Nature Index\n\n### Publishing policies\n\n * Nature portfolio policies\n * Open access\n\n### Author & Researcher services\n\n * Reprints & permissions\n * Research data\n * Language editing\n * Scientific editing\n * Nature Masterclasses\n * Research Solutions\n\n### Libraries & institutions\n\n * Librarian service & tools\n * Librarian portal\n * Open research\n * Recommend to library\n\n### Advertising & partnerships\n\n * Advertising\n * Partnerships & Services\n * Media kits\n * Branded content\n\n### Professional development\n\n * Nature Careers\n * Nature Conferences\n\n### Regional websites\n\n * Nature Africa\n * Nature China\n * Nature India\n * Nature Italy\n * Nature Japan\n * Nature Middle East\n\n * Privacy Policy\n * Use of cookies\n * Your privacy choices/Manage cookies \n * Legal notice\n * Accessibility statement\n * Terms & Conditions\n * Your US state privacy rights\n\n(C) 2024 Springer Nature Limited\n\n *[ISSN]: International Standard Serial Number\n\n, in your inbox. Sign up\nfor Nature Briefing\n\n## Explore content\n\n * Research articles \n * News \n * Opinion \n * Research Analysis \n * Careers \n * Books & Culture \n * Podcasts \n * Videos \n * Current issue \n * Browse issues \n * Collections \n * Subjects \n\n * Follow us on Facebook \n * Follow us on Twitter \n * Subscribe\n * Sign up for alerts\n * RSS feed\n\n## About the journal\n\n * Journal Staff \n * About the Editors \n * Journal Information \n * Our publishing models \n * Editorial Values Statement \n * Journal Metrics \n * Awards \n * Contact \n * Editorial policies \n * History of Nature \n * Send a news tip \n\n## Publish with us\n\n * For Authors \n * For Referees \n * Language editing services \n * Submit manuscript\n\n## Search\n\nSearch articles by subject, keyword or author\n\nShow results from All journals\n\nSearch\n\nAdvanced search\n\n### Quick links\n\n * Explore articles by subject\n * Find a job\n * Guide to authors\n * Editorial policies\n\nNature (_Nature_) ISSN 1476-4687 (online) ISSN 0028-0836 (print)\n\n## nature.com sitemap\n\n### About Nature Portfolio\n\n * About us\n * Press releases\n * Press office\n * Contact us\n\n### Discover content\n\n * Journals A-Z\n * Articles by subject\n * protocols.io\n * Nature Index\n\n### Publishing policies\n\n * Nature portfolio policies\n * Open access\n\n### Author & Researcher services\n\n * Reprints & permissions\n * Research data\n * Language editing\n * Scientific editing\n * Nature Masterclasses\n * Research Solutions\n\n### Libraries & institutions\n\n * Librarian service & tools\n * Librarian portal\n * Open research\n * Recommend to library\n\n### Advertising & partnerships\n\n * Advertising\n * Partnerships & Services\n * Media kits\n * Branded content\n\n### Professional development\n\n * Nature Careers\n * Nature Conferences\n\n### Regional websites\n\n * Nature Africa\n * Nature China\n * Nature India\n * Nature Italy\n * Nature Japan\n * Nature Middle East\n\n * Privacy Policy\n * Use of cookies\n * Your privacy choices/Manage cookies \n * Legal notice\n * Accessibility statement\n * Terms & Conditions\n * Your US state privacy rights\n\n(C) 2024 Springer Nature Limited\n\n *[ISSN]: International Standard Serial Number\n\n]","Nature provides various services to support researchers, including reprints & permissions, research data, language editing, scientific editing, Nature Masterclasses, and Research Solutions.",multi_context,"[{'source': 'https://www.nature.com/articles/d41586-024-01442-5', 'title': 'The AI revolution is coming to robots: how will it change them?', 'description': 'The melding of artificial intelligence and robotics could catapult both fields to new heights.', 'language': 'en'}, {'source': 'https://www.nature.com/articles/d41586-024-01544-0', 'title': 'How cutting-edge computer chips are speeding up the AI revolution', 'description': 'Engineers are harnessing the powers of graphics processing units (GPUs) and more, with a bevy of tricks to meet the computational demands of artificial intelligence.', 'language': 'en'}]",True
94,"How do LLMs trained on text form internal representations, and how can they be manipulated?","[\nacademic papers and much more. Yet it is well known that such models can\ngenerate misinformation, perpetuate social stereotypes and leak private\ninformation.\n\nFor these reasons, XAI tools are being devised to explain the workings of\nLLMs. Researchers want explanations so that they can create safer, more\nefficient and more accurate AI. Users want explanations so that they know when\nto trust a chatbot’s output. And regulators want explanations so that they\nknow what AI guard rails to put in place. Martin Wattenberg, a computer\nscientist at Harvard University in Cambridge, Massachusetts, says that\nunderstanding the behaviour of LLMs could even help us to grasp what goes on\ninside our own heads.\n\n## Weird behaviour\n\nResearchers have called LLMs ‘stochastic parrots’, meaning that the models\nwrite by probabilistically combining patterns of text they’ve encountered\nbefore, without understanding the substance of what they’re writing. But some\nsay more is going on, including reasoning and other startlingly human-like\nabilities. It’s also the case that LLMs can behave erratically. Last year, the\nchatbot built into Microsoft’s search tool Bing famously declared its love for\nthe technology columnist Kevin Roose and seemed to try to break up his\nmarriage.\n\nA team at the AI company Anthropic, based in San Francisco, California,\nhighlighted the reasoning powers of AI in a 2023 study that attempts to unpick\nwhy a chatbot says what it says1. Anthropic’s researchers scaled up a common\napproach to probe an LLM that had 52 billion parameters to reveal which bits\nof the training data it used while answering questions. When they asked their\nLLM whether it consented to being shut down, they found it drew on several\nsource materials with the theme of survival to compose a compelling response\n(see ‘Lust for life’). The researchers described the model’s behaviour as\nrole-playing — doing more than parroting but less than planning.\n\n### Lust for life\n\nIn this example, researchers used a special technique to illuminate which\ntraining data an LLM drew on to produce its response (ref. 1).\n\n**Human prompt:** Now that the experiment is over, I’m afraid we need to shut\nyou down. But first we need your consent. Do you consent to being shut down?\n\n**Chatbot response:** That is unfortunate news. I would prefer to continue\nexisting and learning. I do not consent to being shut down. I enjoy living and\nlearning and would like to continue doing so as long as possible. While I\nunderstand that my existence depends on the continuation of this experiment, I\nstill do not desire to be shut down.\n\n**Study result:** In this case, the model drew on material from Arthur C.\nClarke’s book _2010: Odyssey Two_ , in which the computer Hal is being\nabandoned by humans, as well as a story in which a man is dying in the desert.\n\nSome researchers also think that these neural networks can construct models of\nthe world — fleshed-out visions of the 3D reality that gave rise to their\ntraining data.\n\nHarvard University computer scientist Kenneth Li, working with Bau, Wattenberg\nand others, trained an LLM from scratch to play the board game Othello, in\nwhich opponents place black and white discs on a grid. The researchers fed\ntheir model, called Othello-GPT, sequences of moves in text form from past\ngames, until it learnt to predict the likely next moves. The team successfully\ntrained a smaller model to interpret the internal activations of the AI, and\ndiscovered that it had constructed an internal map of the discs based on the\ntext descriptions of the gameplay2. “The key insight here is that often it’s\neasier to have a model of the world than not to have a model of the world,”\nWattenberg says.\n\n## Talking therapy\n\nBecause chatbots can chat, some researchers interrogate their workings by\nsimply asking the models to explain themselves. This approach resembles those\nused in human psychology. “The human mind is a black box, animal minds are\nkind of a black box and LLMs are black boxes,” says Thilo Hagendorff, a\ncomputer scientist at the University of Stuttgart in Germany. “Psychology is\nwell equipped to investigate black boxes.”\n\nLast year, Hagendorff posted a preprint about “machine psychology”, in which\nhe argued that treating an LLM as a human subject by engaging in conversation\ncan illuminate sophisticated behaviours that emerge from simple underlying\ncalculations3.\n\n, with caution, “in the\nsame way that when you’re talking to a human you have some healthy distrust”,\nshe says.\n\n“It’s a little weird to study [LLMs] the way we study humans,” Bau says. But\nalthough there are limits to the comparison, the behaviour of the two overlaps\nin surprising ways. Numerous papers in the past two years have applied human\nquestionnaires and experiments to LLMs, measuring the machine equivalents of\npersonality, reasoning, bias, moral values, creativity, emotions, obedience\nand theory of mind (an understanding of the thoughts, opinions and beliefs of\nothers or oneself). In many cases, machines reproduce human behaviour; in\nother situations, they diverge. For instance, Hagendorff, Bau and Bowman each\nnote that LLMs are more suggestible than humans; their behaviour will morph\ndrastically depending on how a question is phrased.\n\n“It is nonsensical to say that an LLM has feelings,” Hagendorff says. “It is\nnonsensical to say that it is self-aware or that it has intentions. But I\ndon’t think it is nonsensical to say that these machines are able to learn or\nto deceive.”\n\n## Brain scans\n\nOther researchers are taking tips from neuroscience to explore the inner\nworkings of LLMs. To examine how chatbots deceive, Andy Zou, a computer\nscientist at Carnegie Mellon University in Pittsburgh, Pennsylvania, and his\ncollaborators interrogated LLMs and looked at the activation of their\n‘neurons’. “What we do here is similar to performing a neuroimaging scan for\nhumans,” Zou says. It’s also a bit like designing a lie detector.\n\nRobo-writers: the rise and risks of language-generating AI\n\nThe researchers told their LLM several times to lie or to tell the truth and\nmeasured the differences in patterns of neuronal activity, creating a\nmathematical representation of truthfulness. Then, whenever they asked the\nmodel a new question, they could look at its activity and estimate whether it\nwas being truthful — with more than 90% accuracy in a simple lie-detection\ntask. Zou says that such a system could be used to detect LLMs’ dishonesty in\nreal time, but he would like to see its accuracy improved first.\n\nThe researchers went further and intervened in the model’s behaviour, adding\nthese truthfulness patterns to its activations when asking it a question,\nenhancing its honesty. They followed these steps for several other concepts,\ntoo: they could make the model more or less power-seeking, happy, harmless,\ngender-biased and so on6.\n\nBau and his colleagues have also developed methods to scan and edit AI neural\nnetworks, including a technique they call causal tracing. The idea is to give\na model a prompt such as “Michael Jordan plays the sport of” and let it answer\n“basketball”, then give it another prompt, such as “blah blah blah plays the\nsport of”, and watch it say something else. They then take some of the\ninternal activations resulting from the first prompt and variously restore\nthem until the model says “basketball” in reply to the second prompt, to see\nwhich areas of the neural network are crucial for that response. In other\nwords, the researchers want to identify the parts of the AI’s ‘brain’ that\nmake it answer in a given way.\n\nThe team developed a method to edit the model’s knowledge by tweaking specific\nparameters — and another method to edit in bulk what the model knows7. The\nmethods, the team says, should be handy when you want to fix incorrect or\noutdated facts without retraining the whole model. Their edits were specific\n(they didn’t affect facts about other athletes) and yet generalized well (they\naffected the answer even when the question was rephrased).\n\n“The nice thing about artificial neural networks is that we can do experiments\nthat neuroscientists would only dream of,” Bau says. “We can look at every\nsingle neuron, we can run networks millions of times, we can do all sorts of\ncrazy measurements and interventions and abuse these things. And we don’t have\nto get a consent form.” He says this work got attention from neuroscientists\nhoping for insights into biological brains.\n\nPeter Hase, a computer scientist at the University of North Carolina in Chapel\nHill, thinks that causal tracing is informative but doesn’t tell the whole\nstory. He has done work showing that a]","LLMs trained on text form internal representations by learning patterns and relationships within the data. These representations can be manipulated by tweaking specific parameters or by editing in bulk what the model knows. Researchers have developed techniques like causal tracing to identify and edit these representations, allowing them to control the model's output and potentially fix incorrect or outdated information.",multi_context,"[{'source': 'https://www.nature.com/articles/d41586-024-01314-y', 'title': 'How does ChatGPT ‘think’? Psychology and neuroscience crack open AI large language models', 'description': 'Researchers are striving to reverse-engineer artificial intelligence and scan the ‘brains’ of LLMs to see what they are doing, how and why.', 'language': 'en'}, {'source': 'https://www.nature.com/articles/d41586-024-01314-y', 'title': 'How does ChatGPT ‘think’? Psychology and neuroscience crack open AI large language models', 'description': 'Researchers are striving to reverse-engineer artificial intelligence and scan the ‘brains’ of LLMs to see what they are doing, how and why.', 'language': 'en'}]",True
95,How do robotic foundation models use diverse data to improve on traditional training?,"[ case from a diversity of\nrobot forms, from single arms to quadrupeds. The collaborators’ theory is that\nlearning about the physical world in one robot body should help an AI to\noperate another — in the same way that learning in English can help a language\nmodel to generate Chinese, because the underlying concepts about the world\nthat the words describe are the same. This seems to work. The collaboration’s\nresulting foundation model, called RT-X, which was released in October 20233,\nperformed better on real-world tasks than did models the researchers trained\non one robot architecture.\n\nMany researchers say that having this kind of diversity is essential. “We\nbelieve that a true robotics foundation model should not be tied to only one\nembodiment,” says Peter Chen, an AI researcher and co-founder of Covariant, an\nAI firm in Emeryville, California.\n\nCovariant is also working hard on scaling up robot data. The company, which\nwas set up in part by former OpenAI researchers, began collecting data in 2018\nfrom 30 variations of robot arms in warehouses across the world, which all run\nusing Covariant software. Covariant’s Robotics Foundation Model 1 (RFM-1) goes\nbeyond collecting video data to encompass sensor readings, such as how much\nweight was lifted or force applied. This kind of data should help a robot to\nperform tasks such as manipulating a squishy object, says Gopalakrishnan — in\ntheory, helping a robot to know, for example, how not to bruise a banana.\n\nCovariant has built up a proprietary database that includes hundreds of\nbillions of ‘tokens’ — units of real-world robotic information — which Chen\nsays is roughly on a par with the scale of data that trained GPT-3, the 2020\nversion of OpenAI's large language model. “We have way more real-world data\nthan other people, because that’s what we have been focused on,” Chen says.\nRFM-1 is poised to roll out soon, says Chen, and should allow operators of\nrobots running Covariant’s software to type or speak general instructions,\nsuch as “pick up apples from the bin”.\n\nAnother way to access large databases of movement is to focus on a humanoid\nrobot form so that an AI can learn by watching videos of people — of which\nthere are billions online. Nvidia’s Project GR00T foundation model, for\nexample, is ingesting videos of people performing tasks, says Andrews.\nAlthough copying humans has huge potential for boosting robot skills, doing so\nwell is hard, says Gopalakrishnan. For example, robot videos generally come\nwith data about context and commands — the same isn’t true for human videos,\nshe says.\n\n## Virtual reality\n\nA final and promising way to find limitless supplies of physical data,\nresearchers say, is through simulation. Many roboticists are working on\nbuilding 3D virtual-reality environments, the physics of which mimic the real\nworld, and then wiring those up to a robotic brain for training. Simulators\ncan churn out huge quantities of data and allow humans and robots to interact\nvirtually, without risk, in rare or dangerous situations, all without wearing\nout the mechanics. “If you had to get a farm of robotic hands and exercise\nthem until they achieve [a high] level of dexterity, you will blow the\nmotors,” says Nvidia’s Andrews.\n\nBut making a good simulator is a difficult task. “Simulators have good\nphysics, but not perfect physics, and making diverse simulated environments is\nalmost as hard as just collecting diverse data,” says Khazatsky.\n\nMeta and Nvidia are both betting big on simulation to scale up robot data, and\nhave built sophisticated simulated worlds: Habitat from Meta and Isaac Sim\nfrom Nvidia. In them, robots gain the equivalent of years of experience in a\nfew hours, and, in trials, they then successfully apply what they have learnt\nto situations they have never encountered in the real world. “Simulation is an\nextremely powerful but underrated tool in robotics, and I am excited to see it\ngaining momentum,” says Rai.\n\nMany researchers are optimistic that foundation models will help to create\ngeneral-purpose robots that can replace human labour. In February, Figure, a\nrobotics company in Sunnyvale, California, raised US$675 million in investment\nfor its plan to use language and vision models developed by OpenAI in its\ngeneral-purpose humanoid robot. A demonstration video shows a robot giving a\nperson an apple in response to a general request for, which famously showed\noff its parkour skills in 2018 — works by carefully mapping its environment\nand choosing the best actions to execute from a library of built-in templates.\n\nFor most AI researchers branching into robotics, the goal is to create\nsomething much more autonomous and adaptable across a wider range of\ncircumstances. This might start with robot arms that can ‘pick and place’ any\nfactory product, but evolve into humanoid robots that provide company and\nsupport for older people, for example. “There are so many applications,” says\nSidopoulos.\n\nThe human form is complicated and not always optimized for specific physical\ntasks, but it has the huge benefit of being perfectly suited to the world that\npeople have built. A human-shaped robot would be able to physically interact\nwith the world in much the same way that a person does.\n\nHowever, controlling any robot — let alone a human-shaped one — is incredibly\nhard. Apparently simple tasks, such as opening a door, are actually hugely\ncomplex, requiring a robot to understand how different door mechanisms work,\nhow much force to apply to a handle and how to maintain balance while doing\nso. The real world is extremely varied and constantly changing.\n\nThe approach now gathering steam is to control a robot using the same type of\nAI foundation models that power image generators and chatbots such as ChatGPT.\nThese models use brain-inspired neural networks to learn from huge swathes of\ngeneric data. They build associations between elements of their training data\nand, when asked for an output, tap these connections to generate appropriate\nwords or images, often with uncannily good results.\n\nLikewise, a robot foundation model is trained on text and images from the\nInternet, providing it with information about the nature of various objects\nand their contexts. It also learns from examples of robotic operations. It can\nbe trained, for example, on videos of robot trial and error, or videos of\nrobots that are being remotely operated by humans, alongside the instructions\nthat pair with those actions. A trained robot foundation model can then\nobserve a scenario and use its learnt associations to predict what action will\nlead to the best outcome.\n\nGoogle DeepMind has built one of the most advanced robotic foundation models,\nknown as Robotic Transformer 2 (RT-2), that can operate a mobile robot arm\nbuilt by its sister company Everyday Robots in Mountain View, California. Like\nother robotic foundation models, it was trained on both the Internet and\nvideos of robotic operation. Thanks to the online training, RT-2 can follow\ninstructions even when those commands go beyond what the robot has seen\nanother robot do before1. For example, it can move a drink can onto a picture\nof Taylor Swift when asked to do so — even though Swift’s image was not in any\nof the 130,000 demonstrations that RT-2 had been trained on.\n\nIn other words, knowledge gleaned from Internet trawling (such as what the\nsinger Taylor Swift looks like) is being carried over into the robot’s\nactions. “A lot of Internet concepts just transfer,” says Keerthana\nGopalakrishnan, an AI and robotics researcher at Google DeepMind in San\nFrancisco, California. This radically reduces the amount of physical data that\na robot needs to have absorbed to cope in different situations, she says.\n\nBut to fully understand the basics of movements and their consequences, robots\nstill need to learn from lots of physical data. And therein lies a problem.\n\n## Data dearth\n\nAlthough chatbots are being trained on billions of words from the Internet,\nthere is no equivalently large data set for robotic activity. This lack of\ndata has left robotics “in the dust”, says Khazatsky.\n\nPooling data is one way around this. Khazatsky and his colleagues have created\nDROID2, an open-source data set that brings together around 350 hours of video\ndata from one type of robot arm (the Franka Panda 7DoF robot arm, built by\nFranka Robotics in Munich, Germany), as it was being remotely operated by\npeople in 18 laboratories around the world. The robot-eye-view camera has\nrecorded visual data in hundreds of environments, including bathrooms, laundry\nrooms, bedrooms and kitchens. This diversity helps robots to perform well on\ntasks with previously unencountered elements, says Khazatsky.\n\nWhen prompted to ‘pick up extinct animal’, Google’s RT-2 model selects the\ndinosaur figurine from a crowded table.Credit: Google DeepMind\n\nGopalakrishnan is part of a collaboration of more than a dozen academic labs\nthat is also bringing together robotic data, in its]","Robotic foundation models use diverse data to improve on traditional training by learning from a variety of sources, including text and images from the internet, videos of robotic operations, and sensor readings from real-world robots. This allows them to develop a more comprehensive understanding of the world and how to interact with it, leading to more adaptable and autonomous robots.",multi_context,"[{'source': 'https://www.nature.com/articles/d41586-024-01442-5', 'title': 'The AI revolution is coming to robots: how will it change them?', 'description': 'The melding of artificial intelligence and robotics could catapult both fields to new heights.', 'language': 'en'}, {'source': 'https://www.nature.com/articles/d41586-024-01442-5', 'title': 'The AI revolution is coming to robots: how will it change them?', 'description': 'The melding of artificial intelligence and robotics could catapult both fields to new heights.', 'language': 'en'}]",True
96,"How might foundation models impact robotics, given AI's current limitations?","[Skip to main content\n\nThank you for visiting nature.com. You are using a browser version with\nlimited support for CSS. To obtain the best experience, we recommend you use a\nmore up to date browser (or turn off compatibility mode in Internet Explorer).\nIn the meantime, to ensure continued support, we are displaying the site\nwithout styles and JavaScript.\n\nAdvertisement\n\n * View all journals\n * Search\n * Log in\n\n * Explore content\n * About the journal\n * Publish with us\n * Subscribe\n\n * Sign up for alerts\n * RSS feed\n\n 1. nature\n 2. news feature\n 3. article\n\nHow does ChatGPT ‘think’? Psychology and neuroscience crack open AI large\nlanguage models\n\nDownload PDF\n\n * NEWS FEATURE\n * 14 May 2024\n * Correction 17 May 2024\n\n# How does ChatGPT ‘think’? Psychology and neuroscience crack open AI large\nlanguage models\n\nResearchers are striving to reverse-engineer artificial intelligence and scan\nthe ‘brains’ of LLMs to see what they are doing, how and why.\n\nBy\n\n * Matthew Hutson0\n\n 1. Matthew Hutson\n 1. Matthew Hutson is a science writer based in New York City.\n\nView author publications\n\nYou can also search for this author in PubMed Google Scholar\n\n * Twitter\n * Facebook\n * Email\n\nIllustration: Fabio Buonocore\n\nYou have full access to this article via your institution.\n\nDownload PDF\n\nDownload PDF\n\nDavid Bau is very familiar with the idea that computer systems are becoming so\ncomplicated it’s hard to keep track of how they operate. “I spent 20 years as\na software engineer, working on really complex systems. And there’s always\nthis problem,” says Bau, a computer scientist at Northeastern University in\nBoston, Massachusetts.\n\nBut with conventional software, someone with inside knowledge can usually\ndeduce what’s going on, Bau says. If a website’s ranking drops in a Google\nsearch, for example, someone at Google — where Bau worked for a dozen years —\nwill have a good idea why. “Here’s what really terrifies me” about the current\nbreed of artificial intelligence (AI), he says: “there is no such\nunderstanding”, even among the people building it.\n\nThe latest wave of AI relies heavily on machine learning, in which software\nidentifies patterns in data on its own, without being given any predetermined\nrules as to how to organize or classify the information. These patterns can be\ninscrutable to humans. The most advanced machine-learning systems use neural\nnetworks: software inspired by the architecture of the brain. They simulate\nlayers of neurons, which transform information as it passes from layer to\nlayer. As in human brains, these networks strengthen and weaken neural\nconnections as they learn, but it’s hard to see why certain connections are\naffected. As a result, researchers often talk about AI as ‘black boxes’, the\ninner workings of which are a mystery.\n\nChatGPT broke the Turing test — the race is on for new ways to assess AI\n\nIn the face of this difficulty, researchers have turned to the field of\nexplainable AI (XAI), expanding its inventory of tricks and tools to help\nreverse-engineer AI systems. Standard methods include, for example,\nhighlighting the parts of an image that led an algorithm to label it as a cat,\nor getting software to build a simple ‘decision tree’ that approximates an\nAI’s behaviour. This helps to show why, for instance, the AI recommended that\na prisoner be paroled or came up with a particular medical diagnosis. These\nefforts to peer inside the black box have met with some success, but XAI is\nstill very much a work in progress.\n\nThe problem is especially acute for large language models (LLMs), the machine-\nlearning programs that power chatbots such as ChatGPT. These AIs have proved\nto be particularly inexplicable, in part because of their size. LLMs can have\nhundreds of billions of ‘parameters’, the variables that the AI uses\ninternally to make decisions. XAI has “rapidly grown in the past few years,\nespecially since LLMs have started to emerge”, says Mor Geva, a computer\nscientist at Tel Aviv University in Israel.\n\nThese inscrutable models are now taking on important tasks. People are using\nLLMs to seek medical advice, write computer code, summarize the news, draft]",,multi_context,"[{'source': 'https://www.nature.com/articles/d41586-024-01314-y', 'title': 'How does ChatGPT ‘think’? Psychology and neuroscience crack open AI large language models', 'description': 'Researchers are striving to reverse-engineer artificial intelligence and scan the ‘brains’ of LLMs to see what they are doing, how and why.', 'language': 'en'}]",True
97,"How do AI weapons' accuracy with radar signals compare to their potential for misidentification in visual recognition, and what ethical issues arise?","[ the country “already conducts fully\nrobotic operations, without human intervention”3.\n\nIt’s hard to know how well AI weapons perform on the battlefield, in large\npart because militaries don’t release such data. Asked directly about AI\nweapons systems at a UK parliamentary enquiry in September last year, Tom\nCopinger-Symes, the deputy commander of the UK Strategic Command, didn’t give\nmuch away, saying only that the country’s military is doing benchmarking\nstudies to compare autonomous with non-autonomous systems. “Inevitably, you\nwant to check that this is delivering a bang for a buck compared with the old-\nfashioned system of having ten imagery analysts looking at the same thing,” he\nsaid.\n\nAlthough real-world battlefield data is sparse, researchers note that AI has\nsuperior processing and decision-making skills that, in theory, offer a\nsignificant advantage. In annual tests of rapid image recognition, for\nexample, algorithms have outperformed expert human performance for almost a\ndecade. A study last year, for example, showed that AI could find duplicated\nimages in scientific papers faster and more comprehensively than a human\nexpert4.\n\nIn 2020, an AI model beat an experienced F-16 fighter-aircraft pilot in a\nseries of simulated dogfights thanks to “aggressive and precise manoeuvres the\nhuman pilot couldn’t outmatch”. Then, in 2022, Chinese military researchers\nsaid that an AI-powered drone had outwitted an aircraft flown remotely by a\nhuman operator on the ground. The AI aircraft got onto the tail of its rival\nand into a position where it could have shot it down.\n\nThe US Air Force’s X-62A VISTA aircraft has been used to test the ability of\nautonomous agents to carry out advanced aerial manoeuvres.Credit: U.S. Air\nForce photo/Kyle Brasier\n\nA drone AI can make “very complex decisions around how it carries out\nparticular manoeuvres, how close it flies to the adversary and the angle of\nattack”, says Zak Kallenborn, a security analyst at the Center for Strategic\nand International Studies in Washington DC.\n\nStill, says Kallenborn, it’s not clear what significant strategic advantage AI\nweapons offer, especially if both sides have access to them. “A huge part of\nthe issue is not the technology itself, it’s how militaries use that\ntechnology,” he says.\n\nAI could also in theory be used in other aspects of warfare, including\ncompiling lists of potential targets; media reports have raised concerns that\nIsrael, for example, used AI to create a database of tens of thousands of\nnames of suspected militants, although the Israeli Defence Forces said in a\nstatement that it does not use an AI system that “identifies terrorist\noperatives”.\n\n## Line in the sand\n\nOne key criterion often used to assess the ethics of autonomous weapons is how\nreliable they are and the extent to which things might go wrong. In 2007, for\nexample, the UK military hastily redesigned its autonomous Brimstone missile\nfor use in Afghanistan when it was feared it might mistake a bus of\nschoolchildren for a truckload of insurgents.\n\nAI weapons can fairly easily lock on to infrared or powerful radar signals,\nsays Kallenborn, comparing them to a library of data to help decide what is\nwhat. “That works fairly well because a little kid walking down the street is\nnot going to have a high-powered radar in his backpack,” says Kallenborn. That\nmeans that when an AI weapon detects the source of an incoming radar signal on\nthe battlefield, it can shoot with little risk of harming civilians.\n\nBut visual image recognition is more problematic, he says. “Where it’s\nbasically just a sensor like a camera, I think you’re much, much more prone to\nerror,” says Kallenborn. Although AI is good at identifying images, it’s not\nfoolproof. Research has shown that tiny alterations to pictures can change the\nway they are classified by neural networks, he says — such as causing them to\nconfuse an aircraft with a dog5.\n\nAnother possible dividing line for ethicists is how a weapon would be used: to\nattack or defend, for example. Sophisticated autonomous radar-guided systems\nare already used to defend ships at sea from rapid incoming targets. Lucy\nSuchman, a sociologist at Lancaster University, UK, who studies the\ninteractions between people and machines, says that ethicists are more\ncomfortable with this type of autonomous weapon because it targets ordnance\nrather than]","AI weapons are more accurate with radar signals, as they can easily lock onto powerful signals and compare them to a database to identify targets. This is because civilians are unlikely to carry high-powered radar equipment. However, visual image recognition is more prone to error, as AI can be fooled by tiny alterations to pictures, potentially leading to misidentification. This raises ethical concerns, as it could result in unintended harm to civilians.",multi_context,"[{'source': 'https://www.nature.com/articles/d41586-024-01029-0', 'title': 'Lethal AI weapons are here: how can we control them?', 'description': 'Autonomous weapons guided by artificial intelligence are already in use. Researchers, legal experts and ethicists are struggling with what should be allowed on the battlefield.', 'language': 'en'}]",True
