In [19]:
import nest_asyncio

nest_asyncio.apply()

In [20]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

In [21]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader

path = "data/"
text_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)

In [23]:
training_documents = text_splitter.split_documents(text_loader.load())

In [24]:
len(training_documents)

162

In [25]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [26]:
training_documents

[Document(metadata={'source': 'data/2023_llms.html', 'title': 'Stuff we figured out about AI in 2023', 'id': 'f09bb01c-5a7b-44a1-99dd-2837b6aaca4c'}, page_content='Stuff we figured out about AI in 2023\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSimon Willison’s Weblog\nSubscribe\n\n\n\n\n\n\nStuff we figured out about AI in 2023\n31st December 2023\n2023 was the breakthrough year for Large Language Models (LLMs). I think it’s OK to call these AI—they’re the latest and (currently) most interesting development in the academic field of Artificial Intelligence that dates back to the 1950s.\nHere’s my attempt to round up the highlights in one place!'),
 Document(metadata={'source': 'data/2023_llms.html', 'title': 'Stuff we figured out about AI in 2023', 'id': '4adb9328-c3e9-498a-a3af-dac2c0468835'}, page_content='Large Language Models\nThey’re actually quite easy to build\nYou can run LLMs on your own devices\nHobbyists can build their own fine-tuned models\nWe don’t yet know how to build G

In [27]:
training_split_documents = training_documents[:len(training_documents) - 62]
val_split_documents = training_documents[-62: -31]
test_split_documents = training_documents[-31:]

In [28]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [29]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [30]:
qa_prompt_template

ChatPromptTemplate(input_variables=['context', 'n_questions'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'n_questions'], input_types={}, partial_variables={}, template='Given the following context, you must generate questions based on only the provided context.\n\nYou are to generate {n_questions} questions which should be provided in the following format:\n\n1. QUESTION #1\n2. QUESTION #2\n...\n\nContext:\n{context}\n'), additional_kwargs={})])

We'll create a simple chain to query the LLM!

In [31]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [32]:
import asyncio
import uuid
from tqdm import tqdm

async def process_document(document, n_questions):
    questions_generated = await question_generation_chain.ainvoke({"context": document.page_content, "n_questions": n_questions})

    doc_questions = {}
    doc_relevant_docs = {}

    for question in questions_generated.content.split("\n"):
        question_id = str(uuid.uuid4())
        doc_questions[question_id] = "".join(question.split(".")[1:]).strip()
        doc_relevant_docs[question_id] = [document.metadata["id"]]

    return doc_questions, doc_relevant_docs

async def create_questions(documents, n_questions):
    tasks = [process_document(doc, n_questions) for doc in documents]

    questions = {}
    relevant_docs = {}

    for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc="Processing documents"):
        doc_questions, doc_relevant_docs = await task
        questions.update(doc_questions)
        relevant_docs.update(doc_relevant_docs)

    return questions, relevant_docs

In [33]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing documents: 100%|██████████| 100/100 [00:10<00:00,  9.99it/s]


We'll use the function to generate training, validation, and test data.

In [34]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing documents: 100%|██████████| 31/31 [00:04<00:00,  6.78it/s]


In [35]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 100%|██████████| 31/31 [00:04<00:00,  7.35it/s]


In [36]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [37]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [38]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [39]:
#!pip install -qU sentence_transformers datasets pyarrow

In [40]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [41]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [42]:
BATCH_SIZE = 10

In [43]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [44]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [45]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [46]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [47]:
EPOCHS = 10

In [48]:
import wandb
wandb.init(mode="disabled")

In [49]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
20,No log,No log,0.774194,0.983871,1.0,1.0,0.774194,0.327957,0.2,0.1,0.774194,0.983871,1.0,1.0,0.900761,0.866935,0.866935
40,No log,No log,0.806452,0.983871,1.0,1.0,0.806452,0.327957,0.2,0.1,0.806452,0.983871,1.0,1.0,0.919002,0.891129,0.891129
50,No log,No log,0.774194,0.983871,1.0,1.0,0.774194,0.327957,0.2,0.1,0.774194,0.983871,1.0,1.0,0.904985,0.872312,0.872312
60,No log,No log,0.774194,0.983871,1.0,1.0,0.774194,0.327957,0.2,0.1,0.774194,0.983871,1.0,1.0,0.904985,0.872312,0.872312
80,No log,No log,0.758065,0.983871,1.0,1.0,0.758065,0.327957,0.2,0.1,0.758065,0.983871,1.0,1.0,0.899032,0.864247,0.864247
100,No log,No log,0.790323,0.983871,1.0,1.0,0.790323,0.327957,0.2,0.1,0.790323,0.983871,1.0,1.0,0.910937,0.880376,0.880376
120,No log,No log,0.774194,0.983871,1.0,1.0,0.774194,0.327957,0.2,0.1,0.774194,0.983871,1.0,1.0,0.902873,0.869624,0.869624
140,No log,No log,0.774194,0.967742,1.0,1.0,0.774194,0.322581,0.2,0.1,0.774194,0.967742,1.0,1.0,0.901755,0.86828,0.86828
150,No log,No log,0.774194,0.983871,1.0,1.0,0.774194,0.327957,0.2,0.1,0.774194,0.983871,1.0,1.0,0.902873,0.869624,0.869624
160,No log,No log,0.774194,0.983871,1.0,1.0,0.774194,0.327957,0.2,0.1,0.774194,0.983871,1.0,1.0,0.902873,0.869624,0.869624


In [53]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [54]:
hf_username = "dataera2013"

In [55]:
model.push_to_hub(f"{hf_username}/mt-1")

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/dataera2013/mt-1/commit/0b529104372cfa96a990b1416c9caf42430da234'

In [56]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from tqdm.auto import tqdm

In [57]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

In [59]:
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

  0%|          | 0/62 [00:00<?, ?it/s]

In [60]:
te3_results_df = pd.DataFrame(te3_results)

In [62]:
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

np.float64(1.0)

In [63]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

  0%|          | 0/62 [00:00<?, ?it/s]

In [64]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [65]:
arctic_embed_m_results_df

Unnamed: 0,id,question,expected_id,is_hit
0,fb21ff81-8635-4dae-bce1-96a18423d51d,What challenges are associated with establishi...,3c66f935-0956-4c3c-ae97-def132e9d9c2,True
1,95ba87fa-dd80-47cb-abb4-446737f65e99,How can personalized health interventions base...,3c66f935-0956-4c3c-ae97-def132e9d9c2,True
2,1a0685e4-ee25-4219-b546-2f213f577ae1,What are some of the challenges associated wit...,49468587-d3dc-45cd-a0db-1d3e7684f12d,True
3,7d488b6c-680d-42f1-a2c4-66d78d7a4a01,How does augmented reality enhance learning ex...,49468587-d3dc-45cd-a0db-1d3e7684f12d,True
4,72ccea24-c792-4e1a-9c2e-e1219298112b,What are some of the risks that microplastics ...,d7a4c7ff-4df3-413f-9616-519558e16966,True
...,...,...,...,...
57,5f82045f-5458-405c-b77a-5975b816675d,How do mitigation and adaptation strategies ne...,e3d45615-0d48-42d7-b394-008f06ee806d,True
58,aa86a73e-3cb3-46fd-8891-689a9f4c8c69,What are some of the latest advances in energy...,f3c85b89-8e77-4abc-b7e9-bfba6e9e49e9,True
59,2fdeb3d3-c69d-4f3e-b44a-d2277c3c3c46,How do Zinc-Air and Sodium-Ion batteries contr...,f3c85b89-8e77-4abc-b7e9-bfba6e9e49e9,True
60,f3b51f65-0e2f-4d5b-9a7e-1afdf3e78efe,What are some of the key challenges to biodive...,501d67c3-de50-48e4-bc66-0b3029dd1fcc,False


In [66]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

np.float64(0.8548387096774194)

### `Snowflake/snowflake-arctic-embed-l` (fine-tuned)

In [69]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic_ft")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic_ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/62 [00:00<?, ?it/s]

In [70]:
finetune_results_df = pd.DataFrame(finetune_results)

In [71]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

np.float64(1.0)

In [72]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 50,
    length_function = len
)

training_documents = text_splitter.split_documents(text_loader.load())

In [73]:
from langchain_community.vectorstores import FAISS

base_vectorstore = FAISS.from_documents(training_documents, huggingface_embeddings)
base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 6})

In [74]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

In [75]:
rag_llm =  ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [76]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

base_rag_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [77]:
base_rag_chain.invoke({"question" : "What is an agent?"})["response"]

'An agent, in the context of AI, is a term that refers to AI systems that can act on your behalf. However, the term is considered vague and lacks a single, clear definition. Some people view agents as systems that autonomously perform tasks, while others think of them as LLMs (large language models) that utilize tools to solve problems. The concept of "agents" is still evolving, and there are ongoing discussions about their utility and effectiveness, particularly regarding their ability to distinguish truth from fiction.'

In [78]:
base_rag_chain.invoke({"question" : "Who has produced better models than GPT-3?"})["response"]

'Organizations that have produced better-than-GPT-3 class models include Anthropic, Mistral, Google, Meta, EleutherAI, Stability AI, TII in Abu Dhabi (Falcon), Microsoft Research, xAI, Replit, Baidu, and several others.'

In [79]:
base_rag_chain.invoke({"question" : "What is the laziest AI month?"})["response"]

'I do not know.'

In [80]:
base_rag_chain.invoke({"question" : "What is the largest model that Simon has run on his phone?"})["response"]

'I do not know.'

In [81]:
finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [82]:
finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [83]:
finetune_rag_chain.invoke({"question" : "What is an Agent?"})["response"]

'An "agent" is a term that lacks a single, clear, and widely understood meaning in the context of AI. It generally refers to AI systems that can act on behalf of a user, but interpretations vary. Some people view agents as entities that perform tasks like a travel agent, while others think of them as LLMs (Large Language Models) that utilize tools to solve problems. The term is often associated with concepts of autonomy, but there is significant ambiguity surrounding its definition and practical implementation.'

In [84]:
finetune_rag_chain.invoke({"question" : "Who has produced better models than GPT-3?"})["response"]

'Organizations that have produced better models than GPT-3 include Anthropic, Mistral, Google, Meta, EleutherAI, Stability AI, TII in Abu Dhabi (Falcon), Microsoft Research, xAI, Replit, Baidu, and several others.'

In [85]:
finetune_rag_chain.invoke({"question" : "What is the laziest AI month?"})["response"]

'I do not know.'

In [86]:
finetune_rag_chain.invoke({"question" : "What is the largest model that Simon has run on his phone?"})["response"]

'Simon has run the Llama 3.2 3B model on his iPhone.'

In [87]:
# First, install all required NLTK data
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download all NLTK data
nltk.download('all')

# Now proceed with loading
from langchain_community.document_loaders import DirectoryLoader
path = "data/"
loader = DirectoryLoader(path, glob="*.html")
docs = loader.load()

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

In [88]:
# Now proceed with loading the HTML files
from langchain_community.document_loaders import DirectoryLoader

path = "data/"
loader = DirectoryLoader(path, glob="*.html")
docs = loader.load()

In [89]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [90]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Applying HeadlinesExtractor:   0%|          | 0/3 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/5 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/3 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/18 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/39 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

In [91]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What role did OpenAI play in the development o...,[Gullibility is the biggest unsolved problem C...,"A year ago, OpenAI was the only organization t...",single_hop_specifc_query_synthesizer
1,"Does ChatGPT get lazy in Decembur, and how doe...",[that they haven’t shared yet. Vibes Based Dev...,The honest answer is 'maybe'! No-one is entire...,single_hop_specifc_query_synthesizer
2,What were the key developments in Large Langua...,[Simon Willison’s Weblog Subscribe Stuff we fi...,2023 was the breakthrough year for Large Langu...,single_hop_specifc_query_synthesizer
3,What insights did Simon Willison share about A...,[issues in a way that’s surprisingly easy to f...,Simon Willison shared insights on various AI d...,single_hop_specifc_query_synthesizer
4,Why is gullibility considered a major unsolved...,[<1-hop>\n\nGullibility is the biggest unsolve...,Gullibility is considered a major unsolved pro...,multi_hop_abstract_query_synthesizer
5,How does the uneven distribution of knowledge ...,[<1-hop>\n\nPrompt driven app generation is a ...,The uneven distribution of knowledge in AI tec...,multi_hop_abstract_query_synthesizer
6,How have advancements in large language models...,[<1-hop>\n\nGullibility is the biggest unsolve...,Advancements in large language models (LLMs) h...,multi_hop_abstract_query_synthesizer
7,What are the challenges in evaluating Large La...,[<1-hop>\n\nGullibility is the biggest unsolve...,Evaluating Large Language Models (LLMs) presen...,multi_hop_abstract_query_synthesizer
8,How did the advancements in large language mod...,[<1-hop>\n\nSimon Willison’s Weblog Subscribe ...,"In 2024, significant advancements in large lan...",multi_hop_specific_query_synthesizer
9,How does the efficiency and cost-effectiveness...,[<1-hop>\n\nreasoning patterns. Another common...,GPT-4o is significantly more cost-effective co...,multi_hop_specific_query_synthesizer


In [92]:
path = "data/"
loader = DirectoryLoader(path, glob="*.html")
docs = loader.load()

In [93]:
base_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")

In [94]:
ft_embeddings = HuggingFaceEmbeddings(model_name="dataera2013/legal-ft-2")

Some weights of BertModel were not initialized from the model checkpoint at dataera2013/legal-ft-2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [95]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient(":memory:")

client.create_collection(
    collection_name="base_ai_across_years",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

client.create_collection(
    collection_name="ft_ai_across_years",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

base_vector_store = QdrantVectorStore(
    client=client,
    collection_name="base_ai_across_years",
    embedding=base_embeddings,
)

ft_vector_store = QdrantVectorStore(
    client=client,
    collection_name="ft_ai_across_years",
    embedding=ft_embeddings,
)

In [96]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(docs)
len(split_documents)

117

In [97]:
_ = base_vector_store.add_documents(documents=split_documents)

In [98]:
__ = ft_vector_store.add_documents(documents=split_documents)

In [99]:
base_retriever = base_vector_store.as_retriever(search_kwargs={"k": 5})
ft_retriever = ft_vector_store.as_retriever(search_kwargs={"k": 5})

In [100]:
def base_retrieve(state):
  retrieved_docs = base_retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

def ft_retrieve(state): 
  retrieved_docs = ft_retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [101]:
from langchain.prompts import ChatPromptTemplate

RAG_PROMPT = """\
You are a helpful assistant who answers questions based on provided context. You must only use the provided context, and cannot use your own knowledge.

### Question
{question}

### Context
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [102]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [103]:
def generate(state):
  docs_content = "\n\n".join(doc.page_content for doc in state["context"])
  messages = rag_prompt.format_messages(question=state["question"], context=docs_content)
  response = llm.invoke(messages)
  return {"response" : response.content}

In [104]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class BaseState(TypedDict):
  question: str
  context: List[Document]
  response: str

class FState(TypedDict):
  question: str
  context: List[Document]
  response: str


In [105]:
base_graph_builder = StateGraph(BaseState).add_sequence([base_retrieve, generate])
base_graph_builder.add_edge(START, "base_retrieve")
base_graph = base_graph_builder.compile()

ft_graph_builder = StateGraph(FState).add_sequence([ft_retrieve, generate])
ft_graph_builder.add_edge(START, "ft_retrieve")
ft_graph = ft_graph_builder.compile()

In [106]:
response = base_graph.invoke({"question" : "How are LLM agents useful?"})
response["response"]

'LLM agents can be useful in several ways, though they come with inherent challenges. They are particularly effective at writing code, utilizing the simpler grammar rules of programming languages, which makes them adept at assisting in software development. Additionally, they can help generate training data for smaller, more cost-effective models by leveraging larger models to create high-quality examples.\n\nHowever, the key to maximizing the utility of LLMs lies in understanding their power and unreliability. Users must develop skills to navigate their flaws and avoid pitfalls. There are good applications for LLMs, but realizing their value requires guidance and education, as well as recognition of their complexities. Ultimately, LLM agents have the potential for genuine value, particularly in tasks like coding, when used with an informed and careful approach.'

In [107]:
response = ft_graph.invoke({"question" : "How are LLM agents useful?"})
response["response"]

'LLM agents are useful in various ways, particularly in enhancing productivity and decision-making. Here are some specific applications highlighted in the context:\n\n1. **Answering Questions**: LLMs are capable of providing answers to queries, making them valuable tools for information retrieval.\n2. **Summarizing Documents**: They can condense large volumes of information into concise summaries, aiding in information processing.\n3. **Language Translation**: LLMs facilitate communication by translating text from one language to another.\n4. **Information Extraction**: They can extract relevant data from various sources, simplifying the analysis of information.\n5. **Code Generation**: LLMs have shown competence in writing code, which can significantly improve workflow for programmers.\n6. **Personal Productivity**: Users can leverage LLMs for personal tasks, enhancing their quality of life by improving efficiency and effectiveness in various activities.\n\nWhile there are many positi

In [108]:
base_dataset = dataset
ft_dataset = dataset

for base_test_row in base_dataset:
  base_response = base_graph.invoke({"question" : base_test_row.eval_sample.user_input})
  base_test_row.eval_sample.response = base_response["response"]
  base_test_row.eval_sample.retrieved_contexts = [context.page_content for context in base_response["context"]]

for ft_test_row in ft_dataset:
  ft_response = ft_graph.invoke({"question" : ft_test_row.eval_sample.user_input})
  ft_test_row.eval_sample.response = ft_response["response"]
  ft_test_row.eval_sample.retrieved_contexts = [context.page_content for context in ft_response["context"]]

In [109]:
base_dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,What role did OpenAI play in the development o...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[Gullibility is the biggest unsolved problem C...,OpenAI played a significant role in the develo...,"A year ago, OpenAI was the only organization t...",single_hop_specifc_query_synthesizer
1,"Does ChatGPT get lazy in Decembur, and how doe...","[But on the other hand, the things you sometim...",[that they haven’t shared yet. Vibes Based Dev...,"Yes, ChatGPT may get ""lazy"" in December, as it...",The honest answer is 'maybe'! No-one is entire...,single_hop_specifc_query_synthesizer
2,What were the key developments in Large Langua...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[Simon Willison’s Weblog Subscribe Stuff we fi...,"In 2023, the key developments in Large Languag...",2023 was the breakthrough year for Large Langu...,single_hop_specifc_query_synthesizer
3,What insights did Simon Willison share about A...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[issues in a way that’s surprisingly easy to f...,"In 2023, Simon Willison highlighted several ke...",Simon Willison shared insights on various AI d...,single_hop_specifc_query_synthesizer
4,Why is gullibility considered a major unsolved...,[Gullibility is the biggest unsolved problem\n...,[<1-hop>\n\nGullibility is the biggest unsolve...,Gullibility is considered a major unsolved pro...,Gullibility is considered a major unsolved pro...,multi_hop_abstract_query_synthesizer
5,How does the uneven distribution of knowledge ...,[There is so much space for helpful education ...,[<1-hop>\n\nPrompt driven app generation is a ...,The uneven distribution of knowledge in AI tec...,The uneven distribution of knowledge in AI tec...,multi_hop_abstract_query_synthesizer
6,How have advancements in large language models...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[<1-hop>\n\nGullibility is the biggest unsolve...,Advancements in large language models (LLMs) h...,Advancements in large language models (LLMs) h...,multi_hop_abstract_query_synthesizer
7,What are the challenges in evaluating Large La...,[Code may be the best application\n\nThe ethic...,[<1-hop>\n\nGullibility is the biggest unsolve...,Evaluating Large Language Models (LLMs) presen...,Evaluating Large Language Models (LLMs) presen...,multi_hop_abstract_query_synthesizer
8,How did the advancements in large language mod...,[Key AI Developments in Early 2024\n\nPublishe...,[<1-hop>\n\nSimon Willison’s Weblog Subscribe ...,The advancements in large language models (LLM...,"In 2024, significant advancements in large lan...",multi_hop_specific_query_synthesizer
9,How does the efficiency and cost-effectiveness...,[Today $30/mTok gets you OpenAI’s most expensi...,[<1-hop>\n\nreasoning patterns. Another common...,GPT-4o is significantly more efficient and cos...,GPT-4o is significantly more cost-effective co...,multi_hop_specific_query_synthesizer


In [110]:
ft_dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,What role did OpenAI play in the development o...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[Gullibility is the biggest unsolved problem C...,OpenAI played a significant role in the develo...,"A year ago, OpenAI was the only organization t...",single_hop_specifc_query_synthesizer
1,"Does ChatGPT get lazy in Decembur, and how doe...","[But on the other hand, the things you sometim...",[that they haven’t shared yet. Vibes Based Dev...,"Yes, ChatGPT may get ""lazy"" in December, as it...",The honest answer is 'maybe'! No-one is entire...,single_hop_specifc_query_synthesizer
2,What were the key developments in Large Langua...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[Simon Willison’s Weblog Subscribe Stuff we fi...,"In 2023, the key developments in Large Languag...",2023 was the breakthrough year for Large Langu...,single_hop_specifc_query_synthesizer
3,What insights did Simon Willison share about A...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[issues in a way that’s surprisingly easy to f...,"In 2023, Simon Willison highlighted several ke...",Simon Willison shared insights on various AI d...,single_hop_specifc_query_synthesizer
4,Why is gullibility considered a major unsolved...,[Gullibility is the biggest unsolved problem\n...,[<1-hop>\n\nGullibility is the biggest unsolve...,Gullibility is considered a major unsolved pro...,Gullibility is considered a major unsolved pro...,multi_hop_abstract_query_synthesizer
5,How does the uneven distribution of knowledge ...,[There is so much space for helpful education ...,[<1-hop>\n\nPrompt driven app generation is a ...,The uneven distribution of knowledge in AI tec...,The uneven distribution of knowledge in AI tec...,multi_hop_abstract_query_synthesizer
6,How have advancements in large language models...,[Simon Willison’s Weblog\n\nSubscribe\n\nStuff...,[<1-hop>\n\nGullibility is the biggest unsolve...,Advancements in large language models (LLMs) h...,Advancements in large language models (LLMs) h...,multi_hop_abstract_query_synthesizer
7,What are the challenges in evaluating Large La...,[Code may be the best application\n\nThe ethic...,[<1-hop>\n\nGullibility is the biggest unsolve...,Evaluating Large Language Models (LLMs) presen...,Evaluating Large Language Models (LLMs) presen...,multi_hop_abstract_query_synthesizer
8,How did the advancements in large language mod...,[Key AI Developments in Early 2024\n\nPublishe...,[<1-hop>\n\nSimon Willison’s Weblog Subscribe ...,The advancements in large language models (LLM...,"In 2024, significant advancements in large lan...",multi_hop_specific_query_synthesizer
9,How does the efficiency and cost-effectiveness...,[Today $30/mTok gets you OpenAI’s most expensi...,[<1-hop>\n\nreasoning patterns. Another common...,GPT-4o is significantly more efficient and cos...,GPT-4o is significantly more cost-effective co...,multi_hop_specific_query_synthesizer


In [111]:
from ragas import EvaluationDataset

base_evaluation_dataset = EvaluationDataset.from_pandas(base_dataset.to_pandas())
ft_evaluation_dataset = EvaluationDataset.from_pandas(ft_dataset.to_pandas())

In [112]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

In [113]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

base_result = evaluate(
    dataset=base_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)

ft_result = evaluate(
    dataset=ft_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)

Evaluating:   0%|          | 0/72 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/72 [00:00<?, ?it/s]

In [114]:
base_result

{'context_recall': 0.7302, 'faithfulness': 0.8521, 'factual_correctness': 0.5267, 'answer_relevancy': 0.7904, 'context_entity_recall': 0.2991, 'noise_sensitivity_relevant': 0.2556}

In [115]:
ft_result

{'context_recall': 0.7778, 'faithfulness': 0.8532, 'factual_correctness': 0.5658, 'answer_relevancy': 0.7909, 'context_entity_recall': 0.3107, 'noise_sensitivity_relevant': 0.2628}