### Import the Libraries

In [None]:
import os
import shutil
from langchain.text_splitter import RecursiveCharacterTextSplitter
import numpy as np
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from datasets import load_dataset
import evaluate
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores.faiss import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

FAISS index path

In [18]:
# Absolute location of the FAISS index directory, resolved against the current working directory.
FAISS_INDEX_PATH = os.path.abspath("faiss_index")

### Load SQuAD Dataset (v1.1 preview; the evaluation below uses SQuAD 2.0)

In [2]:
from datasets import load_dataset

# Load SQuAD, then preview which splits exist and what one training record looks like.
squad = load_dataset("squad")
split_names = squad.keys()
first_train_record = squad["train"][0]
print(split_names)
print(first_train_record)

dict_keys(['train', 'validation'])
{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


### Baseline Model Evaluations on SQUAD

In [None]:
def _resolve_inference_device(device=None):
    """Return the device string to pass to the Hugging Face pipeline.

    An explicit ``device`` is returned unchanged. When ``device`` is None the
    first available backend is chosen in the order Apple MPS, CUDA, CPU.
    """
    if device is not None:
        return device

    # Imported lazily so torch is only touched when auto-detection is requested.
    import torch

    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def evaluate_model_on_squad_v2(model_name, num_samples=50, device=None):
    """Score a seq2seq model on the first ``num_samples`` SQuAD 2.0 validation examples.

    Each example is turned into a "Context / Question / Answer" prompt, the
    generated text is used as the predicted answer, and the predictions are
    scored with the ``squad_v2`` metric.

    Args:
        model_name: Model id loaded with ``AutoTokenizer`` and ``AutoModelForSeq2SeqLM``.
        num_samples: Number of examples taken from the start of the validation split.
        device: Device passed to the pipeline (e.g. ``"mps"``, ``"cuda"``, ``"cpu"``).
            When None, an available device is picked automatically.

    Returns:
        The dictionary returned by the ``squad_v2`` metric's ``compute`` call.
    """
    print(f"\nEvaluating model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    qa_pipeline = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        device=_resolve_inference_device(device),
    )

    squad_dataset = load_dataset("squad_v2", split=f"validation[:{num_samples}]")
    squad_metric = evaluate.load("squad_v2")

    predictions = []
    references = []

    for sample in squad_dataset:
        context = sample["context"]
        question = sample["question"]

        prompt = f"Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"
        response = qa_pipeline(prompt, max_new_tokens=64)[0]["generated_text"]

        predictions.append({
            "id": sample["id"],
            "prediction_text": response.strip(),
            # The generated text is always submitted as the answer, so the
            # no-answer probability is fixed at 0.0.
            "no_answer_probability": 0.0
        })
        references.append({
            "id": sample["id"],
            "answers": sample["answers"]
        })

    results = squad_metric.compute(predictions=predictions, references=references)

    # Check for None explicitly: a legitimate score of 0.0 is falsy, and a
    # missing key must not reach the float format spec below as None.
    exact = results.get("exact_match")
    if exact is None:
        exact = results.get("exact", 0.0)
    f1 = results.get("f1", 0.0)
    print(f"Exact Match (EM): {exact:.2f}")
    print(f"F1 Score: {f1:.2f}")

    return results

In [None]:
# Checkpoints to benchmark; each one goes through the same SQuAD 2.0 routine.
baseline_models = [
    "google/flan-t5-base",
    "declare-lab/flan-alpaca-base",
    "allenai/unifiedqa-t5-base",
]

# Maps each checkpoint name to the metric dictionary returned for it.
results_model = {}

for checkpoint in baseline_models:
    checkpoint_metrics = evaluate_model_on_squad_v2(checkpoint)
    results_model[checkpoint] = checkpoint_metrics


Evaluating model: google/flan-t5-base


Device set to use mps


Exact Match (EM): 38.00
F1 Score: 39.90

Evaluating model: declare-lab/flan-alpaca-base


Device set to use mps


Exact Match (EM): 16.00
F1 Score: 25.84

Evaluating model: allenai/unifiedqa-t5-base


Device set to use mps


Exact Match (EM): 26.00
F1 Score: 32.33


We run evaluations on 50 samples, with the flan-t5-base model performing the best, as expected, since it's instruction-tuned on a wide variety of tasks.

### VectorDB setup and Embedding Generation

We load the files from the "Documents" folder as text and split them into chunks, which will later be stored as embeddings in the vector DB.

In [19]:
def get_text_chunks_langchain(folder_name, chunk_size=1000, chunk_overlap=250):
    """Load the files under ``folder_name`` and split them into overlapping chunks.

    Files are matched recursively with the glob ``**/*.*`` and read with
    ``TextLoader``.
    NOTE(review): binary formats such as PDF are presumably not parsed by
    ``TextLoader`` — confirm the folder only holds plain-text files.

    Args:
        folder_name: Directory to scan for documents.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` on the loaded documents.
    """
    loader = DirectoryLoader(
        folder_name,
        glob="**/*.*",
        loader_cls=TextLoader,
        recursive=True,
        use_multithreading=True,
    )
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)

Each chunk is vectorized into an embedding using the Sentence Transformer and then stored into the FAISS VectorDB

In [None]:
def process_chunks(chunks):
    """Embed ``chunks`` with the all-MiniLM-L6-v2 model and return a FAISS store built from them."""
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_documents(chunks, embedder)

Chunk the data, convert them into embeddings and persist the FAISS storage index

In [None]:
def persist_data(folder_name):
    """Chunk the documents in ``folder_name``, embed them, and save the FAISS index to ``FAISS_INDEX_PATH``."""
    vector_store = process_chunks(get_text_chunks_langchain(folder_name))
    vector_store.save_local(FAISS_INDEX_PATH)

For a simple use case, we convert and store the final project questionnaire PDF.

In [None]:
# Build and save the index from everything under the Documents/ folder.
persist_data(folder_name="Documents/")

Deleting FAISS Path


### Retrieval-Augmented Generation (RAG)

In [28]:
# Template text for the RAG prompt. The `{context}` and `{question}` placeholders
# are the two input variables filled in when the template is rendered.
PROMPT = '''
You are a helpful question-answering assistant. Given the following context, answer the question as accurately and concisely as possible.
Context:
{context}
Question:
{question}
'''

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

def prompt(question, model_name):
    """Answer ``question`` with a retrieval-augmented chain over the saved FAISS index.

    Args:
        question: The user's question. It is passed to the retriever and also
            fills the ``{question}`` slot of the prompt template.
        model_name: Model id handed to the ``text2text-generation`` pipeline.

    Returns:
        The output of the chain after ``StrOutputParser``.
    """
    # Same embedding model name that process_chunks uses when building the index.
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk; only load an index that was created locally.
    vector_store = FAISS.load_local(
        FAISS_INDEX_PATH,
        embedder,
        allow_dangerous_deserialization=True,
    )

    # MMR search returning 4 chunks per query.
    retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 4})

    qa_template = PromptTemplate(template=PROMPT, input_variables=["context", "question"])

    generator = pipeline(
        task="text2text-generation",
        model=model_name,
        top_p=1,
        do_sample=True,
        temperature=0.7,
        max_length=512,
    )

    llm = HuggingFacePipeline(pipeline=generator, batch_size=1)

    # The question is routed both to the retriever (to build the context) and
    # straight through to the template.
    chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
    rag_chain = chain_inputs | qa_template | llm | StrOutputParser()

    return rag_chain.invoke(question)

We create a RAG chain that takes the user's question and retrieves the supporting context from the vector DB.

In [29]:
prompt(question="Who is the course instructor?", model_name="google/flan-t5-base")

Device set to use mps:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1041 > 512). Running this sequence through the model will result in indexing errors


'Alina Vereshchaka'

In [30]:
prompt(question="Who is the course instructor?", model_name="declare-lab/flan-alpaca-base")

Device set to use mps:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1041 > 512). Running this sequence through the model will result in indexing errors


'The course instructor is Alina Vereshchaka.'

In [31]:
prompt(question="Who is the course instructor?", model_name="allenai/unifiedqa-t5-base")

Device set to use mps:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1040 > 512). Running this sequence through the model will result in indexing errors


'project_checkpoint_TEAMMATE1_ TEAMMATE2.zip'

### LegalBench

We shall utilize the benchmark dataset below to evaluate our legally instruction-tuned KG-based LLM.

In [33]:
from datasets import load_dataset

# Load the `contract_qa` configuration of the LegalBench dataset.
legalbench = load_dataset(path="nguha/legalbench", name="contract_qa")

data.tar.gz:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/80 [00:00<?, ? examples/s]

In [34]:
# Preview the first rows of the train split as a DataFrame.
legalbench_train_df = legalbench["train"].to_pandas()
legalbench_train_df.head()

Unnamed: 0,answer,index,question,text
0,Yes,0,Does the clause discuss PII data breaches?,In the event of a data breach involving the un...
1,Yes,1,Does the clause discuss dispute resolution?,In the event of any dispute arising out of or ...
2,Yes,2,Does the clause describe confidentiality requi...,Each party agrees to keep confidential and not...
3,Yes,3,Does the clause discuss choice of law governin...,This Agreement shall be governed by and constr...
4,No,4,Does the clause waive confidentiality?,This Agreement shall be governed by and constr...


In [35]:
# Summarize the dataset: split names, train columns, and one raw example.
legalbench_train = legalbench["train"]
print("Available splits:", legalbench.keys())
print("Column names:", legalbench_train.column_names)
print("Example sample:\n", legalbench_train[0])

Available splits: dict_keys(['train', 'test'])
Column names: ['answer', 'index', 'question', 'text']
Example sample:
 {'answer': 'Yes', 'index': '0', 'question': 'Does the clause discuss PII data breaches?', 'text': 'In the event of a data breach involving the unauthorized access, use, or disclosure of personally identifiable information (PII), the Company shall notify without undue delay affected individuals and relevant regulatory authorities in accordance with applicable laws and regulations. The Company shall also take reasonable steps to mitigate the harm caused by the breach and to prevent future breaches.'}


### References 
* LegalBench - https://hazyresearch.stanford.edu/legalbench/getting-started/
* RAG - https://python.langchain.com/docs/tutorials/rag/
* RAG 2 - https://python.langchain.com/docs/tutorials/qa_chat_history/
* KG - https://python.langchain.com/docs/how_to/graph_constructing/
* Graph DB - https://python.langchain.com/docs/how_to/graph_semantic/
*  https://huggingface.co/declare-lab/flan-alpaca-base
* FAISS - https://github.com/facebookresearch/faiss
* https://www.anyscale.com/blog/turbocharge-langchain-now-guide-to-20x-faster-embedding