In [2]:
import argparse
import os
import torch
import gc
import pandas as pd
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.chains import create_retrieval_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain import PromptTemplate



  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Embedding model used to query the persisted Chroma index in ./chroma_db.
# Alternative tried earlier: "sentence-transformers/all-mpnet-base-v2".
# NOTE(review): this presumably has to match the model the index was built with — confirm.

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# cell does not fail on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": embedding_device},
)
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

In [4]:
def batch_inference(qa_chain, questions, batch_size, ground_truths=None):
    """
    Run every question through ``qa_chain`` in batches and collect the answers.

    Args:
        qa_chain: Object exposing ``batch(list_of_inputs)`` and
            ``invoke(single_input)``. Each input is a dict ``{"query": question}``.
        questions: Sequence of questions (must support ``len`` and slicing).
        batch_size: Number of questions sent to the chain per step; must be >= 1.
            With a batch size of 1 the chain's ``invoke`` is used, otherwise ``batch``.
        ground_truths: Optional sequence of reference answers aligned with
            ``questions``. When provided and non-empty, metrics from
            ``calculate_metrics`` are printed for each batch.

    Returns:
        list: One prediction per question, in input order. For dict outputs the
        value stored under ``"result"`` is used (the whole dict when that key is
        missing); any other output is kept as-is.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if ``ground_truths``
            is provided with a different length than ``questions``.
    """
    # Fail fast with a clear message: a zero batch size would otherwise surface
    # as a ZeroDivisionError and a negative one would silently return [].
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # Check alignment before any inference work is done, not halfway through.
    if ground_truths and len(ground_truths) != len(questions):
        raise ValueError(
            f"ground_truths has {len(ground_truths)} items but questions has {len(questions)}"
        )

    predictions = []
    num_batches = (len(questions) + batch_size - 1) // batch_size

    for batch_number, start in enumerate(range(0, len(questions), batch_size), start=1):
        stop = start + batch_size
        batch_inputs = [{"query": q} for q in questions[start:stop]]

        # Inference only: no gradients are needed.
        with torch.no_grad():
            if batch_size > 1:
                batch_outputs = qa_chain.batch(batch_inputs)
            else:
                batch_outputs = [qa_chain.invoke(batch_inputs[0])]
        # Release memory between batches.
        torch.cuda.empty_cache()
        gc.collect()

        # Chain outputs may be dicts (answer under "result") or plain values.
        batch_predictions = [
            output.get("result", output) if isinstance(output, dict) else output
            for output in batch_outputs
        ]
        predictions.extend(batch_predictions)

        # If ground truths are provided, compute and print batch metrics.
        if ground_truths:
            batch_ground_truths = ground_truths[start:stop]
            batch_metrics = calculate_metrics(batch_predictions, batch_ground_truths)
            print(f"Batch {batch_number}/{num_batches} Metrics:")
            for key, value in batch_metrics.items():
                print(f"  {key}: {value}")
            print("-" * 40)

    return predictions
def remove_punctuation(sentence):
    """
    Drop a single trailing period from ``sentence``.

    Despite the name, only one final "." is removed; all other punctuation is
    left untouched.

    Args:
        sentence: Text to trim.

    Returns:
        str: ``sentence`` without its last character when that character is
        ".", otherwise ``sentence`` unchanged (including the empty string).
    """
    # endswith() is safe on empty input, where indexing sentence[-1] raises IndexError.
    if sentence.endswith("."):
        return sentence[:-1]
    return sentence

def calculate_metrics(predictions, ground_truths):
    """
    Score predictions against reference answers with exact match and token overlap.

    Each prediction/reference pair is lower-cased. Exact match compares the
    lower-cased strings. Recall and precision are computed on the *sets* of
    whitespace-separated tokens, so repeated words count once and punctuation
    stays attached to its word ("yes" and "yes," are different tokens).

    The reported F1 is the harmonic mean of the averaged precision and averaged
    recall, not the mean of per-example F1 scores.

    Args:
        predictions: Sequence of predicted answer strings.
        ground_truths: Sequence of reference answer strings, same length as
            ``predictions``.

    Returns:
        dict: ``recall``, ``f1``, ``exact_match`` (averages over all examples),
        ``exact_match_count``, ``total_examples``, ``total_recall`` and
        ``total_precision`` (un-averaged sums). For empty input every value is zero.

    Raises:
        AssertionError: If the two sequences differ in length.
    """
    # Ensure the lengths match
    assert len(predictions) == len(ground_truths), "Lengths of predictions and ground truths must match."
    total_examples = len(predictions)

    # Nothing to score: return zeroed metrics instead of dividing by zero below.
    if total_examples == 0:
        return {
            "recall": 0.0,
            "f1": 0,
            "exact_match": 0.0,
            "exact_match_count": 0,
            "total_examples": 0,
            "total_recall": 0.0,
            "total_precision": 0.0
        }

    exact_match_count = 0
    total_recall = 0.0
    total_precision = 0.0

    for pred, gt in zip(predictions, ground_truths):
        pred = pred.lower()
        gt = gt.lower()
        # Count exact matches (case-insensitive)
        if pred == gt:
            exact_match_count += 1

        # Tokenize on whitespace; sets ignore duplicate tokens
        pred_tokens = set(pred.split())
        gt_tokens = set(gt.split())
        common_tokens = pred_tokens & gt_tokens

        # An empty side contributes 0 rather than dividing by zero
        total_recall += len(common_tokens) / len(gt_tokens) if gt_tokens else 0
        total_precision += len(common_tokens) / len(pred_tokens) if pred_tokens else 0

    # Compute average recall and precision
    avg_recall = total_recall / total_examples
    avg_precision = total_precision / total_examples

    # F1 as the harmonic mean of the averaged precision and recall
    f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
    exact_match = exact_match_count / total_examples

    return {
        "recall": avg_recall,
        "f1": f1_score,
        "exact_match": exact_match,
        "exact_match_count": exact_match_count,
        "total_examples": total_examples,
        "total_recall": total_recall,
        "total_precision": total_precision
    }
def extract_answer(answer):
    """
    Pull the answer text out of generated output that still contains the prompt.

    Looks for the first "Helpful Answer:" marker (matched case-insensitively),
    takes everything after it, trims surrounding whitespace and returns the
    first line of what remains.

    Args:
        answer: Full text produced by the chain.

    Returns:
        str: The first line following the marker, or the literal
        "Extract failed" when the marker does not occur.
    """
    import re

    # DOTALL lets the capture group span newlines; the first line is picked afterwards.
    marker_pattern = re.compile(r"(?i)Helpful Answer:\s*(.*)", re.DOTALL)
    found = marker_pattern.search(answer)
    if found is None:
        return "Extract failed"

    text_after_marker = found.group(1).strip()
    first_line, _, _ = text_after_marker.partition("\n")
    return first_line

In [5]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device {device}")

Using device cuda


In [6]:
# Settings for this run.
topk = 3  # number of chunks the retriever is asked for per question
generation_model_name = "Qwen/Qwen2-7B-Instruct"  # seq2seq alternative tried earlier: "google/flan-t5-base"
ground_truths = None  # no reference answers loaded in this notebook run

# Retriever over the Chroma vector store created in an earlier cell.
retriever = vectorstore.as_retriever(search_kwargs={"k": topk})

# Load the causal LM in half precision and let `device_map="auto"` place it.
tokenizer = AutoTokenizer.from_pretrained(generation_model_name)
model = AutoModelForCausalLM.from_pretrained(
    generation_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.eval()  # evaluation mode: the model is used for generation only

# Wrap the model in a text-generation pipeline so LangChain can call it as an LLM.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
)
llm = HuggingFacePipeline(pipeline=pipe)



Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.80s/it]
Device set to use cuda:0


In [7]:
# Instructions placed at the top of every prompt sent to the LLM.
ans_req = (
    "Answer the question based only on the provided context in just one sentence. "
    "Each answer must be extremely succinct—limited to just several keywords—and should not repeat the question."
)
yes_or_no_req = (
    "If a question is a yes or no question, the answer must be exactly 'yes' or 'no' "
    "without any additional information."
)
instruction_prompt = f"{ans_req} {yes_or_no_req}"

# {context} and {question} are left as literal placeholders for PromptTemplate
# to fill in; the prompt ends with the "Helpful Answer:" marker.
template = "\n".join(
    [
        instruction_prompt,
        "Context: {context}",
        "Question: {question}",
        "Helpful Answer:",
    ]
)

prompt_template = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

# "stuff" chain: retrieved documents are inserted into the prompt's context slot.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt_template},
)

In [8]:
# Preview what the vector store returns for a sample question: first the
# metadata of every hit, then the text of every hit.
question = "What programs are available through City Cuts community program in Pittsburgh?"
relevant_docs = vectorstore.similarity_search(question, k=3)
for label, attribute in (("file", "metadata"), ("content", "page_content")):
    for doc in relevant_docs:
        print(f"{label}: {getattr(doc, attribute)}")
    

file: {'source': '/home/ubuntu/ThreeRiversRAG/data/retrieve_source/total_web_txt/275.txt'}
file: {'source': '/home/ubuntu/ThreeRiversRAG/data/retrieve_source/total_web_txt/275.txt'}
file: {'source': '/home/ubuntu/ThreeRiversRAG/data/retrieve_source/crawled_pdf_data/23255_2024_Operating_Budget.txt'}
content: faq contact us sustainability sub-menu climate action plan resilient pittsburgh food systems sustainability & resilience library affiliations & memberships environmental planning and review search you are here : home / resident services / community programming / city cuts city cuts we are currently not taking applications from residents but the window will be opening soon! please check back the end of march/beginning of april city cuts is the city of pittsburgh’s lawn cutting program for our
content: find the different ways to apply by checking out the right side to "request city cuts service". we are currently looking for pittsburgh landscapers and contractors to participate in thi

In [26]:

# Run the sample question through the QA chain (batch size 1), then show the
# raw chain output followed by the answer text extracted from it.
predictions = batch_inference(
    qa_chain,
    questions=[question],
    batch_size=1,
    ground_truths=ground_truths or None,
)
raw_output = predictions[0]
print(raw_output)
print(extract_answer(raw_output))

Answer the question based only on the provided context in just one sentence. Each answer must be extremely succinct—limited to just several keywords—and should not repeat the question. If a question is a yes or no question, the answer must be exactly 'yes' or 'no' without any additional information.
Context: carnegie mellon university

scottish terrier [ 10 ] website cmu .edu carnegie mellon university ( cmu ) is a private research university in pittsburgh , pennsylvania, united states. the institution was established in 1900 by andrew carnegie as the carnegie technical schools . in 1912, it became the carnegie institute of technology and began granting four-year degrees. in 1967, it became carnegie mellon university through its merger with the mellon institute of industrial research , founded in 1913 by andrew mellon and

and athletics jared l. cohon university center swimming & diving pool tennis courts tepper fitness center varsity weight room visiting team brochure wiegand gym recr

In [83]:
# Sanity check of the metrics on a one-word answer versus a full-sentence reference.
ans = ["yes"]
gt = ["Yes, there's the Climate Action Plan in place."]
calculate_metrics(predictions=ans, ground_truths=gt)

yes
yes, there's the climate action plan in place


{'recall': 0.0,
 'f1': 0,
 'exact_match': 0.0,
 'exact_match_count': 0,
 'total_examples': 1,
 'total_recall': 0.0,
 'total_precision': 0.0}

In [19]:
from langchain import hub

# Fetch a community RAG prompt from the LangChain hub and print it for inspection.
LLAMA_RAG_PROMPT_REPO = "yeyuan/rag-prompt-llama"
prompt = hub.pull(LLAMA_RAG_PROMPT_REPO)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'yeyuan', 'lc_hub_repo': 'rag-prompt-llama', 'lc_hub_commit_hash': '89712f4ba006ef101de75372b01a7bdc9e7184bf681f221070d86243c0a15772'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use 50 words maximum and keep the answer concise.<</SYS>> \nQuestion: {question} \nContext: {context} \nAnswer: [/INST]"), additional_kwargs={})]


