In [1]:
!pip install transformers sentence-transformers torch numpy pandas

Collecting transformers
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl.metadata (4.1 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Downloading hf_xet-1.1.10-cp37-abi3-macosx_11_0_arm64.whl.metadata (4.7 kB)
Downloading transformers-4.56.1-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.35.

In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Sample knowledge base as a pandas DataFrame
# In practice, this would be loaded from CSV files or a database
knowledge_base = pd.DataFrame({
    'content': [
        "The Horizon Growth Fund has an annual management fee of 0.75% and has delivered an average return of a 8.4% over the past five years.",
        "Our Tax-Advantaged Retirement Account offers tax-deferred growth and allows annual contributions up to $22,500 for 2023.",
        "The Income Protection Insurance plan covers up to 70% of your monthly income if you're unable to work due to illness or injury.",
        "Our Wealth Management service requires a minimum investment of $250,000 and provides personalized portfolio management.",
        "The Fixed Income Bond Fund maintains an average credit rating of AA and aims for capital preservation with moderate income."
    ],
    'metadata': [
        {'source': 'product_catalog', 'category': 'investment', 'last_updated': '2023-09-01'},
        {'source': 'retirement_guide', 'category': 'retirement', 'last_updated': '2023-08-15'},
        {'source': 'insurance_brochure', 'category': 'insurance', 'last_updated': '2023-07-20'},
        {'source': 'services_overview', 'category': 'wealth_management', 'last_updated': '2023-09-10'},
        {'source': 'product_catalog', 'category': 'investment', 'last_updated': '2023-08-30'}
    ]
})


print(f"Knowledge base loaded with {len(knowledge_base)} entries")

Knowledge base loaded with 5 entries


In [4]:
# Initialize the embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


# Generate embeddings for all documents
document_embeddings = embedding_model.encode(knowledge_base['content'].tolist(), 
                                            show_progress_bar=True)


print(f"Generated embeddings with shape: {document_embeddings.shape}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape: (5, 768)


In [5]:
def retrieve_documents(query, embeddings, contents, top_k=2, threshold=0.3):
    """
    Retrieve the most relevant documents for a given query.
    
    Args:
        query: The user's question or request
        embeddings: The precomputed document embeddings
        contents: The text content of the documents
        top_k: Maximum number of documents to retrieve
        threshold: Minimum similarity score to include a document
        
    Returns:
        List of (content, similarity_score) tuples
    """
    # Embed the query
    query_embedding = embedding_model.encode([query])[0]
    
    # Calculate similarity scores
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    
    # Filter by threshold and get top k results
    filtered_indices = [i for i, score in enumerate(similarities) if score >= threshold]
    top_indices = sorted(filtered_indices, key=lambda i: similarities[i], reverse=True)[:top_k]
    
    # Return the top documents with their scores
    results = [(contents[i], similarities[i]) for i in top_indices]
    
    return results

In [6]:
# Load the pretrained model and tokenizer
model_name = "gpt2"  # In production, you'd use a more powerful model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


# Set pad token for batch processing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
def generate_rag_response(query, knowledge_base_contents, document_embeddings, 
                         tokenizer, model, max_length=100):
    """
    Generate a response using Retrieval-Augmented Generation.
    
    Args:
        query: The user's question
        knowledge_base_contents: List of document contents
        document_embeddings: Precomputed embeddings for the documents
        tokenizer: The tokenizer for the language model
        model: The language model for generation
        max_length: Maximum response length
        
    Returns:
        The generated response and the retrieved documents
    """
    # Retrieve relevant documents
    retrieved_docs = retrieve_documents(
        query, 
        document_embeddings, 
        knowledge_base_contents, 
        top_k=2
    )
    
    if not retrieved_docs:
        # If no relevant documents found, generate without context
        prompt = f"Question: {query}\nAnswer:"
    else:
        # Format prompt with retrieved context
        context = "\n".join([f"- {doc[0]}" for doc in retrieved_docs])
        prompt = f"Context information:\n{context}\n\nQuestion: {query}\nAnswer:"
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    
    # Generate the response
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=len(inputs["input_ids"][0]) + max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode the response
    response = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    response = response.replace(prompt, "").strip()
    
    return {
        "query": query,
        "response": response,
        "retrieved_documents": [(doc[0], doc[1]) for doc in retrieved_docs]
    }

In [8]:
# Extract content from knowledge base
kb_contents = knowledge_base['content'].tolist()


# Test with different queries
test_queries = [
    "What is the annual fee for the Horizon Growth Fund?",
    "How much can I contribute to the retirement account yearly?",
    "What are the requirements for wealth management services?",
    "Tell me about investment options with low risk"
]


# Process each query
for query in test_queries:
    print(f"\nQuery: {query}")
    result = generate_rag_response(
        query, 
        kb_contents, 
        document_embeddings, 
        tokenizer, 
        model
    )
    
    print("\nRetrieved Documents:")
    for i, (doc, score) in enumerate(result["retrieved_documents"]):
        print(f"{i+1}. [{score:.4f}] {doc[:100]}...")
    
    print(f"\nGenerated Response:\n{result['response']}")
    print("-" * 80)


Query: What is the annual fee for the Horizon Growth Fund?

Retrieved Documents:
1. [0.8238] The Horizon Growth Fund has an annual management fee of 0.75% and has delivered an average return of...
2. [0.5575] Our Tax-Advantaged Retirement Account offers tax-deferred growth and allows annual contributions up ...

Generated Response:
We have a 2.5% annual return on our investment.

We are a private equity company, with a portfolio of more than $100 billion.

We invest in the U.S. government through our public sector, which is a nonprofit.

The Horizon Growth Fund is one of the largest private equity funds in the U.S.

As a Private Equity Fund, we have a net profit margin of 0.5%, which is very close to our
--------------------------------------------------------------------------------

Query: How much can I contribute to the retirement account yearly?

Retrieved Documents:
1. [0.5807] Our Tax-Advantaged Retirement Account offers tax-deferred growth and allows annual contributions up ..

In [9]:
def evaluate_response(response_data, evaluation_criteria=None):
    """
    Evaluate the quality of a generated response based on various criteria.
    
    Args:
        response_data: Dictionary containing the query, response, and retrieved docs
        evaluation_criteria: Optional custom evaluation functions
        
    Returns:
        Evaluation metrics
    """
    if evaluation_criteria is None:
        # Default evaluation - check if response mentions content from retrieved docs
        retrieved_content = [doc[0].lower() for doc in response_data["retrieved_documents"]]
        response_lower = response_data["response"].lower()
        
        # Simple content overlap check
        content_overlap = sum(1 for doc in retrieved_content if any(
            term in response_lower for term in doc.split()[:5]
        )) / max(1, len(retrieved_content))
        
        # Length appropriateness (simple heuristic)
        query_words = len(response_data["query"].split())
        response_words = len(response_data["response"].split())
        length_score = min(1.0, response_words / (query_words * 3))
        
        return {
            "content_overlap": content_overlap,
            "length_score": length_score,
            "overall_score": (content_overlap + length_score) / 2
        }
    else:
        # Custom evaluation logic would go here
        pass

In [10]:
# Evaluate each generated response
for query in test_queries:
    result = generate_rag_response(
        query, 
        kb_contents, 
        document_embeddings, 
        tokenizer, 
        model
    )
    
    eval_metrics = evaluate_response(result)
    print(f"\nQuery: {query}")
    print(f"Evaluation Metrics: {eval_metrics}")


Query: What is the annual fee for the Horizon Growth Fund?
Evaluation Metrics: {'content_overlap': 1.0, 'length_score': 1.0, 'overall_score': 1.0}

Query: How much can I contribute to the retirement account yearly?
Evaluation Metrics: {'content_overlap': 1.0, 'length_score': 1.0, 'overall_score': 1.0}

Query: What are the requirements for wealth management services?
Evaluation Metrics: {'content_overlap': 1.0, 'length_score': 1.0, 'overall_score': 1.0}

Query: Tell me about investment options with low risk
Evaluation Metrics: {'content_overlap': 1.0, 'length_score': 1.0, 'overall_score': 1.0}
