In [None]:
# Index passages: embed each passage with the DPR context encoder (stored in Milvus) and index its raw text in Elasticsearch for BM25.
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType
from elasticsearch import Elasticsearch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import torch

# Connect to Milvus (vector store) and Elasticsearch (BM25 text index).
connections.connect("default", host="localhost", port="19530")
# A full URL works with both the 7.x and 8.x Python clients; the 8.x client
# rejects host dicts that do not carry an explicit "scheme" entry.
es = Elasticsearch("http://localhost:9200")

# Load the DPR context encoder and its tokenizer (same checkpoint for both).
context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

# Define Milvus collection schema: a caller-supplied INT64 primary key plus
# one float vector per passage.
dim = 768  # DPR embedding dimension
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim)
]
schema = CollectionSchema(fields, "DPR passages collection")
collection = Collection("passages", schema)

# Create index in Milvus. The metric is inner product ("IP"); searches against
# this field must use the same metric type.
index_params = {
    "metric_type": "IP",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024}
}
collection.create_index("embedding", index_params)

def encode_passage(passage):
    """Embed a single passage with the DPR context encoder.

    The passage is tokenized with ``max_length=512``, truncation enabled and
    padding to ``max_length``, then passed through ``context_encoder`` with
    gradient tracking disabled.

    Returns:
        The pooled output for the first (and only) batch row, as a NumPy array.
    """
    tokenizer_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    encoded = context_tokenizer(passage, **tokenizer_options)
    with torch.no_grad():
        model_output = context_encoder(**encoded)
    pooled = model_output.pooler_output
    return pooled[0].numpy()

def index_document(doc_id, content):
    """Store one document in both search backends under the same id.

    The raw text is written to the Elasticsearch index "documents" first
    (for BM25); the text is then embedded with ``encode_passage`` and the
    (id, embedding) row is inserted into the Milvus ``collection``.
    """
    # Sparse side: Elasticsearch keeps the text in the "content" field.
    es_payload = {"content": content}
    es.index(index="documents", id=doc_id, body=es_payload)

    # Dense side: Milvus takes column-oriented data, one list per schema field.
    vector = encode_passage(content).tolist()
    id_column = [doc_id]
    embedding_column = [vector]
    collection.insert([id_column, embedding_column])

# Example usage: a tiny sample corpus, one dict per document.
documents = [
    dict(id=1, content="The capital of France is Paris."),
    dict(id=2, content="The Eiffel Tower is located in Paris."),
    # ... more documents ...
]

# Push every sample document into Elasticsearch and Milvus.
for doc in documents:
    index_document(doc_id=doc["id"], content=doc["content"])

# Flush once, after the whole batch has been inserted.
collection.flush()

In [None]:
# Query expansion: use T5 to generate alternative phrasings and keywords for a query.
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Checkpoint name shared by the tokenizer and the model loaded below.
model_name = 't5-base'
# Tokenizer used by expand_query_with_keywords to encode prompts and decode generated ids.
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Seq2seq model whose generate() produces the expanded queries and the keywords.
# NOTE(review): this is the stock checkpoint; whether it responds usefully to the
# "expand query:" / "generate keywords for:" prompts should be confirmed.
model = T5ForConditionalGeneration.from_pretrained(model_name)

def expand_query_with_keywords(query, num_expansions=3, num_keywords=5):
    """Expand a query with generated rephrasings and keywords.

    Args:
        query: The original query string.
        num_expansions: How many expanded queries to generate. It is also used
            as the beam width for that generation pass. Values below 1 skip
            the expansion pass entirely.
        num_keywords: Beam width of the keyword generation pass (it does not
            cap how many keywords come back). Values below 1 skip keyword
            generation entirely.

    Returns:
        A list of strings: the original query followed by every expanded
        query, each with the generated keywords appended after a single
        space. When no keywords were produced the queries are returned
        unchanged, without trailing whitespace.
    """
    # Generate expanded queries
    expanded_queries = []
    if num_expansions >= 1:
        input_ids = tokenizer(f"expand query: {query}", return_tensors="pt").input_ids
        # Plain beam search is deterministic; `temperature` only takes effect
        # when sampling (do_sample=True), so it is deliberately not passed.
        outputs = model.generate(
            input_ids,
            max_length=50,
            num_return_sequences=num_expansions,
            num_beams=num_expansions,
        )
        expanded_queries = [
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs
        ]

    # Generate keywords
    keywords = []
    if num_keywords >= 1:
        keyword_ids = tokenizer(
            f"generate keywords for: {query}", return_tensors="pt"
        ).input_ids
        keyword_outputs = model.generate(
            keyword_ids,
            max_length=30,
            num_return_sequences=1,
            num_beams=num_keywords,
        )
        # Whitespace-split the single generated sequence into keyword tokens.
        keywords = tokenizer.decode(keyword_outputs[0], skip_special_tokens=True).split()

    # Combine original query, expanded queries, and keywords
    final_queries = [query] + expanded_queries
    keyword_suffix = " ".join(keywords)
    if keyword_suffix:
        final_queries = [f"{q} {keyword_suffix}" for q in final_queries]

    return final_queries

# Example usage: expand one sample question and print every variant.
original_query = "What is the capital of France?"
# The returned list starts with the original query, followed by the generated expansions.
expanded_queries = expand_query_with_keywords(original_query)
print("Original query:", original_query)
print("Expanded queries with keywords:")
# enumerate() is zero-based; i+1 gives a 1-based number in the printed list.
for i, q in enumerate(expanded_queries):
    print(f"{i+1}. {q}")

In [None]:
# Hybrid retrieval (BM25 + DPR) followed by cross-encoder reranking.
import numpy as np
import torch
from elasticsearch import Elasticsearch
from pymilvus import connections, Collection
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import DPRQuestionEncoder, DPRContextEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer

# Connect to Milvus and Elasticsearch
connections.connect("default", host="localhost", port="19530")
# A full URL works with both the 7.x and 8.x Python clients; the 8.x client
# rejects host dicts that do not carry an explicit "scheme" entry.
es = Elasticsearch("http://localhost:9200")

# Load models: DPR question/context encoders with their tokenizers.
question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
# The ms-marco checkpoint is a cross-encoder, so it has to be loaded with
# CrossEncoder, which provides predict() over (query, passage) pairs.
# SentenceTransformer has no predict() method, which rerank() relies on.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# The indexing cell stores embeddings in the Milvus collection "passages";
# the raw text lives in the Elasticsearch index "documents".
milvus_collection = Collection("passages")
# Milvus only serves searches on collections that have been loaded into memory.
milvus_collection.load()

def encode_query(query):
    """Embed a query with the DPR question encoder.

    Args:
        query: The question text.

    Returns:
        The encoder's pooled output for the single input, as a NumPy array.
    """
    # Truncate to 512 tokens, the same limit encode_passage applies, so an
    # over-long query is cut instead of overflowing the encoder's input size.
    input_ids = question_tokenizer(
        query, return_tensors='pt', max_length=512, truncation=True
    )['input_ids']
    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        embeddings = question_encoder(input_ids).pooler_output
    return embeddings[0].numpy()

def encode_passage(passage):
    """Embed a single passage with the DPR context encoder.

    The passage is tokenized with truncation at ``max_length=512`` (no
    padding) and only the token ids are fed to ``context_encoder``, with
    gradient tracking disabled.

    Returns:
        The pooled output for the first (and only) batch row, as a NumPy array.
    """
    tokenized = context_tokenizer(passage, return_tensors='pt', max_length=512, truncation=True)
    token_ids = tokenized['input_ids']
    with torch.no_grad():
        encoder_output = context_encoder(token_ids)
    return encoder_output.pooler_output[0].numpy()

def dpr_search(query, top_k=100):
    """Run a dense (DPR) search against the Milvus collection.

    The query is embedded with ``encode_query`` and searched against the
    "embedding" field using the inner-product metric with ``nprobe=10``.

    Returns:
        A list of ``(id, score)`` tuples for up to ``top_k`` hits, in the
        order Milvus returned them.
    """
    query_embedding = encode_query(query).tolist()
    # One query vector goes in, so only the first result set is of interest.
    hits = milvus_collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param={"metric_type": "IP", "params": {"nprobe": 10}},
        limit=top_k,
        output_fields=["id"],
    )[0]

    ranked = []
    for hit in hits:
        ranked.append((hit.entity.get('id'), hit.score))
    return ranked

def bm25_search(query, top_k=100):
    """Run a lexical search against the Elasticsearch index "documents".

    Issues a ``match`` query on the "content" field, limited to ``top_k`` hits.

    Returns:
        A list of ``(_id, _score)`` tuples in the order Elasticsearch
        returned them.
    """
    request_body = {
        "query": {"match": {"content": query}},
        "size": top_k,
    }
    response = es.search(index="documents", body=request_body)

    matches = []
    for hit in response['hits']['hits']:
        matches.append((hit['_id'], hit['_score']))
    return matches

def _min_max_normalize(scored):
    """Map (doc_id, score) pairs to {str(doc_id): score rescaled into [0, 1]}."""
    if not scored:
        return {}
    values = [score for _, score in scored]
    low = min(values)
    span = max(values) - low
    normalized = {}
    for doc_id, score in scored:
        # With a single hit (or all-equal scores) there is no spread to
        # rescale; give those hits full credit instead of dividing by zero.
        value = (score - low) / span if span > 0 else 1.0
        key = str(doc_id)
        # Keep the best score if a backend reports the same id twice.
        if key not in normalized or value > normalized[key]:
            normalized[key] = value
    return normalized


def hybrid_search(query, top_k=100, alpha=0.5):
    """Fuse BM25 and DPR rankings into one scored list.

    Args:
        query: The search text, sent to both ``bm25_search`` and ``dpr_search``.
        top_k: Number of hits requested from each backend and the maximum
            number of fused results returned.
        alpha: Weight of the BM25 score; the DPR score is weighted ``1 - alpha``.

    Returns:
        Up to ``top_k`` ``(doc_id, combined_score)`` tuples sorted by
        descending score, ties broken by id. Ids are returned as strings:
        Elasticsearch reports ``_id`` as a string while the Milvus primary key
        is an integer, so both are normalised to ``str`` so that the same
        document found by both backends is merged rather than counted twice.
    """
    # Raw BM25 scores and inner-product scores live on unrelated scales, so
    # each list is min-max normalised to [0, 1] before the weighted sum.
    # Note that the weakest hit of a list normalises to 0, the same value a
    # document missing from that list receives.
    bm25_scores = _min_max_normalize(bm25_search(query, top_k))
    dpr_scores = _min_max_normalize(dpr_search(query, top_k))

    combined_scores = {
        doc_id: alpha * bm25_scores.get(doc_id, 0.0)
        + (1 - alpha) * dpr_scores.get(doc_id, 0.0)
        for doc_id in bm25_scores.keys() | dpr_scores.keys()
    }

    # Sorting on (-score, id) makes the order of tied documents deterministic.
    results = sorted(combined_scores.items(), key=lambda item: (-item[1], item[0]))
    return results[:top_k]

def rerank(query, documents, top_k=10):
    """Re-score documents against the query with the cross-encoder.

    Args:
        query: The query text.
        documents: Iterable of candidate document texts.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` ``(document, score)`` tuples sorted by descending
        cross-encoder score; an empty list when there are no documents.
    """
    # Materialise once: the candidates are iterated twice (pair building and
    # zipping with the scores), which would exhaust a generator.
    documents = list(documents)
    if not documents:
        # Nothing to score; do not hand an empty batch to the model.
        return []

    pairs = [[query, doc] for doc in documents]
    scores = cross_encoder.predict(pairs)
    scored_docs = list(zip(documents, scores))
    return sorted(scored_docs, key=lambda pair: pair[1], reverse=True)[:top_k]

def _fetch_document_texts(doc_ids):
    """Return the stored "content" text for each id found in the Elasticsearch index "documents"."""
    # Elasticsearch ids are strings while Milvus ids are integers; normalise
    # to str and drop repeats while keeping the ranking order.
    unique_ids = list(dict.fromkeys(str(doc_id) for doc_id in doc_ids))
    if not unique_ids:
        return []
    response = es.mget(index="documents", body={"ids": unique_ids})
    # Ids that are not in the index come back with found=False; skip them.
    return [
        doc["_source"]["content"]
        for doc in response["docs"]
        if doc.get("found") and "content" in doc.get("_source", {})
    ]


def retrieve_and_rerank(query, top_k=10):
    """Retrieve candidates with hybrid search and rerank them.

    Args:
        query: The query text.
        top_k: Number of reranked results to return. Twice as many candidates
            are requested from ``hybrid_search`` to give the reranker some
            headroom.

    Returns:
        The list of ``(document_text, score)`` tuples produced by ``rerank``;
        an empty list when the search finds nothing or none of the hits has
        stored text.
    """
    # Hybrid search
    search_results = hybrid_search(query, top_k=top_k*2)
    if not search_results:
        return []

    # Fetch full documents. The text is read back from Elasticsearch, which is
    # where index_document stores it under the "content" field.
    top_doc_texts = _fetch_document_texts([doc_id for doc_id, _ in search_results])
    if not top_doc_texts:
        return []

    # Rerank
    reranked_docs = rerank(query, top_doc_texts, top_k)

    return reranked_docs

# Example usage: run the full retrieve-then-rerank pipeline for one question.
query = "What is the capital of France?"
# results holds the (document, score) pairs returned by the reranking step.
results = retrieve_and_rerank(query)
for doc, score in results:
    # Only the first 100 characters of each document are printed.
    print(f"Score: {score:.4f}, Document: {doc[:100]}...")