In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, GemmaTokenizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
 # Load embedding model and tokenizer
embedding_model_name = "D:/mr_document/all_models/gemma3/"  # Path to your embedding model

# Use the GPU when one is present, otherwise fall back to the CPU so this cell
# still runs on a machine without CUDA instead of raising inside `.to("cuda")`.
model_device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_tokenizer = GemmaTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModelForCausalLM.from_pretrained(embedding_model_name).to(model_device)

# Load Gemma 3 model and tokenizer
# The same checkpoint is used for both retrieval embeddings and generation, so
# these names are aliases of the objects above rather than a second copy.
gemma_tokenizer = embedding_tokenizer
gemma_model = embedding_model

In [None]:
# --- Document Retrieval (Simplified Example) ---
def retrieve_relevant_documents(query, documents, embedding_model, embedding_tokenizer, top_k=3, device='cuda'):
    """Return the documents most similar to ``query``, best match first.

    Each text is embedded with ``get_embedding`` and the documents are ranked by
    cosine similarity to the query embedding.

    Args:
        query (str): The user's query.
        documents (list): Candidate document strings.
        embedding_model: Model passed through to ``get_embedding``.
        embedding_tokenizer: Tokenizer passed through to ``get_embedding``.
        top_k (int): Maximum number of documents to return.
        device (str): Device passed through to ``get_embedding``.

    Returns:
        list: Up to ``top_k`` entries of ``documents`` in descending similarity
        order. Empty when ``documents`` is empty or ``top_k`` is not positive.
    """
    # Nothing to rank (or nothing requested): skip the model calls entirely.
    if len(documents) == 0 or top_k <= 0:
        return []

    # cosine_similarity requires 2D inputs. Each embedding is flattened so the
    # query becomes a single row and the documents stack into one row per
    # document, rather than a 3D list of (1, hidden_size) arrays.
    query_vector = get_embedding(query, embedding_model, embedding_tokenizer, device).reshape(1, -1)
    document_vectors = [
        get_embedding(doc, embedding_model, embedding_tokenizer, device).reshape(-1)
        for doc in documents
    ]

    # The result has one row (the query) and one column per document; keep the row.
    scores = cosine_similarity(query_vector, document_vectors)[0]

    # Sort descending first, then truncate, so top_k larger than the corpus
    # simply returns every document.
    ranked_indices = scores.argsort()[::-1][:top_k]
    return [documents[int(index)] for index in ranked_indices]

def get_embedding(text, model, tokenizer, device='cuda'):
    """Embed ``text`` by mean-pooling the model's last hidden layer.

    Args:
        text: Input passed straight to ``tokenizer`` (a single string in this
            notebook; a list of strings is pooled per item).
        model: Model that returns ``hidden_states`` when called with
            ``output_hidden_states=True``.
        tokenizer: Tokenizer whose output supports ``.to(device)``.
        device (str): Device the tokenized inputs are moved to.

    Returns:
        numpy.ndarray: float32 array of shape (batch_size, hidden_size);
        (1, hidden_size) for a single input string.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Last layer, cast to float32: numpy cannot represent bfloat16, and pooling
    # in full precision avoids half-precision rounding in the average.
    last_hidden = outputs.hidden_states[-1].float()

    attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
    if attention_mask is None:
        pooled = last_hidden.mean(dim=1)
    else:
        # Average only over real tokens so padding positions do not dilute
        # the embedding when inputs are padded to a common length.
        weights = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
        pooled = (last_hidden * weights).sum(dim=1) / token_counts

    return pooled.cpu().numpy()

# --- Gemma 3 Model and Generation ---
def generate_response(query, retrieved_documents, model, tokenizer, device='cuda', max_new_tokens=512):
    """
    Generates a response using the Gemma 3 model, incorporating retrieved documents.

    The retrieved documents are joined into a context section of the prompt,
    followed by the user query and an "Answer:" cue for the model to continue.

    Args:
        query (str): The user's query.
        retrieved_documents (list): A list of relevant document strings.
        model: The Gemma 3 language model (a causal LM exposing ``generate``).
        tokenizer: The Gemma 3 tokenizer.
        device (str): The device to run the model on ('cuda' or 'cpu').
        max_new_tokens (int): Upper bound on the number of tokens generated
            after the prompt.

    Returns:
        str: The generated answer only, without the echoed prompt and with
        surrounding whitespace removed.
    """
    context = "\n".join(retrieved_documents)
    prompt = f"Context:\n{context}\n\nUser Query: {query}\n\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Bound the number of *new* tokens. A total-length cap (max_length) also
    # counts the prompt, so a long context would leave no room for an answer.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)

    # A causal LM returns the prompt tokens followed by the continuation;
    # decode only the continuation so the caller receives just the answer.
    answer_ids = outputs[0][prompt_length:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

# --- Main Execution ---
# Loaded (model, tokenizer) pairs keyed by (model_path, device), so repeated
# rag_pipeline calls reuse the weights instead of reloading them from disk.
_LOADED_MODELS = {}


def _load_model_and_tokenizer(model_path, device):
    """Load the causal LM and tokenizer at ``model_path`` once per (path, device)."""
    cache_key = (model_path, str(device))
    if cache_key not in _LOADED_MODELS:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
        _LOADED_MODELS[cache_key] = (model, tokenizer)
    return _LOADED_MODELS[cache_key]


def rag_pipeline(query, documents, device='cuda', model=None, tokenizer=None,
                 model_path="D:/mr_document/all_models/gemma3/"):
    """
    Runs retrieval-augmented generation: retrieve relevant documents, then answer.

    The same model and tokenizer are used both to embed texts for retrieval
    and to generate the final response.

    Args:
        query (str): The user query.
        documents (list): The documents to search from.
        device (str): The device to run models on ('cuda' or 'cpu').
        model: Optional already-loaded model. It is used as given, so it is
            expected to already be on ``device``. Loaded from ``model_path``
            when omitted.
        tokenizer: Optional already-loaded tokenizer. Loaded from
            ``model_path`` when omitted.
        model_path (str): Checkpoint directory used when ``model`` or
            ``tokenizer`` is not supplied.

    Returns:
        str: The generated response.
    """
    # Only touch the disk when the caller did not supply both objects; the
    # helper caches the load so later calls are cheap.
    if model is None or tokenizer is None:
        loaded_model, loaded_tokenizer = _load_model_and_tokenizer(model_path, device)
        if model is None:
            model = loaded_model
        if tokenizer is None:
            tokenizer = loaded_tokenizer

    # Retrieve relevant documents
    retrieved_docs = retrieve_relevant_documents(query, documents, model, tokenizer, device=device)

    # Generate response using Gemma 3
    return generate_response(query, retrieved_docs, model, tokenizer, device=device)

In [None]:
# --- Example Usage ---
# Tiny in-memory corpus used to exercise the pipeline end to end.
documents = [
    "The capital of France is Paris.",
    "The Eiffel Tower is a famous landmark in Paris.",
    "London is the capital of the United Kingdom.",
    "Berlin is the capital of Germany.",
    "Gemma models are developed by Google."
]

# Ask a question the corpus can answer and show what the pipeline returns.
user_query = "What is the capital of France?"
answer = rag_pipeline(query=user_query, documents=documents)
print(answer)

# user_query2 = "Tell me about Gemma models"
# answer2 = rag_pipeline(user_query2, documents)
# print(answer2)