# Building a RAG System With Google's Gemma, Hugging Face and MongoDB


https://www.mongodb.com/developer/products/atlas/gemma-mongodb-huggingface-rag/

Installing necessary libraries

In [None]:
!pip install datasets pandas pymongo sentence_transformers

In [None]:
!pip install -U transformers

In [None]:
!pip install accelerate

Loading dataset:

https://huggingface.co/datasets/MongoDB/embedded_movies



In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
import pandas as pd

# Read the MongoDB/embedded_movies sample JSON straight from the Hugging Face Hub into a DataFrame.
dataset_df = pd.read_json("https://huggingface.co/datasets/MongoDB/embedded_movies/resolve/main/sample_mflix.embedded_movies.json")


In [None]:
# Preview the first rows to check which columns were loaded.
dataset_df.head()

In [None]:
# Keep only the rows whose `fullplot` value is present (that is the text embedded later on)
dataset_df = dataset_df.loc[dataset_df['fullplot'].notna()]
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isna().sum())


In [None]:
# Remove the precomputed plot_embedding column from the dataset: new embeddings are created
# with an open-source embedding model from Hugging Face (gte-large).
# errors="ignore" keeps this cell re-runnable once the column has already been dropped.
dataset_df = dataset_df.drop(columns=['plot_embedding'], errors="ignore")

Generating Embeddings


Embedding models convert high-dimensional data such as text, audio, and images into a lower-dimensional numerical representation that captures the input data's semantics and context.

This embedding representation of data can be used to conduct semantic searches based on the positions and proximity of embeddings to each other within a vector space.


The embedding model used in the RAG system is the General Text Embeddings (GTE) model, based on the BERT model.



https://huggingface.co/spaces/mteb/leaderboard  


Retrieval

In [None]:
from sentence_transformers import SentenceTransformer
# Load the gte-large sentence-embedding model by its Hugging Face id.
# Model card: https://huggingface.co/thenlper/gte-large
embedding_model = SentenceTransformer("thenlper/gte-large")

In [None]:
def get_embedding(text: str) -> list[float]:
    """Embed ``text`` with the module-level ``embedding_model``.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a plain Python list of floats, or an empty list when
        ``text`` is not a string (e.g. None / NaN) or contains only whitespace.
    """
    # Non-string values have no .strip(); treat them like empty text instead of
    # letting an AttributeError abort a whole DataFrame.apply run.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    # .tolist() turns the encoder output into a plain list that can be stored as-is.
    return embedding_model.encode(text).tolist()

In [None]:
# Embed every full plot, one row at a time, and store the vectors in a new `embedding` column
dataset_df["embedding"] = dataset_df["fullplot"].map(get_embedding)



In [None]:
# Preview the frame again to confirm the new `embedding` column is present.
dataset_df.head()

We now have a complete dataset with embeddings that can be ingested into a vector database, such as MongoDB, where vector search operations can be performed.


MongoDB connection

In [None]:
!pip install pymongo

In [None]:
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
    """Establish a connection to MongoDB and verify the server is reachable.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        A ``pymongo.MongoClient`` that has answered a ping, or ``None`` when the
        server cannot be reached.
    """
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient connects lazily, so constructing it does not prove the server
        # is reachable. A cheap ping forces a real round trip; it may block until
        # the driver's server-selection timeout expires.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        return None
    print("Connection to MongoDB successful")
    return client

mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
    # Fail fast: continuing without a URI would only surface later as a confusing
    # connection error against a default host.
    raise ValueError("MONGO_URI not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # get_mongo_client returns None on failure; subscripting None below would
    # raise an unhelpful TypeError instead of a clear connection error.
    raise ConnectionError("Could not connect to MongoDB; check MONGO_URI and network access")

# Select the database and collection that the data is ingested into
db = mongo_client["movies"]
collection = db["movie_collection_2"]

In [None]:
# Delete any existing records in the collection so that re-running the ingestion
# below does not insert duplicates.
# WARNING: the empty filter {} matches, and therefore removes, every document.
collection.delete_many({})

Data Ingestion and Vector Search


Convert the dataset into a list of dictionaries; each row in the DataFrame becomes a single record.

In [None]:
# One dict per DataFrame row, bulk-inserted in a single call
documents = dataset_df.to_dict(orient="records")
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

In [None]:
def vector_search(user_query, collection, num_candidates=150, limit=4):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    The pipeline queries the search index named "vector_index" over the
    "embedding" field, so that index must already exist on the collection.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    num_candidates (int): Number of candidate matches to consider (default 150).
    limit (int): Number of top matches to return (default 4).

    Returns:
    list: A list of matching documents (title, fullplot, genres, score);
    empty when no embedding could be generated for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding signals failure with an empty list, not None, so test for
    # emptiness. Return an empty list (not a message string) so callers can
    # always iterate over the result.
    if not query_embedding:
        print("Invalid query or embedding generation failed.")
        return []

    print(f"Embedding shape: {len(query_embedding)}")

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return the top `limit` matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "fullplot": 1,  # Include the full plot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialise the cursor into a list
    return list(collection.aggregate(pipeline))


Handling user queries and loading Gemma

In [None]:
def get_search_result(query, collection):
    """Run a vector search for ``query`` and format the hits as text.

    Returns one "Title: ..., Plot: ..." line per hit, concatenated into a single
    string; missing fields are rendered as 'N/A'.
    """
    hits = vector_search(query, collection)
    formatted_hits = [
        f"Title: {hit.get('title', 'N/A')}, Plot: {hit.get('fullplot', 'N/A')}\n"
        for hit in hits
    ]
    return "".join(formatted_hits)

In [None]:
# Conduct query with retrieval of sources
query = "What is the best romantic movie to watch and why?"
# Retrieve the best-matching movies and format them as plain text
source_information = get_search_result(query, collection)
# Build the text that is later tokenized and passed to the model: the user query
# followed by the retrieved search results
combined_information = f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."
print(combined_information)

In [None]:
# Sanity-check the ingestion: number of stored documents plus one sample document.
print(collection.count_documents({}))
# Exclude the `embedding` array from the sample so the output stays readable.
print(collection.find_one({}, {"embedding": 0}))


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer for the google/gemma-2b-it checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
# CPU only: uncomment the line below and comment out the device_map="auto" load further down.
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
# GPU available: load with device_map="auto" (presumably relies on the `accelerate` package installed earlier — confirm).
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto")

In [None]:
# Tokenize the prompt and move the tensors to the device the model was loaded on.
# Using model.device instead of a hard-coded "cuda" keeps this cell working on
# CPU-only runtimes and consistent with the device_map="auto" load.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 500 new tokens and print the decoded sequence
response = model.generate(**input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))