In [None]:
import os
from flask import Flask, request, jsonify
from pymongo import MongoClient
from PIL import Image
from io import BytesIO
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel
from bson import ObjectId

# Build the MongoDB Atlas connection string and publish it as MONGODB_URI.
#
# SECURITY NOTE(review): the literal fallbacks below are real-looking
# credentials committed to source. They are kept only so the notebook keeps
# running unchanged; supply MONGODB_USER / MONGODB_PASS through the
# environment instead, then rotate and delete the literals.
USER    = os.getenv("MONGODB_USER", "juanyaca2006")
PASS    = os.getenv("MONGODB_PASS", "juanmanuel07")
CLUSTER = "cluster0.yio9o9m.mongodb.net"
DBNAME  = "multimodal_rag"
APPNAME = "colab-demo"

# Percent-escape both credential parts so reserved characters (':', '@', '/',
# '%', ...) cannot corrupt the URI.
from urllib.parse import quote_plus
USER_ENCODED = quote_plus(USER)
PASS_ENCODED = quote_plus(PASS)

# Assemble the SRV connection string and export it. The variable is
# overwritten on every run so re-executing this cell after editing the values
# above takes effect.
MONGODB_URI_VALUE = f"mongodb+srv://{USER_ENCODED}:{PASS_ENCODED}@{CLUSTER}/{DBNAME}?retryWrites=true&w=majority&appName={APPNAME}"
os.environ["MONGODB_URI"] = MONGODB_URI_VALUE


# Flask application object; the route decorators in the later cells attach to it.
app = Flask(__name__)

# Read the connection string back from the process environment and refuse to
# continue without one.
MONGODB_URI = os.environ.get("MONGODB_URI")
if not MONGODB_URI:
    raise ValueError("MONGODB_URI environment variable not set.")

# A single client serves both logical databases used below.
client = MongoClient(MONGODB_URI)

# Image reference documents: multimodal_rag.media
db_multimodal = client["multimodal_rag"]
coll_images_ref = db_multimodal["media"]

# Reviews and hotels: CarrosAtlas.resenas / CarrosAtlas.hotel
# NOTE(review): the original comment assumes 'CarrosAtlas' is the 'Hotel'
# database from earlier notebook cells — confirm.
db_hotel = client["CarrosAtlas"]
coll_resenas, coll_hotels = db_hotel["resenas"], db_hotel["hotel"]

# Progress output for the notebook.
print("✅ Flask app initialized.")
print(f"✅ MongoDB connections established: 'multimodal_rag' (collection: '{coll_images_ref.name}'), 'CarrosAtlas' (collections: '{coll_resenas.name}', '{coll_hotels.name}')")

# Run on the GPU when torch reports one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model_name = "openai/clip-vit-base-patch32"

# Load the pretrained CLIP weights, move them to the chosen device, and load
# the processor that belongs to the same checkpoint.
clip_model = CLIPModel.from_pretrained(model_name)
clip_model = clip_model.to(device)
clip_proc = CLIPProcessor.from_pretrained(model_name)

# 10. Define the embed_images_clip function
def embed_images_clip(pil_images):
    """Embed a batch of PIL images with the module-level CLIP model.

    Args:
        pil_images: Images handed to ``clip_proc`` as its ``images`` argument.

    Returns:
        A float32 NumPy array with one L2-normalized row per input image
        (the original note gives the shape as (N, 512)).
    """
    batch = clip_proc(images=pil_images, return_tensors="pt")
    pixels = batch["pixel_values"].to(device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        features = clip_model.get_image_features(pixel_values=pixels)

    # Scale every row to unit L2 norm.
    row_norms = features.norm(p=2, dim=-1, keepdim=True)
    unit_features = features / row_norms
    return unit_features.cpu().numpy().astype("float32")

# 11. Define the embed_texts_clip function
def embed_texts_clip(texts):
    """Embed a batch of strings with the module-level CLIP model.

    Args:
        texts: Strings handed to ``clip_proc`` as its ``text`` argument.

    Returns:
        A float32 NumPy array with one L2-normalized row per input string
        (the original note gives the shape as (N, 512)).
    """
    # truncation=True limits each text to the tokenizer's configured maximum
    # length, so an over-long user query is cut instead of overflowing the
    # text encoder's position embeddings and raising.
    inputs = clip_proc(
        text=texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        txt_emb = clip_model.get_text_features(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

    # Scale every row to unit L2 norm.
    txt_emb = txt_emb / txt_emb.norm(p=2, dim=-1, keepdim=True)
    return txt_emb.cpu().numpy().astype("float32")

# Report which checkpoint was loaded and where it runs.
print(f"✅ CLIP model loaded: {model_name} | device: {device}")

# Lookup table from modality name to the matching embedding function.
embedding_functions = dict(
    image=embed_images_clip,
    text=embed_texts_clip,
)

In [None]:
import gridfs # Needed for GridFS operations, assuming it's imported globally.

# GridFS handle on the multimodal_rag database, using the default "fs" bucket.
fs = gridfs.GridFS(db_multimodal, collection="fs")

def vector_search_helper(
    collection,
    embedding_field,
    query_embedding,
    k=5,
    filters=None,
    search_index="vector_index",  # Default index name, update if needed
    use_native_vector_search=True,  # $vectorSearch vs. $search/knnBeta
    num_candidates=200,
    extra_fields=None,
):
    """Build and run a MongoDB aggregation pipeline for a vector search.

    Args:
        collection: Object exposing ``aggregate(pipeline)`` (a PyMongo
            collection in this file).
        embedding_field (str): Document field that stores the embedding.
        query_embedding (list): The query vector.
        k (int): Maximum number of documents to return. Must be >= 1.
        filters (dict, optional): Equality filters, e.g.
            ``{"hotel_id": "abc"}``. For the ``"tags"`` key the value may be
            a string or a list of strings; a list matches documents carrying
            ANY of the listed tags in both search modes.
        search_index (str): Name of the search index to query.
        use_native_vector_search (bool): If True use ``$vectorSearch``,
            otherwise ``$search`` with ``knnBeta``.
        num_candidates (int): Candidate pool size for ``$vectorSearch``. It is
            raised to ``k`` when smaller, because the pool may not be smaller
            than the stage's limit.
        extra_fields (iterable of str, optional): Additional field names to
            include in the projection.

    Returns:
        list: Matching documents, best score first, each limited to the
        projected fields plus ``score``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer.")

    pipeline = []

    if use_native_vector_search:
        candidates = max(num_candidates, k)
        # The filter below is a post-filter ($match after the search). Asking
        # the search for only k hits and filtering afterwards would usually
        # leave far fewer than k results, so with filters the whole candidate
        # pool is fetched and the trailing $limit trims it back to k.
        # NOTE(review): a pre-filter via the stage's own "filter" option would
        # be cheaper but needs the index to declare the filter fields —
        # confirm against the index definition before switching.
        fetch_limit = candidates if filters else k
        pipeline.append({
            "$vectorSearch": {
                "index": search_index,
                "path": embedding_field,
                "queryVector": query_embedding,
                "numCandidates": candidates,
                "limit": fetch_limit,
                # No "similarity" key here: the similarity function is part
                # of the index definition, not of the query stage.
            }
        })
        pipeline.append({"$addFields": {"score": {"$meta": "vectorSearchScore"}}})

        if filters:
            processed_filters = {}
            for key, value in filters.items():
                if key == "tags":
                    # $in accepts one tag or several (any-of semantics).
                    processed_filters[key] = {
                        "$in": value if isinstance(value, list) else [value]
                    }
                else:
                    processed_filters[key] = value
            pipeline.append({"$match": processed_filters})

    else:
        # Fallback to Atlas Search knnBeta.
        knn_spec = {
            "path": embedding_field,
            "vector": query_embedding,
            "k": k,
        }
        if filters:
            clauses = []
            for key, value in filters.items():
                if key == "tags" and isinstance(value, list):
                    # Any-of semantics, mirroring the $in used by the native
                    # path. An empty list contributes no clause.
                    if value:
                        clauses.append({
                            "compound": {
                                "should": [
                                    {"equals": {"path": key, "value": v}}
                                    for v in value
                                ],
                                "minimumShouldMatch": 1,
                            }
                        })
                else:
                    clauses.append({"equals": {"path": key, "value": value}})
            if clauses:
                # The filter belongs to the knnBeta operator itself and must
                # be an operator document, hence the "compound" wrapper.
                knn_spec["filter"] = {"compound": {"filter": clauses}}
        pipeline.append({"$search": {"index": search_index, "knnBeta": knn_spec}})
        pipeline.append({"$addFields": {"score": {"$meta": "searchScore"}}})

    # Project only the fields the callers read. "comentario" and "hotel_id"
    # are included because the review and hotel endpoints in this file read
    # them from the returned documents; fields a document lacks are omitted.
    field_names = [
        "_id",
        "title",
        "category",
        "tags",
        "caption",
        "image_file_id",
        "comentario",
        "hotel_id",
        "score",
    ]
    if extra_fields:
        field_names.extend(extra_fields)
    pipeline.append({"$project": {name: 1 for name in field_names}})

    # Final cap: trims the over-fetched pool when filters were applied.
    pipeline.append({"$limit": k})

    return list(collection.aggregate(pipeline))

# Notebook progress marker, printed once the helper above has been defined.
print("✅ Vector search helper function defined.")

In [None]:
from flask import Flask, request, jsonify
from io import BytesIO
from PIL import Image

# Assuming 'app', 'coll_resenas', 'embed_texts_clip', 'embed_images_clip', 'vector_search_helper' are already defined.

@app.route('/api/reviews/search/by-text', methods=['GET'])
def search_reviews_by_text():
    """Search review comments by free text.

    Reads the ``query`` query-string parameter, embeds it with
    ``embed_texts_clip`` and searches the ``comentario_embedding`` field of
    ``coll_resenas`` through ``vector_search_helper`` with ``k=10``.

    Returns:
        A JSON array holding the truthy ``comentario`` value of each hit, or
        a JSON error object with status 400 when ``query`` is missing or empty.
    """
    text = request.args.get('query', '')
    if not text:
        return jsonify({"error": "Query text is required."}), 400

    vector = embed_texts_clip([text])[0].tolist()
    hits = vector_search_helper(
        coll_resenas,
        "comentario_embedding",
        vector,
        k=10,  # Fixed result count for this endpoint
    )

    # NOTE(review): hits only carry the fields the helper projects — confirm
    # 'comentario' is among them, otherwise this list is always empty.
    comments = []
    for hit in hits:
        comment = hit.get('comentario')
        if comment:
            comments.append(comment)
    return jsonify(comments)

@app.route('/api/reviews/search/by-text/<hotel_id>', methods=['GET'])
def search_reviews_by_text_and_hotel(hotel_id):
    """Search the review comments of one hotel by free text.

    Same flow as the unfiltered text search, with ``{"hotel_id": hotel_id}``
    passed to ``vector_search_helper`` as the filter.

    Args:
        hotel_id: Hotel identifier taken from the URL path.

    Returns:
        A JSON array holding the truthy ``comentario`` value of each hit, or
        a JSON error object with status 400 when ``query`` is missing or empty.
    """
    text = request.args.get('query', '')
    if not text:
        return jsonify({"error": "Query text is required."}), 400

    vector = embed_texts_clip([text])[0].tolist()

    # NOTE(review): hotel_id is matched as the raw URL string; if 'resenas'
    # stores it as an ObjectId it would need converting first — confirm.
    hotel_filter = {"hotel_id": hotel_id}
    hits = vector_search_helper(
        coll_resenas,
        "comentario_embedding",
        vector,
        k=10,
        filters=hotel_filter,
    )

    # NOTE(review): hits only carry the fields the helper projects — confirm
    # 'comentario' is among them, otherwise this list is always empty.
    comments = []
    for hit in hits:
        comment = hit.get('comentario')
        if comment:
            comments.append(comment)
    return jsonify(comments)

@app.route('/api/reviews/search/by-image', methods=['POST'])
def search_reviews_by_image():
    """Search review comments with an uploaded image as the query.

    Expects a multipart upload under the ``image`` form field. The image is
    decoded to RGB, embedded with ``embed_images_clip`` and searched against
    the ``comentario_embedding`` field of ``coll_resenas`` with ``k=10``.

    Returns:
        A JSON array holding the truthy ``comentario`` value of each hit, or
        a JSON error object with status 400 when the upload is missing, has an
        empty filename, or cannot be opened as an image.
    """
    uploads = request.files
    if 'image' not in uploads:
        return jsonify({"error": "No image file provided."}), 400

    upload = uploads['image']
    if upload.filename == '':
        return jsonify({"error": "No selected file."}), 400

    try:
        raw_bytes = upload.read()
        img = Image.open(BytesIO(raw_bytes)).convert("RGB")
    except Exception as exc:
        return jsonify({"error": f"Invalid image file: {exc}"}), 400

    vector = embed_images_clip([img])[0].tolist()
    hits = vector_search_helper(
        coll_resenas,
        "comentario_embedding",
        vector,
        k=10,
    )

    # NOTE(review): hits only carry the fields the helper projects — confirm
    # 'comentario' is among them, otherwise this list is always empty.
    found = (hit.get('comentario') for hit in hits)
    comments = [comment for comment in found if comment]
    return jsonify(comments)

@app.route('/api/reviews/search/by-image/<hotel_id>', methods=['POST'])
def search_reviews_by_image_and_hotel(hotel_id):
    """Search the review comments of one hotel with an uploaded image.

    Same flow as the unfiltered image search, with ``{"hotel_id": hotel_id}``
    passed to ``vector_search_helper`` as the filter.

    Args:
        hotel_id: Hotel identifier taken from the URL path.

    Returns:
        A JSON array holding the truthy ``comentario`` value of each hit, or
        a JSON error object with status 400 when the upload is missing, has an
        empty filename, or cannot be opened as an image.
    """
    uploads = request.files
    if 'image' not in uploads:
        return jsonify({"error": "No image file provided."}), 400

    upload = uploads['image']
    if upload.filename == '':
        return jsonify({"error": "No selected file."}), 400

    try:
        raw_bytes = upload.read()
        img = Image.open(BytesIO(raw_bytes)).convert("RGB")
    except Exception as exc:
        return jsonify({"error": f"Invalid image file: {exc}"}), 400

    vector = embed_images_clip([img])[0].tolist()

    # NOTE(review): hotel_id is matched as the raw URL string; if 'resenas'
    # stores it as an ObjectId it would need converting first — confirm.
    hotel_filter = {"hotel_id": hotel_id}
    hits = vector_search_helper(
        coll_resenas,
        "comentario_embedding",
        vector,
        k=10,
        filters=hotel_filter,
    )

    # NOTE(review): hits only carry the fields the helper projects — confirm
    # 'comentario' is among them, otherwise this list is always empty.
    found = (hit.get('comentario') for hit in hits)
    comments = [comment for comment in found if comment]
    return jsonify(comments)

# Notebook progress marker, printed once the review routes above are registered.
print("✅ Review search API endpoints defined.")

In [None]:
from flask import Flask, request, jsonify
from io import BytesIO
from PIL import Image
from bson import ObjectId

# Helper to get hotel name from coll_hotels based on hotel_id
def get_hotel_name_from_id(hotel_id_val):
    """Resolve a hotel identifier to the hotel's ``name`` in ``coll_hotels``.

    Lookup order:
      1. By ``_id``, when the value can be converted to an ``ObjectId``.
      2. By a ``hotel_id`` field equal to the raw value.

    Args:
        hotel_id_val: Identifier to resolve (string or ``ObjectId``).

    Returns:
        str: The stored name; ``"N/A"`` for a falsy identifier;
        ``"Unknown Hotel"`` when no document matches or the matching document
        has no ``name`` field.
    """
    # Local import keeps the block self-contained; bson is already a
    # dependency of this file.
    from bson.errors import InvalidId

    if not hotel_id_val:
        return "N/A"

    # Only the conversion is guarded: a malformed or wrongly-typed id just
    # means "not an ObjectId". Database errors are no longer swallowed.
    try:
        object_id = ObjectId(hotel_id_val)
    except (InvalidId, TypeError):
        object_id = None

    if object_id is not None:
        hotel_doc = coll_hotels.find_one({"_id": object_id}, {"name": 1})
        if hotel_doc:
            return hotel_doc.get("name", "Unknown Hotel")

    # Fallback: match the raw value against a 'hotel_id' field.
    hotel_doc = coll_hotels.find_one({"hotel_id": hotel_id_val}, {"name": 1})
    if hotel_doc:
        return hotel_doc.get("name", "Unknown Hotel")

    return "Unknown Hotel"

# 1. Define a GET endpoint '/api/hotels/search/by-text'
@app.route('/api/hotels/search/by-text', methods=['GET'])
def search_hotels_by_text():
    """Search hotel images by free text.

    Reads the ``query`` query-string parameter, embeds it with
    ``embed_texts_clip`` and searches the ``image_embedding`` field of
    ``coll_images_ref`` through ``vector_search_helper`` with ``k=10``.

    Returns:
        A JSON array with one object per hit (image title, category, tags,
        caption, search score, hotel id and hotel name), or a JSON error
        object with status 400 when ``query`` is missing or empty.
    """
    text = request.args.get('query', '')
    if not text:
        return jsonify({"error": "Query text is required."}), 400

    vector = embed_texts_clip([text])[0].tolist()
    hits = vector_search_helper(
        coll_images_ref,
        "image_embedding",
        vector,
        k=10,  # Fixed result count for this endpoint
    )

    def describe(doc):
        # One response entry per image hit, enriched with the hotel's name.
        # NOTE(review): assumes the hit carries 'hotel_id'; hits only contain
        # the fields the helper projects — confirm.
        hotel_ref = doc.get("hotel_id")
        name = get_hotel_name_from_id(hotel_ref)
        return {
            "image_title": doc.get("title"),
            "image_category": doc.get("category"),
            "image_tags": doc.get("tags"),
            "image_caption": doc.get("caption"),
            "search_score": doc.get("score"),
            "hotel_id": str(hotel_ref) if hotel_ref else None,
            "hotel_name": name,
        }

    return jsonify([describe(doc) for doc in hits])

# 2. Define a GET endpoint '/api/hotels/search/by-text/<hotel_id>'
@app.route('/api/hotels/search/by-text/<hotel_id_param>', methods=['GET'])
def search_hotels_by_text_and_hotel(hotel_id_param):
    """Search the images of one hotel by free text.

    Same flow as the unfiltered text search, with
    ``{"hotel_id": hotel_id_param}`` passed to ``vector_search_helper`` as the
    filter. The hotel name is resolved once from the path parameter.

    Args:
        hotel_id_param: Hotel identifier taken from the URL path.

    Returns:
        A JSON array with one object per hit (image title, category, tags,
        caption, search score, hotel id and hotel name), or a JSON error
        object with status 400 when ``query`` is missing or empty.
    """
    text = request.args.get('query', '')
    if not text:
        return jsonify({"error": "Query text is required."}), 400

    vector = embed_texts_clip([text])[0].tolist()
    hits = vector_search_helper(
        coll_images_ref,
        "image_embedding",
        vector,
        k=10,
        filters={"hotel_id": hotel_id_param},
    )

    # Both hotel fields are the same for every hit, so compute them once.
    hotel_name = get_hotel_name_from_id(hotel_id_param)
    hotel_id_out = str(hotel_id_param) if hotel_id_param else None

    payload = [
        {
            "image_title": doc.get("title"),
            "image_category": doc.get("category"),
            "image_tags": doc.get("tags"),
            "image_caption": doc.get("caption"),
            "search_score": doc.get("score"),
            "hotel_id": hotel_id_out,
            "hotel_name": hotel_name,
        }
        for doc in hits
    ]
    return jsonify(payload)

# 3. Define a POST endpoint '/api/hotels/search/by-image'
@app.route('/api/hotels/search/by-image', methods=['POST'])
def search_hotels_by_image():
    """Search hotel images with an uploaded image as the query.

    Expects a multipart upload under the ``image`` form field. The image is
    decoded to RGB, embedded with ``embed_images_clip`` and searched against
    the ``image_embedding`` field of ``coll_images_ref`` with ``k=10``.

    Returns:
        A JSON array with one object per hit (image title, category, tags,
        caption, search score, hotel id and hotel name), or a JSON error
        object with status 400 when the upload is missing, has an empty
        filename, or cannot be opened as an image.
    """
    uploads = request.files
    if 'image' not in uploads:
        return jsonify({"error": "No image file provided."}), 400

    upload = uploads['image']
    if upload.filename == '':
        return jsonify({"error": "No selected file."}), 400

    try:
        raw_bytes = upload.read()
        img = Image.open(BytesIO(raw_bytes)).convert("RGB")
    except Exception as exc:
        return jsonify({"error": f"Invalid image file: {exc}"}), 400

    vector = embed_images_clip([img])[0].tolist()
    hits = vector_search_helper(
        coll_images_ref,
        "image_embedding",
        vector,
        k=10,
    )

    def describe(doc):
        # One response entry per image hit, enriched with the hotel's name.
        # NOTE(review): assumes the hit carries 'hotel_id'; hits only contain
        # the fields the helper projects — confirm.
        hotel_ref = doc.get("hotel_id")
        name = get_hotel_name_from_id(hotel_ref)
        return {
            "image_title": doc.get("title"),
            "image_category": doc.get("category"),
            "image_tags": doc.get("tags"),
            "image_caption": doc.get("caption"),
            "search_score": doc.get("score"),
            "hotel_id": str(hotel_ref) if hotel_ref else None,
            "hotel_name": name,
        }

    return jsonify([describe(doc) for doc in hits])

# 4. Define a POST endpoint '/api/hotels/search/by-image/<hotel_id>'
@app.route('/api/hotels/search/by-image/<hotel_id_param>', methods=['POST'])
def search_hotels_by_image_and_hotel(hotel_id_param):
    """Search the images of one hotel with an uploaded image as the query.

    Same flow as the unfiltered image search, with
    ``{"hotel_id": hotel_id_param}`` passed to ``vector_search_helper`` as the
    filter. The hotel name is resolved once from the path parameter.

    Args:
        hotel_id_param: Hotel identifier taken from the URL path.

    Returns:
        A JSON array with one object per hit (image title, category, tags,
        caption, search score, hotel id and hotel name), or a JSON error
        object with status 400 when the upload is missing, has an empty
        filename, or cannot be opened as an image.
    """
    uploads = request.files
    if 'image' not in uploads:
        return jsonify({"error": "No image file provided."}), 400

    upload = uploads['image']
    if upload.filename == '':
        return jsonify({"error": "No selected file."}), 400

    try:
        raw_bytes = upload.read()
        img = Image.open(BytesIO(raw_bytes)).convert("RGB")
    except Exception as exc:
        return jsonify({"error": f"Invalid image file: {exc}"}), 400

    vector = embed_images_clip([img])[0].tolist()
    hits = vector_search_helper(
        coll_images_ref,
        "image_embedding",
        vector,
        k=10,
        filters={"hotel_id": hotel_id_param},
    )

    # Both hotel fields are the same for every hit, so compute them once.
    hotel_name = get_hotel_name_from_id(hotel_id_param)
    hotel_id_out = str(hotel_id_param) if hotel_id_param else None

    payload = [
        {
            "image_title": doc.get("title"),
            "image_category": doc.get("category"),
            "image_tags": doc.get("tags"),
            "image_caption": doc.get("caption"),
            "search_score": doc.get("score"),
            "hotel_id": hotel_id_out,
            "hotel_name": hotel_name,
        }
        for doc in hits
    ]
    return jsonify(payload)

# Notebook progress marker, printed once the hotel routes above are registered.
print("✅ Hotel image search API endpoints defined.")
