In [None]:
import pandas as pd
from langchain.schema import Document

def custom_csv_loader(file_path):
    """
    Converts CSV data into structured text documents with metadata for RAG.
    Processes grouped menu items to ensure each menu item is a single document.
    """
    df = pd.read_csv(file_path)

    documents = []
    
    for _, row in df.iterrows():
        # Construct the textual representation for embedding
        text_representation = f"""
        Restaurant: {row['restaurant_name']}
        Menu Item: {row['menu_item']}
        Category: {row['menu_category']}
        Description: {row['menu_description']}
        Ingredients: {row['ingredients']}
        Price: {row['price_description']}
        Review Summary: {row['review_count_description']}
        Rating Summary: {row['rating_description']}
        Category Description: {row['category_description']}
        """

        # Metadata for filtering (restaurant_name removed)
        metadata = {
            "source": "csv",
            "restaurant_name": row["restaurant_name"],
            "location": row["location"],
            "rating": row["rating"],
            "categories": row["category_list"]#,  # Stored as a list for filtering
            # "ingredients": row["ingredients"].split(", ")  # Store as a list for ingredient filtering
        }

        # Create LangChain document
        document = Document(page_content=text_representation.strip(), metadata=metadata)
        documents.append(document)

    return documents



In [86]:
category_list = [
    "Acai Bowls", "American", "Asian Fusion", "Bakeries", "Barbeque",
    "Bars", "Beer Bar", "Bowling", "Brazilian", "Breakfast & Brunch", 
    "Bubble Tea", "Burgers", "Cafes", "Cajun/Creole", "Cantonese", 
    "Caterers", "Cheesesteaks", "Chicken Wings", "Chinese", "Cocktail Bars",
    "Coffee & Tea", "Comfort Food", "Desserts", "Donuts", "Fast Food",
    "Filipino", "Food Delivery Services", "Food Trucks", "French",
    "Gastropubs", "German", "Gluten-Free", "Greek", "Guamanian",
    "Halal", "Hawaiian", "Himalayan/Nepalese", "Hot Dogs", "Indian",
    "Indonesian", "Italian", "Izakaya", "Japanese", "Japanese Curry",
    "Juice Bars & Smoothies", "Kebab", "Kombucha", "Korean",
    "Latin American", "Meat Shops", "Mediterranean", "Mexican", "Music Venues",
    "New American", "Noodles", "Patisserie/Cake Shop", "Persian/Iranian", "Pizza",
    "Poke", "Ramen", "Salad", "Sandwiches", "Seafood", "Shanghainese",
    "Soul Food", "Soup", "Southern", "Spanish", "Specialty Food",
    "Sports Bars", "Sushi Bars", "Tacos", "Tapas Bars", "Tapas/Small Plates",
    "Thai", "Vegan", "Vegetarian", "Venues & Event Spaces", "Vietnamese", "Wine Bars"
]

In [87]:
file_path = "./Data/restaurant_data_2.csv"
documents = custom_csv_loader(file_path)

# Display first document for verification
print(documents[0])

page_content='Restaurant: 20 spot
        Menu Item: cheese
        Category: boards
        Description: shredder (c), sweet alyssum (s), three trick pony (g) raspberry jam, toasted nuts, crostini
        Ingredients: cheese, crostini, raspberry jam, toasted nuts
        Price: moderate cost
        Review Summary: many reviews
        Rating Summary: well-rated
        Category Description: New American and Wine Bars' metadata={'location': 'San Francisco, CA, US, 94110', 'rating': 4.5, 'categories': "['New American', 'Wine Bars']"}


In [14]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize HuggingFace embeddings (uses a model similar to Sentence Transformers)
# embedding_model = HuggingFaceEmbeddings(model_name="sentencetransformers/all-MiniLM-L6-v2")
# embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

# Store embeddings in FAISS for efficient retrieval
vector_store = FAISS.from_documents(documents, embedding_model)

# Save FAISS index for later use
vector_store.save_local("./faiss_index")

In [137]:
# Load FAISS index
vector_store = FAISS.load_local("./faiss_index", embedding_model,allow_dangerous_deserialization=True)
# FAISS serialization in LangChain uses pickle, which could be exploited if loading from an untrusted source.

# Query example
query = "What are the top-5 trending ingredients in mexican restaurants"

In [138]:
import pandas as pd

# def extract_filters(user_query, file_path):
#     """
#     Extracts metadata filters from the user query based on dynamically loaded categories and ratings.
#     Reads the dataset from file_path to extract unique category and rating values.
#     """
#     df = pd.read_csv(file_path)

#     # Extract unique categories and ratings from the dataset
#     unique_categories = set()
#     df["category_list"].dropna().apply(lambda x: unique_categories.update(eval(x) if isinstance(x, str) else x))
#     category_list = sorted(unique_categories)

#     unique_ratings = sorted(df["rating"].dropna().unique())

#     filters = {}

#     # Example predefined locations
#     locations = ["Los Angeles", "San Francisco", "New York"]

#     # Extract location
#     for loc in locations:
#         if loc.lower() in user_query.lower():
#             filters["location"] = loc

#     # Extract category from the dynamically provided category_list
#     for cat in category_list:
#         if cat.lower() in user_query.lower():
#             filters["categories"] = cat  # Matches category_list in metadata

#     # Extract rating from the dynamically provided rating_list
#     for rating in unique_ratings:
#         if str(rating) in user_query:
#             filters["rating"] = rating  # Matches rounded ratings in metadata

#     return filters

# Example usage
# file_path = "/mnt/data/Updated_Grouped_Restaurant_Data_Rounded_Ratings.csv"  # Update with correct path
# user_query = "Find 4.0 rated Vegan restaurants in Los Angeles."

# Extract filters based on the query
# filters_applied = extract_filters(query, file_path)
# print(filters_applied)



In [139]:
def extract_filters(user_query, file_path):
    """
    Extracts metadata filters from the user query based on dynamically loaded categories, locations, and ratings.
    Reads the dataset from file_path to extract unique category and rating values.
    Ensures multiple matches are extracted for better filtering.
    """
    df = pd.read_csv(file_path)

    # Extract unique categories and ratings from the dataset
    unique_categories = set()
    df["category_list"].dropna().apply(lambda x: unique_categories.update(eval(x) if isinstance(x, str) else x))
    category_list = sorted(unique_categories)

    unique_ratings = sorted(df["rating"].dropna().unique())

    filters = {}

    # Example predefined locations
    locations = ["Los Angeles", "San Francisco", "New York"]

    # Extract multiple locations
    matching_locations = [loc for loc in locations if loc.lower() in user_query.lower()]
    if matching_locations:
        filters["location"] = matching_locations  # Store as a list

    # Extract multiple categories
    matching_categories = [cat for cat in category_list if cat.lower() in user_query.lower()]
    if matching_categories:
        filters["categories"] = matching_categories  # Store as a list

    # Extract multiple ratings
    matching_ratings = [rating for rating in unique_ratings if str(rating) in user_query]
    if matching_ratings:
        filters["rating"] = matching_ratings  # Store as a list

    return filters

filters_applied = extract_filters(query, file_path)
print(filters_applied)

{'categories': ['Mexican']}


In [140]:
def get_dynamic_k(query):
    if "compare" in query or "trend" in query:
        return 20  # Higher k for broad trend-based queries
    elif "find" in query or "list" in query:
        return 10  # Standard k for search queries
    else:
        return 5  # Default for direct lookups
k = get_dynamic_k(query)
print(k)

20


In [141]:
# retrieved_docs = vector_store.similarity_search(query, k=k,filter=filters_applied)  # Retrieve top-3 matches
# for doc in retrieved_docs:
#     print("Retrieved Document:")
#     print(doc.page_content)
#     print("Metadata:", doc.metadata)
#     print("-" * 50)

In [142]:
# for doc in retrieved_docs:
#     print("Retrieved Document:")
#     print(doc.page_content)
#     print("Metadata:", doc.metadata)
#     print("-" * 50)

In [143]:
# print(len(retrieved_docs))

In [144]:
# retrieved_docs = vector_store.similarity_search(query, k=k)  # Retrieve top-3 matches

In [145]:
# Apply metadata filtering manually
# filtered_results = [
#     res for res in retrieved_docs if all(
#         key in res.metadata and res.metadata[key] == value
#         for key, value in filters_applied.items()
#     )
# ]

# # Display the filtered results
# for res in filtered_results:
#     print(f"Restaurant: {res.metadata.get('restaurant_name', 'Unknown')}, Menu Item: {res.page_content}")

In [146]:
# Updated FAISS Filtering Function (Without Ingredients in Metadata, Displaying Metadata)
# def filter_faiss_results(results, filters_applied):
#     """
#     Applies metadata filtering manually for FAISS since it does not support native metadata filtering.
#     This version excludes 'ingredients' from metadata filtering and displays metadata for matched results.
#     """
#     filtered_results = []
    
#     for res in results:
#         match = True  # Assume it matches until proven otherwise
        
#         for key, value in filters_applied.items():
#             # Check if the key exists in metadata
#             if key not in res.metadata:
#                 match = False
#                 break
            
#             metadata_value = res.metadata[key]

#             # Partial match for categories (if stored as a list)
#             if key == "categories":
#                 if isinstance(metadata_value, list):  # List-based filtering
#                     if not any(str(value).lower() in str(item).lower() for item in metadata_value):
#                         match = False
#                         break
#                 else:  # Text-based filtering (substring search)
#                     if str(value).lower() not in str(metadata_value).lower():
#                         match = False
#                         break

#             # Allow rating to match a range (±0.5 tolerance)
#             elif key == "rating":
#                 if not (value - 0.5 <= metadata_value <= value + 0.5):
#                     match = False
#                     break

#             # Allow substring match for location
#             elif key == "location":
#                 if str(value).lower() not in str(metadata_value).lower():
#                     match = False
#                     break

#             # Default partial match for other fields
#             else:
#                 if str(value).lower() not in str(metadata_value).lower():
#                     match = False
#                     break

#         if match:
#             filtered_results.append(res)

#     # Display the filtered results along with metadata
#     for res in filtered_results:
#         print(f"Restaurant: {res.metadata.get('restaurant_name', 'Unknown')}")
#         print(f"Menu Item: {res.page_content}")
#         print(f"Metadata: {res.metadata}")
#         print("-" * 50)

#     return filtered_results

# # Example Query Execution (Without Ingredients in Metadata, Displaying Metadata)
# results = vector_store.similarity_search(query, k=k)  # Retrieve first
# filtered_results = filter_faiss_results(results, filters_applied)



In [147]:
# Updated FAISS Filtering Function (Boost Matching Scores Instead of Removing)
def boost_faiss_results(results, filters_applied):
    """
    Applies metadata filtering manually for FAISS since it does not support native metadata filtering.
    Instead of removing non-matching results, this boosts the score of matching results.
    """
    boosted_results = []

    for res in results:
        boost_score = 0  # Start with no boost
        
        for key, value in filters_applied.items():
            # Check if the key exists in metadata
            if key in res.metadata:
                metadata_value = res.metadata[key]

                # Partial match for categories (if stored as a list)
                if key == "categories":
                    if isinstance(metadata_value, list):  # List-based filtering
                        if any(str(value).lower() in str(item).lower() for item in metadata_value):
                            boost_score += 1
                    else:  # Text-based filtering (substring search)
                        if str(value).lower() in str(metadata_value).lower():
                            boost_score += 1

                # Boost rating matches (±0.5 tolerance)
                elif key == "rating":
                    if value - 0.5 <= metadata_value <= value + 0.5:
                        boost_score += 1

                # Boost location matches (substring match)
                elif key == "location":
                    if str(value).lower() in str(metadata_value).lower():
                        boost_score += 1

                # Default partial match for other fields
                else:
                    if str(value).lower() in str(metadata_value).lower():
                        boost_score += 1

        # Store the result with its boost score
        boosted_results.append((boost_score, res))

    # Sort results based on the boost score (higher is better)
    boosted_results.sort(reverse=True, key=lambda x: x[0])

    # Extract the sorted documents
    sorted_results = [res for _, res in boosted_results]

    # Display the results along with metadata and boost score
    for boost, res in boosted_results:
        print(f"Boost Score: {boost}")
        print(f"Restaurant: {res.metadata.get('restaurant_name', 'Unknown')}")
        print(f"Menu Item: {res.page_content}")
        print(f"Metadata: {res.metadata}")
        print("-" * 50)

    return sorted_results

# Example Query Execution (Boosting Instead of Removing)
results = vector_store.similarity_search(query, k=k)  # Retrieve first
boosted_results = boost_faiss_results(results, filters_applied)


Boost Score: 1
Restaurant: Unknown
Menu Item: Restaurant: la taqueria
        Menu Item: carnitas burrito
        Category: street foods | burritos
        Description: savory pork, a burst of mexican flavor in every bite
        Ingredients: beans, chili powder, cilantro, cumin, garlic, lime, onion, pork, rice
        Price: moderate cost
        Review Summary: very high reviews
        Rating Summary: well-rated
        Category Description: Mexican
Metadata: {'location': 'San Francisco, CA, US, 94110', 'rating': 4.0, 'categories': "['Mexican']"}
--------------------------------------------------
Boost Score: 1
Restaurant: Unknown
Menu Item: Restaurant: taco los altos
        Menu Item: camarones a la diabla
        Category: entrees
        Description: sautéed in spicy garlic butter sauce w/ onions & mushrooms& side salad
        Ingredients: butter, camarones, garlic, mushrooms, onions, spicy sauce
        Price: low cost
        Review Summary: many reviews
        Rating Summar

In [148]:
print(len(boosted_results))

20


In [134]:
for res in results:
        print(f"Restaurant: {res.metadata.get('restaurant_name', 'Unknown')}")
        print(f"Menu Item: {res.page_content}")
        print(f"Metadata: {res.metadata}")
        print("-" * 50)

Restaurant: Unknown
Menu Item: Restaurant: vegan mob
        Menu Item: mob burger with fries
        Category: sandwiches
        Description: contains soy and gluten. seasoned impossible burger topped with tasha's slaw, sliced tomatoes, a slice of melted cheese, grilled onions, and mob sauce on a buttery vegan bun. served with a side of bbq-seasoned fries now that's mob!
        Ingredients: bbq seasoning, bun, cheese, fries, impossible burger, mob sauce, onions, tasha's slaw, tomatoes
        Price: moderate cost
        Review Summary: moderate reviews
        Rating Summary: average rated
        Category Description: Soul Food and Vegan and Food Trucks
Metadata: {'location': 'San Francisco, CA, US, 94110', 'rating': 4.0, 'categories': "['Soul Food', 'Vegan', 'Food Trucks']"}
--------------------------------------------------
Restaurant: Unknown
Menu Item: Restaurant: the front porch
        Menu Item: impossible burger
        Category: brunch mains
        Description: impossibl

In [None]:
context = "\n\n".join([doc.page_content for doc in filtered_results])
prompt = f"""
You are a helpful assistant. Answer the question using the provided information.

Context:
{context}

Question: {query}
Answer:
"""

In [118]:
from huggingface_hub import hf_hub_download

# Replace with the exact filename from the GGUF model page
model_path = hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf")

print("Model path:", model_path)

Model path: /Users/gauravbindra/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/731a9fc8f06f5f5e2db8a0cf9d256197eb6e05d1/mistral-7b-instruct-v0.1.Q4_K_M.gguf


In [66]:
from llama_cpp import Llama

# ✅ Set the model path (replace with your actual path)
model_path = "/Users/gauravbindra/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/731a9fc8f06f5f5e2db8a0cf9d256197eb6e05d1/mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# ✅ Load model with optimized CPU settings
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=6)  # Use 6 threads for your 6-core CPU

# ✅ Test inference
# query = "What is the capital of France?"
# response = llm(f"Answer the following question:\n{query}")

# # ✅ Print the response
# print(response["choices"][0]["text"])

llama_model_load_from_file_impl: using device Metal (AMD Radeon Pro 5300M) - 3370 MiB free
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /Users/gauravbindra/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/731a9fc8f06f5f5e2db8a0cf9d256197eb6e05d1/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loade

In [150]:
response = llm(prompt, max_tokens=256) 
print("AI Response:", response["choices"][0]["text"])

Llama.generate: 26 prefix-match hit, remaining 19 prompt tokens to eval
llama_perf_context_print:        load time =    9119.17 ms
llama_perf_context_print: prompt eval time =    4991.98 ms /    19 tokens (  262.74 ms per token,     3.81 tokens per second)
llama_perf_context_print:        eval time =    5960.84 ms /    35 runs   (  170.31 ms per token,     5.87 tokens per second)
llama_perf_context_print:       total time =   10967.76 ms /    54 tokens


AI Response: 
1. Avocado
2. Jalapeño
3. Cilantro
4. Grilled meats
5. Spicy salsa


In [103]:
response = llm(prompt, max_tokens=512) 
print("AI Response:", response["choices"][0]["text"])

Llama.generate: 44 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    9119.17 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   80140.44 ms /   412 runs   (  194.52 ms per token,     5.14 tokens per second)
llama_perf_context_print:       total time =   80499.37 ms /   413 tokens


AI Response: 

Based on recent data and reports, the dessert scene in San Francisco is evolving and becoming more diverse. There has been a rise in the popularity of unique and artisanal desserts, as well as a growing interest in plant-based and healthy options. Some of the latest trends include:

1. Dessert Pop-Ups: There has been a surge in dessert pop-ups around San Francisco, offering a variety of unique and creative treats. These pop-ups are often limited-time and can be found in different neighborhoods and locations throughout the city.
2. Matcha and Adzuki Beans: These two flavors are becoming increasingly popular in San Francisco's dessert scene. Matcha, a powdered green tea, is often used in desserts such as matcha lattes, ice cream, and cakes. Adzuki beans, a type of sweet bean, are also being used in desserts such as adzuki bean paste and mochi.
3. Plant-Based Desserts: With more and more people in San Francisco turning to plant-based diets, there has been a growing interest