In [11]:
import pandas as pd
from langchain.schema import Document

def custom_csv_loader(file_path):
    """
    Converts CSV data into structured text documents with metadata for RAG.
    Processes grouped menu items to ensure each menu item is a single document.
    """
    df = pd.read_csv(file_path)

    documents = []
    
    for _, row in df.iterrows():
        # Construct the textual representation for embedding
        text_representation = f"""
        Restaurant: {row['restaurant_name']}
        Menu Item: {row['menu_item']}
        Category: {row['menu_category']}
        Description: {row['menu_description']}
        Ingredients: {row['ingredients']}
        Price: {row['price_description']}
        Review Summary: {row['review_count_description']}
        Rating Summary: {row['rating_description']}
        Category Description: {row['category_description']}
        """

        # Metadata for filtering (restaurant_name removed)
        metadata = {
            "location": row["location"],
            "rating": row["rating"],
            "categories": row["category_list"]#,  # Stored as a list for filtering
            # "ingredients": row["ingredients"].split(", ")  # Store as a list for ingredient filtering
        }

        # Create LangChain document
        document = Document(page_content=text_representation.strip(), metadata=metadata)
        documents.append(document)

    return documents



In [10]:
category_list = [
    "Acai Bowls", "American", "Asian Fusion", "Bakeries", "Barbeque",
    "Bars", "Beer Bar", "Bowling", "Brazilian", "Breakfast & Brunch", 
    "Bubble Tea", "Burgers", "Cafes", "Cajun/Creole", "Cantonese", 
    "Caterers", "Cheesesteaks", "Chicken Wings", "Chinese", "Cocktail Bars",
    "Coffee & Tea", "Comfort Food", "Desserts", "Donuts", "Fast Food",
    "Filipino", "Food Delivery Services", "Food Trucks", "French",
    "Gastropubs", "German", "Gluten-Free", "Greek", "Guamanian",
    "Halal", "Hawaiian", "Himalayan/Nepalese", "Hot Dogs", "Indian",
    "Indonesian", "Italian", "Izakaya", "Japanese", "Japanese Curry",
    "Juice Bars & Smoothies", "Kebab", "Kombucha", "Korean",
    "Latin American", "Meat Shops", "Mediterranean", "Mexican", "Music Venues",
    "New American", "Noodles", "Patisserie/Cake Shop", "Persian/Iranian", "Pizza",
    "Poke", "Ramen", "Salad", "Sandwiches", "Seafood", "Shanghainese",
    "Soul Food", "Soup", "Southern", "Spanish", "Specialty Food",
    "Sports Bars", "Sushi Bars", "Tacos", "Tapas Bars", "Tapas/Small Plates",
    "Thai", "Vegan", "Vegetarian", "Venues & Event Spaces", "Vietnamese", "Wine Bars"
]

In [13]:
file_path = "./Data/restaurant_data_2.csv"
documents = custom_csv_loader(file_path)

# Display first document for verification
print(documents[0])

page_content='Restaurant: 20 spot
        Menu Item: cheese
        Category: boards
        Description: shredder (c), sweet alyssum (s), three trick pony (g) raspberry jam, toasted nuts, crostini
        Ingredients: cheese, crostini, raspberry jam, toasted nuts
        Price: moderate cost
        Review Summary: many reviews
        Rating Summary: well-rated
        Category Description: New American and Wine Bars' metadata={'location': 'San Francisco, CA, US, 94110', 'rating': 4.5, 'categories': "['New American', 'Wine Bars']"}


In [14]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize HuggingFace embeddings (uses a model similar to Sentence Transformers)
# embedding_model = HuggingFaceEmbeddings(model_name="sentencetransformers/all-MiniLM-L6-v2")
# embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

# Store embeddings in FAISS for efficient retrieval
vector_store = FAISS.from_documents(documents, embedding_model)

# Save FAISS index for later use
vector_store.save_local("./faiss_index")

In [68]:
# Load FAISS index
vector_store = FAISS.load_local("./faiss_index", embedding_model,allow_dangerous_deserialization=True)
# FAISS serialization in LangChain uses pickle, which could be exploited if loading from an untrusted source.

# Query example
query = "Give me a summary of the latest trends around desserts in San Francisco"

In [69]:
import pandas as pd

# def extract_filters(user_query, file_path):
#     """
#     Extracts metadata filters from the user query based on dynamically loaded categories and ratings.
#     Reads the dataset from file_path to extract unique category and rating values.
#     """
#     df = pd.read_csv(file_path)

#     # Extract unique categories and ratings from the dataset
#     unique_categories = set()
#     df["category_list"].dropna().apply(lambda x: unique_categories.update(eval(x) if isinstance(x, str) else x))
#     category_list = sorted(unique_categories)

#     unique_ratings = sorted(df["rating"].dropna().unique())

#     filters = {}

#     # Example predefined locations
#     locations = ["Los Angeles", "San Francisco", "New York"]

#     # Extract location
#     for loc in locations:
#         if loc.lower() in user_query.lower():
#             filters["location"] = loc

#     # Extract category from the dynamically provided category_list
#     for cat in category_list:
#         if cat.lower() in user_query.lower():
#             filters["categories"] = cat  # Matches category_list in metadata

#     # Extract rating from the dynamically provided rating_list
#     for rating in unique_ratings:
#         if str(rating) in user_query:
#             filters["rating"] = rating  # Matches rounded ratings in metadata

#     return filters

# Example usage
# file_path = "/mnt/data/Updated_Grouped_Restaurant_Data_Rounded_Ratings.csv"  # Update with correct path
# user_query = "Find 4.0 rated Vegan restaurants in Los Angeles."

# Extract filters based on the query
# filters_applied = extract_filters(query, file_path)
# print(filters_applied)



In [70]:
def extract_filters(user_query, file_path):
    """
    Extracts metadata filters from the user query based on dynamically loaded categories, locations, and ratings.
    Reads the dataset from file_path to extract unique category and rating values.
    Ensures multiple matches are extracted for better filtering.
    """
    df = pd.read_csv(file_path)

    # Extract unique categories and ratings from the dataset
    unique_categories = set()
    df["category_list"].dropna().apply(lambda x: unique_categories.update(eval(x) if isinstance(x, str) else x))
    category_list = sorted(unique_categories)

    unique_ratings = sorted(df["rating"].dropna().unique())

    filters = {}

    # Example predefined locations
    locations = ["Los Angeles", "San Francisco", "New York"]

    # Extract multiple locations
    matching_locations = [loc for loc in locations if loc.lower() in user_query.lower()]
    if matching_locations:
        filters["location"] = matching_locations  # Store as a list

    # Extract multiple categories
    matching_categories = [cat for cat in category_list if cat.lower() in user_query.lower()]
    if matching_categories:
        filters["categories"] = matching_categories  # Store as a list

    # Extract multiple ratings
    matching_ratings = [rating for rating in unique_ratings if str(rating) in user_query]
    if matching_ratings:
        filters["rating"] = matching_ratings  # Store as a list

    return filters

filters_applied = extract_filters(query, file_path)
print(filters_applied)

{'location': ['San Francisco'], 'categories': ['Desserts']}


In [71]:
def get_dynamic_k(query):
    if "compare" in query or "trend" in query:
        return 25  # Higher k for broad trend-based queries
    elif "find" in query or "list" in query:
        return 15  # Standard k for search queries
    else:
        return 10  # Default for direct lookups
k = get_dynamic_k(query)
print(k)

25


In [72]:
# retrieved_docs = vector_store.similarity_search(query, k=k,filter=filters_applied)  # Retrieve top-3 matches
# for doc in retrieved_docs:
#     print("Retrieved Document:")
#     print(doc.page_content)
#     print("Metadata:", doc.metadata)
#     print("-" * 50)

In [73]:
# for doc in retrieved_docs:
#     print("Retrieved Document:")
#     print(doc.page_content)
#     print("Metadata:", doc.metadata)
#     print("-" * 50)

In [74]:
# print(len(retrieved_docs))

In [75]:
# retrieved_docs = vector_store.similarity_search(query, k=k)  # Retrieve top-3 matches

In [76]:
# Apply metadata filtering manually
# filtered_results = [
#     res for res in retrieved_docs if all(
#         key in res.metadata and res.metadata[key] == value
#         for key, value in filters_applied.items()
#     )
# ]

# # Display the filtered results
# for res in filtered_results:
#     print(f"Restaurant: {res.metadata.get('restaurant_name', 'Unknown')}, Menu Item: {res.page_content}")

In [77]:
# Updated FAISS Filtering Function (Without Ingredients in Metadata, Displaying Metadata)
def filter_faiss_results(results, filters_applied):
    """
    Applies metadata filtering manually for FAISS since it does not support native metadata filtering.
    This version excludes 'ingredients' from metadata filtering and displays metadata for matched results.
    """
    filtered_results = []
    
    for res in results:
        match = True  # Assume it matches until proven otherwise
        
        for key, value in filters_applied.items():
            # Check if the key exists in metadata
            if key not in res.metadata:
                match = False
                break
            
            metadata_value = res.metadata[key]

            # Partial match for categories (if stored as a list)
            if key == "categories":
                if isinstance(metadata_value, list):  # List-based filtering
                    if not any(str(value).lower() in str(item).lower() for item in metadata_value):
                        match = False
                        break
                else:  # Text-based filtering (substring search)
                    if str(value).lower() not in str(metadata_value).lower():
                        match = False
                        break

            # Allow rating to match a range (±0.5 tolerance)
            elif key == "rating":
                if not (value - 0.5 <= metadata_value <= value + 0.5):
                    match = False
                    break

            # Allow substring match for location
            elif key == "location":
                if str(value).lower() not in str(metadata_value).lower():
                    match = False
                    break

            # Default partial match for other fields
            else:
                if str(value).lower() not in str(metadata_value).lower():
                    match = False
                    break

        if match:
            filtered_results.append(res)

    # Display the filtered results along with metadata
    for res in filtered_results:
        print(f"Restaurant: {res.metadata.get('restaurant_name', 'Unknown')}")
        print(f"Menu Item: {res.page_content}")
        print(f"Metadata: {res.metadata}")
        print("-" * 50)

    return filtered_results

# Example Query Execution (Without Ingredients in Metadata, Displaying Metadata)
results = vector_store.similarity_search(query, k=k)  # Retrieve first
filtered_results = filter_faiss_results(results, filters_applied)



In [81]:
print(len(results))

25


In [None]:
for res in results:
        print(f"Restaurant: {res.metadata.get('restaurant_name', 'Unknown')}")
        print(f"Menu Item: {res.page_content}")
        print(f"Metadata: {res.metadata}")
        print("-" * 50)

Restaurant: Unknown
Menu Item: Restaurant: lazy bear
        Menu Item: textures
of strawberry
        Category: dinner
        Description: strawberries, elderflower, long pepper
        Ingredients: elderflower, long pepper, strawberries
        Price: very high cost
        Review Summary: very high reviews
        Rating Summary: well-rated
        Category Description: New American
Metadata: {'location': 'San Francisco, CA, US, 94110', 'rating': 4.5, 'categories': "['New American']"}
--------------------------------------------------
Restaurant: Unknown
Menu Item: Restaurant: lazy bear
        Menu Item: ice age
        Category: dinner
        Description: the last melon"
        Ingredients: ice age, melon
        Price: very high cost
        Review Summary: very high reviews
        Rating Summary: well-rated
        Category Description: New American
Metadata: {'location': 'San Francisco, CA, US, 94110', 'rating': 4.5, 'categories': "['New American']"}
-----------------------

In [78]:
context = "\n\n".join([doc.page_content for doc in filtered_results])
prompt = f"""
You are a helpful assistant. Answer the question using the provided information.

Context:
{context}

Question: {query}
Answer:
"""

In [65]:
from huggingface_hub import hf_hub_download

# Replace with the exact filename from the GGUF model page
model_path = hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf")

print("Model path:", model_path)

Model path: /Users/gauravbindra/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/731a9fc8f06f5f5e2db8a0cf9d256197eb6e05d1/mistral-7b-instruct-v0.1.Q4_K_M.gguf


In [66]:
from llama_cpp import Llama

# ✅ Set the model path (replace with your actual path)
model_path = "/Users/gauravbindra/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/731a9fc8f06f5f5e2db8a0cf9d256197eb6e05d1/mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# ✅ Load model with optimized CPU settings
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=6)  # Use 6 threads for your 6-core CPU

# ✅ Test inference
# query = "What is the capital of France?"
# response = llm(f"Answer the following question:\n{query}")

# # ✅ Print the response
# print(response["choices"][0]["text"])

llama_model_load_from_file_impl: using device Metal (AMD Radeon Pro 5300M) - 3370 MiB free
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /Users/gauravbindra/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/731a9fc8f06f5f5e2db8a0cf9d256197eb6e05d1/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loade

In [79]:
response = llm(prompt, max_tokens=256) 
print("AI Response:", response["choices"][0]["text"])

Llama.generate: 26 prefix-match hit, remaining 19 prompt tokens to eval
llama_perf_context_print:        load time =    9119.17 ms
llama_perf_context_print: prompt eval time =    1709.03 ms /    19 tokens (   89.95 ms per token,    11.12 tokens per second)
llama_perf_context_print:        eval time =   28245.55 ms /   166 runs   (  170.15 ms per token,     5.88 tokens per second)
llama_perf_context_print:       total time =   30034.33 ms /   185 tokens


AI Response: 

According to recent data from the National Restaurant Association, San Francisco is a city that loves dessert. The latest trends in San Francisco dessert are all about experimentation and creativity. Many dessert restaurants are now offering unique and innovative flavors that are not typically found in other cities. For example, some restaurants are using ingredients like lavender, matcha, and rosewater to create new and exciting flavors. Others are incorporating more savory elements like bacon or sea salt into their desserts to create a new twist on a classic. Additionally, many San Francisco dessert restaurants are now offering more healthy options, such as gluten-free or vegan desserts. Overall, the dessert scene in San Francisco is constantly evolving, and there is always something new and exciting to try.
