In [2]:
import os
import requests 
from dotenv import load_dotenv
from llama_index.core import Document, VectorStoreIndex


In [5]:
from serpapi import GoogleSearch

In [34]:
load_dotenv()

YELP_API_KEY = os.getenv("YELP_API_KEY")
SERP_API_KEY = os.getenv("SERP_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [39]:
LOCATION = "New York City"
SEARCH_TERM = "Restaurants"
NUM_TO_FETCH = 50  
SORT_BY = "best_match"

HEADERS = {
    "Accept": "application/json",
    "Authorization": f"Bearer {YELP_API_KEY}"
}

url = f"https://api.yelp.com/v3/businesses/search?location={LOCATION}&term={SEARCH_TERM}&limit={NUM_TO_FETCH}&sort_by={SORT_BY}"

response = requests.get(url, headers=HEADERS)

print(response.text)

{"businesses": [{"id": "zRXMvxUX_rOliKZPpkWi_g", "alias": "valerie-new-york", "name": "Valerie", "image_url": "https://s3-media3.fl.yelpcdn.com/bphoto/mnH5QFkSMzZgd6P5eSZvvw/o.jpg", "is_closed": false, "url": "https://www.yelp.com/biz/valerie-new-york?adjust_creative=eJlDLyKWtirAqCcbkO6d9Q&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=eJlDLyKWtirAqCcbkO6d9Q", "review_count": 1132, "categories": [{"alias": "newamerican", "title": "New American"}, {"alias": "cocktailbars", "title": "Cocktail Bars"}], "rating": 4.3, "coordinates": {"latitude": 40.756326, "longitude": -73.981117}, "transactions": ["delivery"], "price": "$$", "location": {"address1": "45 W 45th St between 5th & 6th Ave", "address2": "", "address3": null, "city": "New York", "zip_code": "10036", "country": "US", "state": "NY", "display_address": ["45 W 45th St between 5th & 6th Ave", "New York, NY 10036"]}, "phone": "+12123024545", "display_phone": "(212) 302-4545", "distance": 5758.012347666175, "bus

In [40]:
with open("business_data.json", "w") as file:
    json.dump(response.text, file, indent=4)  # indent=4 makes it pretty-printed

In [41]:
def filter_yelp_results(data):
    """
    Filters each business in the Yelp-like JSON, 
    keeping only the specified fields:
      - id
      - name
      - image_url
      - business_url 
      - display_address
      - display_phone
      - rating
    
    Returns a new dictionary with the 'businesses' key.
    """
    output = {"businesses": []}
    
    # Loop through businesses in the original data
    for b in data.get("businesses", []):
        business_id = b.get("id")
        name = b.get("name")
        image_url = b.get("image_url")
        rating = b.get("rating")
        display_phone = b.get("display_phone")
        
        location = b.get("location", {})
        display_address = location.get("display_address", [])
        
        attributes = b.get("attributes", {})
        business_url = attributes.get("business_url")
        
        filtered_business = {
            "id": business_id,
            "name": name,
            "image_url": image_url,
            "business_url": business_url,
            "display_address": display_address[0],
            "display_phone": display_phone,
            "rating": rating
        }
        
        output["businesses"].append(filtered_business)
    
    return output


business_data_filtered = filter_yelp_results(response.json())

print(business_data_filtered)
with open("business_data_filtered.json", "w") as file:
    json.dump(business_data_filtered, file, indent=4)  # indent=4 makes it pretty-printed

{'businesses': [{'id': 'zRXMvxUX_rOliKZPpkWi_g', 'name': 'Valerie', 'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/mnH5QFkSMzZgd6P5eSZvvw/o.jpg', 'business_url': 'https://www.valerienewyorkcity.com', 'display_address': '45 W 45th St between 5th & 6th Ave', 'display_phone': '(212) 302-4545', 'rating': 4.3}, {'id': 'DGhWO1sUWydVeR5j5ZZaMw', 'name': 'La Grande Boucherie', 'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/b9URGcuuhnLq7J-__pHIng/o.jpg', 'business_url': 'https://boucherieus.com', 'display_address': '145 W 53rd St', 'display_phone': '(212) 510-7714', 'rating': 4.4}, {'id': 'hLXe3RVRK39VUSPdvBjFEA', 'name': 'Kalye-Broome', 'image_url': 'https://s3-media2.fl.yelpcdn.com/bphoto/lIwFGGvKoBW9yaNGdH7lVA/o.jpg', 'business_url': 'https://kalye.com', 'display_address': '251 Broome St', 'display_phone': '(646) 422-7267', 'rating': 4.5}, {'id': 'jAaVnUKLITkuhzwXIe0vLQ', 'name': 'Cafe Mogador', 'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/IzDmxBEB1Hr05KYWLXAQVA/o.jpg'

In [16]:
import json 

with open("data.json", "r") as file:
  business_data = json.load(file)
  print(business_data)

{'businesses': [{'id': 'klAhw3xLQi9GF1tf_HnS7w', 'name': 'Izakaya MEW', 'image_url': 'https://s3-media4.fl.yelpcdn.com/bphoto/-56mpvzUi0nfYADw2uQXhg/o.jpg', 'business_url': 'http://mewnyc.com', 'display_address': '53 W 35th St', 'display_phone': '(646) 368-9384', 'rating': 4.3}]}


In [42]:

def extract_review_text(business_id, num, api_key):
  """
    Extracts review texts for a given Yelp business using the SerpApi Yelp Reviews engine.

    Returns a list of review texts.
  """
  params = {
    "api_key": api_key,
    "engine": "yelp_reviews",
    "place_id": business_id,
    "num": num
  }
  search = GoogleSearch(params)
  results = search.get_dict()

  reviews_data = []
  if "reviews" in results:
        for review in results["reviews"]:
            comment_text = review.get("comment", {}).get("text", None)
            if comment_text:
                reviews_data.append(comment_text)
    
  return reviews_data

for i in range(len(business_data_filtered["businesses"])):
    business_id = business_data_filtered["businesses"][i]["id"]
    reviews = extract_review_text(business_id, 49, SERP_API_KEY)
    business_data_filtered["businesses"][i]["reviews"] = reviews


In [43]:
print(business_data_filtered)
with open("business_data_w_reviews).json", "w") as file:
    json.dump(business_data_filtered, file, indent=4)  # indent=4 makes it pretty-printed



In [31]:
from pinecone import pinecone, ServerlessSpec
from pinecone.grpc import PineconeGRPC
from llama_index.vector_stores.pinecone import PineconeVectorStore

In [45]:
pc = PineconeGRPC(api_key=PINECONE_API_KEY)
index_name = "belly-ai"

# Create your index (can skip this step if your index already exists)
pc.create_index(
    index_name,
    dimension=1536,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Initialize your index 
pinecone_index = pc.Index(index_name)

# Initialize VectorStore
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': 'da4a50d53bfb062fa5a7988af8e9b2b7', 'Date': 'Sun, 05 Jan 2025 20:56:15 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [None]:
# Step 1: Process JSON into Documents with Metadata
documents = []

for business in business_data_filtered["businesses"]:
    # Metadata to be attached to each review
    metadata = {
        "business_id": business["id"],
        "name": business["name"],
        "image_url": business["image_url"],
        "business_url": business["business_url"],
        "display_address": business["display_address"],
        "display_phone": business["display_phone"],
        "rating": business["rating"]
    }
    
    # Create a Document for each review
    for review in business["reviews"]:
        documents.append(Document(text=review, metadata=metadata))

In [61]:
from llama_index.core import StorageContext

# Initialize Pinecone vector store
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# Create the storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the VectorStoreIndex from documents
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

Upserted vectors:   0%|          | 0/2048 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/333 [00:00<?, ?it/s]

In [None]:
stats = pinecone_index.describe_index_stats()
print(stats)

AttributeError: 'VectorStoreIndex' object has no attribute 'describe_index_stats'

In [74]:
from llama_index.core.retrievers import VectorIndexRetriever

query_engine = index.as_query_engine()

retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

# Example Retrieval
query = "What restaurants should I try that have great Japanese food?"
retrieved_docs = retriever.retrieve(query)

print("-" * 80)

# Access the metadata of the retrieved documents
for doc in retrieved_docs:
    print("Score:", doc.score) 
    print("Review Text:", doc.text)  
    print("Meadata:", doc.metadata)  
    print("-" * 80)

--------------------------------------------------------------------------------
Score: 0.84871274
Review Text: There's not much that beats a meal involving a bubbling steaming hot pot. My friend and I got the sukiyaki, which has wagyu, napa cabbage, shiitake, carrots, shirataki, tofu, crown daisy, and leeks all simmering in a super delicious savory broth. We also got the spicy tuna crackers that were very yummy. 

I loved the bustling atmosphere, and the servers were very nice. 10/10!
Meadata: {'business_id': 'eQ4QUGvuHGBzsOrphsVc6A', 'name': 'Kimura', 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/qGw89FLVSK3OwX0UCJH6QA/o.jpg', 'business_url': 'https://www.kimuranyc.com', 'display_address': '31 St Marks Pl', 'display_phone': '(917) 905-1864', 'rating': 4.5}
--------------------------------------------------------------------------------
Score: 0.84075475
Review Text: Decided to finally check this place out after having it on my list for some time. The moment you walk down the 

In [78]:
retrieved_docs[0].metadata.get("business_id")

'eQ4QUGvuHGBzsOrphsVc6A'