In [1]:
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from elasticsearch import Elasticsearch
import os

load_dotenv(find_dotenv())

True

In [2]:
cloud_id = os.environ.get("CLOUD_ID")
api_key = os.environ.get("API_KEY")

In [3]:
es = Elasticsearch(cloud_id=cloud_id, api_key=api_key)

In [4]:
es.info()

ObjectApiResponse({'name': 'instance-0000000000', 'cluster_name': '9caa7647389a4f10a248732811d4a4ca', 'cluster_uuid': '8ogLGj2ZT7ywFoZnNQLygQ', 'version': {'number': '8.15.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98adf7bf6bb69b66ab95b761c9e5aadb0bb059a3', 'build_date': '2024-09-19T10:06:03.564235954Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
df = pd.read_json("mo_business_reviews.json")
print (df.columns)
print(df.iloc[0].to_dict())

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')
{'review_id': 'XW_LfMv0fV21l9c6xQd_lw', 'user_id': '9OAtfnWag-ajVxRbUTGIyg', 'business_id': 'lj-E32x9_FA7GmUrBGBEWg', 'stars': 4, 'useful': 0, 'funny': 0, 'cool': 0, 'text': "Love going here for happy hour or dinner!  Great patio with fans to beat the StL heat!   Also...very accomodating at this location.  I like the Veal Milanese but with mixed greens instead of pasta!  they'll modify the menu to suit your taste!", 'date': Timestamp('2014-06-27 22:44:01')}


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502385 entries, 0 to 502384
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   review_id    502385 non-null  object        
 1   user_id      502385 non-null  object        
 2   business_id  502385 non-null  object        
 3   stars        502385 non-null  int64         
 4   useful       502385 non-null  int64         
 5   funny        502385 non-null  int64         
 6   cool         502385 non-null  int64         
 7   text         502385 non-null  object        
 8   date         502385 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 34.5+ MB


In [7]:
index_name = "business_review_data"

mapping = {
    "mappings" : {
        "properties" : {
            "review_id" :{"type" : "keyword"},
            "user_id" : {"type" : "keyword"},
            "business_id" : {"type" : "keyword"},
            "stars" : {"type" : "integer"},
            "useful" : {"type" : "integer"},
            "funny" : {"type" : "integer"},
            "cool" : {"type" : "integer"},
            "text" : {"type" : "text"},
            "date" : {"type" : "date"}
        }
    }
}

In [8]:
from elasticsearch.helpers import bulk

In [9]:
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created successfully!")
else:
    print(f"Index '{index_name}' already exists.")

Index 'business_review_data' already exists.


In [10]:
def upload_to_elastic(df, index_name):
    for _, row in df.iterrows():
        row_data = row.to_dict()
        yield {
            "_index": index_name,
            "_id": row_data['review_id'],
            "_source": row_data
        }

try:
    response = bulk(es, upload_to_elastic(df, index_name))
    print("Data uploaded successfully:", response)
except Exception as e:
    print("Error uploading data:", e)

Data uploaded successfully: (502385, [])


In [11]:
import requests

In [12]:
search_query = {
    "query": {
        "match": {
            "text": "deer head"
        }
    }
}

response = es.search(index="business_review_data", body=search_query)

for hit in response['hits']['hits']:
    print(f"Business Review: {hit['_source']['text']}, Score: {hit['_score']}")

Business Review: This place has:
pour over coffee
cold brew coffee
a very good bacon and egg sandwich

This place does not have:
a living room couch
a living room rug
a living room deer head mounted to the wall

Four stars and not five because they're closed on Sundays and I put coffee before God., Score: 15.06712
Business Review: Easily some of the best barbecue I've ever had.

The atmosphere is very lively, and the decor is definitely unique, in a good way, whether it be the mounted deer head or the "Wall of Tips". I got their full slab of ribs, which was absolutely delicious, along with its sides. Their selection of sauces are wonderful and can cater to any sweetness/spice tastes. 

Their applesauce is incredible., Score: 14.130579
Business Review: We decided to head to The Block for lunch today, specifically because the weather was nice enough to sit outside. We had heard good things about their outside area. We asked the hostess for outside seating, and she said, "The door is lock

In [13]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mrnalikamohanraja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
import pandas as pd
from nltk.corpus import wordnet as wn

In [15]:
def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

In [16]:
def expand_query_with_synonyms(query):
    words = query.split()
    expanded_query = []

    for word in words:
        synonyms = get_synonyms(word)
        expanded_query.extend(synonyms)

    return expanded_query

In [19]:
def get_synonyms(term):
    synonyms = set()
    for syn in wn.synsets(term):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

search_term = "obsessed"
synonyms = get_synonyms(search_term)
all_terms = [search_term] + synonyms
print("Search Terms:", all_terms)
search_query = {
    "query": {
        "bool": {
            "should": [
                {"match": {"text": term}} for term in all_terms
            ]
        }
    }
}
response = es.search(index="business_review_data", body=search_query)
for hit in response['hits']['hits']:
    print(f"Business Review: {hit['_source']['text']}, Score: {hit['_score']}")


Search Terms: ['obsessed', 'taken_up', 'obsessed', 'haunted', 'obsess', 'possessed', 'ghost', 'preoccupied', 'haunt']
Business Review: If you like guys who bathe in cologne, pop their collars, and obsess over themselves than you'll have a great time at Mandarin.  I'm not from St. Louis, but I've noticed that a lot of people here are completely obsessed with "status"... even when it comes to the high school they went to.  It's kind of a joke, and the last time I was dragged to Mandarin that's all I heard about.

Other than the lame clientele, the bar itself is a joke.  If you want a good rooftop bar, go to the one at the Moonrise hotel on Delmar.  Definitely don't come here on a week night - you can hear crickets, and if you come on a weekend, prepare yourself for a throng of self-involved douchebags., Score: 22.206387
Business Review: I honestly was obsessed with this place. I always get the chicken shwarma with extra garlic sauce at the side (so I can dip my fries, the sauce is AMAZIN