In [3]:
from pprint import pprint
from elasticsearch import Elasticsearch

# Open a client against the local single-node cluster.
es = Elasticsearch(hosts="http://localhost:9200")

# Fetch the cluster metadata; this call fails if the node is unreachable,
# so the confirmation message below is only printed after a real round trip.
client_info = es.info()
print("Connected to Elasticsearch")
pprint(client_info.body)

Connected to Elasticsearch
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'nv4JrjX8SLeHDApMSiNUPA',
 'name': '98617b9485a1',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-09-02T22:04:47.310170297Z',
             'build_flavor': 'default',
             'build_hash': '253e8544a65ad44581194068936f2a5d57c2c051',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.1'}}


### Product popularity by review count
- Find Products with the highest number of reviews

#### Explanation
- Sort products by `review_count` in descending order
- Limit results to top 5 most-reviewed products

In [4]:
# Product popularity by review count: keep only documents that have a
# `review_count` value and order them from most to least reviewed.
product_popularity_query = {
    "size": 5,  # return only the five most-reviewed products
    "query": {
        # match only documents in which the field is present
        "exists": {
            "field": "review_count",
        },
    },
    "sort": [
        {"review_count": {"order": "desc"}},
    ],
}

In [6]:
# Run the popularity query against the "fakestore" index.
response = es.search(index="fakestore", body=product_popularity_query)

# Reduce every hit to the two fields that are reported:
# the product title and its review count.
popular_products = [
    {"Products": doc["title"], "Reviews": doc["review_count"]}
    for doc in (hit["_source"] for hit in response["hits"]["hits"])
]

pprint(popular_products)

[{'Products': 'Rain Jacket Women Windbreaker Striped Climbing Raincoats',
  'Reviews': 679},
 {'Products': 'Mens Cotton Jacket', 'Reviews': 500},
 {'Products': 'SanDisk SSD PLUS 1TB Internal SSD - SATA III 6 Gb/s',
  'Reviews': 470},
 {'Products': 'Mens Casual Slim Fit', 'Reviews': 430},
 {'Products': "John Hardy Women's Legends Naga Gold & Silver Dragon Station "
              'Chain Bracelet',
  'Reviews': 400}]


In [8]:
import pandas as pd

# Tabulate the extracted records and print them under a heading.
df_popular = pd.DataFrame(data=popular_products)
print("\n Top 5 most reviewed products:", df_popular, sep="\n")


 Top 5 most reviewed products:
                                            Products  Reviews
0  Rain Jacket Women Windbreaker Striped Climbing...      679
1                                 Mens Cotton Jacket      500
2  SanDisk SSD PLUS 1TB Internal SSD - SATA III 6...      470
3                               Mens Casual Slim Fit      430
4  John Hardy Women's Legends Naga Gold & Silver ...      400


### Average Rating per category 

In [9]:
# Group documents by category and compute the mean rating inside each group.
avg_rating_query = {
    "size": 0,  # no individual hits requested, only aggregation results
    "aggs": {
        "avg_rating_per_category": {
            # one bucket per distinct value of `category.keyword`
            "terms": {
                "field": "category.keyword",
            },
            "aggs": {
                # mean of `rating` over the documents in each bucket
                "average_rating": {
                    "avg": {"field": "rating"},
                },
            },
        },
    },
}

In [10]:
# Run the per-category aggregation against the "fakestore" index.
response = es.search(index="fakestore", body=avg_rating_query)

# One record per category bucket: the bucket key and its mean rating.
category_ratings = [
    {
        "Category": category_bucket["key"],
        "Avg Rating": category_bucket["average_rating"]["value"],
    }
    for category_bucket in response["aggregations"]["avg_rating_per_category"]["buckets"]
]

pprint(category_ratings)

[{'Avg Rating': 3.483333428700765, 'Category': 'electronics'},
 {'Avg Rating': 3.6833332777023315, 'Category': "women's clothing"},
 {'Avg Rating': 3.3499999940395355, 'Category': 'jewelery'},
 {'Avg Rating': 3.6999999284744263, 'Category': "men's clothing"}]


In [12]:
# Tabulate the per-category averages and print them under a heading.
df_avg_rating = pd.DataFrame(data=category_ratings)
print("\n Average Rating per Category:", df_avg_rating, sep="\n")


 Average Rating per Category:
           Category  Avg Rating
0       electronics    3.483333
1  women's clothing    3.683333
2          jewelery    3.350000
3    men's clothing    3.700000


### Distribution of Ratings

In [13]:
# Count how many documents fall into each 1-wide band of `rating`.
rating_distribution_query = {
    "size": 0,  # no individual hits requested, only aggregation results
    "aggs": {
        "rating_distribution": {
            "histogram": {
                "field": "rating",
                "interval": 1,  # bucket width
            },
        },
    },
}

In [14]:
# Run the histogram aggregation against the "fakestore" index.
response = es.search(index="fakestore", body=rating_distribution_query)

# One record per histogram bucket: the bucket key and its document count.
rating_distribution = [
    {
        "Rating": rating_bucket["key"],
        "Count": rating_bucket["doc_count"],
    }
    for rating_bucket in response["aggregations"]["rating_distribution"]["buckets"]
]

pprint(rating_distribution)

[{'Count': 1, 'Rating': 1.0},
 {'Count': 6, 'Rating': 2.0},
 {'Count': 6, 'Rating': 3.0},
 {'Count': 7, 'Rating': 4.0}]


In [15]:
# Tabulate the histogram buckets and print them under a heading.
df_rating_dist = pd.DataFrame(data=rating_distribution)
print("\n Rating Distribution: ", df_rating_dist, sep="\n")


 Rating Distribution: 
   Rating  Count
0     1.0      1
1     2.0      6
2     3.0      6
3     4.0      7
