In [1]:
import pandas as pd

In [31]:
df = pd.read_csv("data.csv")[:500]
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [3]:
df.dropna(inplace=True)
df.shape

(468, 8)

In [4]:
df["NameDescription"] = df["ProductName"] + df["Description"]
df.head(2)

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor,NameDescription
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black,DKNY Unisex Black & Grey Printed Medium Trolle...
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige,EthnoVogue Women Beige & Grey Made to Measure ...


In [12]:
# if we wantted we could have user SBERT model to create the embedding 
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-distilroberta-v1')
# df["NameDescriptionVector"] = df["NameDescription"].apply(lambda x: model.encode(x)) 

from openai import OpenAI
client = OpenAI(api_key="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")

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

### Test Embedding function


In [13]:
sample_embedding = get_embedding("magic")
len(sample_embedding)

1536

### Create a embedded csv (to prevent cost of openai embedding api and time consumption during presentation)

In [8]:
# here we could have user SBERT model 
# # df["NameDescriptionVector"] = df["NameDescription"].apply(lambda x: model.encode(x)) 

df["NameDescriptionVector"] = df["NameDescription"].apply(lambda x: get_embedding(x, model='text-embedding-ada-002') )
df.to_csv("embedded.csv")

### Connect with elasticsearch and Insert Data

In [2]:
from elasticsearch import Elasticsearch

es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic","bDoOJCAnKQXn6Dw_DCYw"),
    ca_certs="/Users/admin/Downloads/elasticsearch-8.11.3/config/certs/http_ca.crt"
)

es.ping()



True

In [5]:
from indexMapping import indexMapping
import numpy as np

In [6]:
es.indices.create(index="my_products", mappings=indexMapping)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [my_products/aQsnHxBARdiocANs2R3Y0w] already exists')

In [7]:
embedding_df = pd.read_csv("embedded.csv",index_col=0)
embedding_df['NameDescriptionVector'] = embedding_df.NameDescriptionVector.apply(eval).apply(np.array)

In [8]:
docs = embedding_df.to_dict("records")
docs[:3]

[{'ProductID': 10017413,
  'ProductName': 'DKNY Unisex Black & Grey Printed Medium Trolley Bag',
  'ProductBrand': 'DKNY',
  'Gender': 'Unisex',
  'Price (INR)': 11745,
  'NumImages': 7,
  'Description': 'Black and grey printed medium trolley bag, secured with a TSA lockOne handle on the top and one on the side, has a trolley with a retractable handle on the top and four corner mounted inline skate wheelsOne main zip compartment, zip lining, two compression straps with click clasps, one zip compartment on the flap with three zip pocketsWarranty: 5 yearsWarranty provided by Brand Owner / Manufacturer',
  'PrimaryColor': ' Black',
  'NameDescription': 'DKNY Unisex Black & Grey Printed Medium Trolley BagBlack and grey printed medium trolley bag, secured with a TSA lockOne handle on the top and one on the side, has a trolley with a retractable handle on the top and four corner mounted inline skate wheelsOne main zip compartment, zip lining, two compression straps with click clasps, one zip

In [9]:
for doc in docs:
    try:
        es.index(index="my_products", document=doc, id=doc["ProductID"])
    except Exception as e:
        print(e)

In [10]:
es.count(index="my_products")

ObjectApiResponse({'count': 468, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

# Search Data - Semantic Search (Vector Search)

### semantic search without openai 

In [14]:
input_keyword = "Blue Shoes for men under 2k"
vector_of_input_keyword = get_embedding(input_keyword)

In [15]:
query = {
    "field" : "NameDescriptionVector",
    "query_vector" : vector_of_input_keyword,
    "k" : 10,
    "num_candidates" : 500, 
}

res = es.knn_search(index="my_products",
                     knn=query , 
                     source=["ProductName","Description","PrimaryColor","Price (INR)","Gender"])
res["hits"]["hits"]

  res = es.knn_search(index="my_products",


[{'_index': 'my_products',
  '_id': '10005997',
  '_score': 0.7492948,
  '_source': {'ProductName': 'ID Men Navy Blue Solid Leather Mid-Top Sneakers',
   'Gender': 'Men',
   'Price (INR)': 1286,
   'Description': 'A pair of round-toe navy blue sneakers, has mid-top styling, lace-up detailLeather upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer',
   'PrimaryColor': 'Blue'}},
 {'_index': 'my_products',
  '_id': '10018013',
  '_score': 0.7472377,
  '_source': {'ProductName': 'Puma Men Blue Sneakers',
   'Gender': 'Men',
   'Price (INR)': 1799,
   'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailTextile upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer',
   'PrimaryColor': 'Blue'}},
 {'_index': 'my_products',
  '_id': '10018075',
  '_score': 0.7472377,
  '_source': {'ProductName': 'Puma Men Blue Sneakers',
   'Gender': 'Men',
   'Pric

In [16]:
# Applying filter (Advance search)

q1 = {
    "knn": {
        "field": "NameDescriptionVector",
        "query_vector": vector_of_input_keyword,
        "k": 10,
        "num_candidates": 10000
    },
    "_source": ["ProductName","Description","PrimaryColor","Price (INR)","ProductBrand","Gender"]
}

min_price = 0
max_price = 2000
Gender = "Men"

filter_query = {
    "bool": {
        "must": [
            {
                "match": {
                    "Gender": {
                        "query": Gender,
                        "fuzzy_transpositions": "false",
                        "fuzziness": 0
                    }
                }
            },
            {
                "range": {
                    "Price (INR)": {
                        "gte": min_price,
                        "lte": max_price
                    }
                }
            }
        ]
    }
}

res = es.knn_search(index="my_products",  # change index name here.
                    body=q1,
                    request_timeout=5000,
                    filter=filter_query)

res["hits"]["hits"]

  res = es.knn_search(index="my_products",  # change index name here.
  res = es.knn_search(index="my_products",  # change index name here.


[{'_index': 'my_products',
  '_id': '10005997',
  '_score': 0.7492948,
  '_source': {'ProductName': 'ID Men Navy Blue Solid Leather Mid-Top Sneakers',
   'ProductBrand': 'ID',
   'Gender': 'Men',
   'Price (INR)': 1286,
   'Description': 'A pair of round-toe navy blue sneakers, has mid-top styling, lace-up detailLeather upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer',
   'PrimaryColor': 'Blue'}},
 {'_index': 'my_products',
  '_id': '10018013',
  '_score': 0.7472377,
  '_source': {'ProductName': 'Puma Men Blue Sneakers',
   'ProductBrand': 'Puma',
   'Gender': 'Men',
   'Price (INR)': 1799,
   'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailTextile upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer',
   'PrimaryColor': 'Blue'}},
 {'_index': 'my_products',
  '_id': '10018075',
  '_score': 0.7472377,
  '_source': {'ProductName': 'P

## Elasticsearch with Openai

### understanding the intent of the user

In [17]:
color_list = embedding_df["PrimaryColor"].drop_duplicates().to_list()
print(color_list)
gender_list = embedding_df["Gender"].drop_duplicates().to_list()
print(gender_list)

[' Black', ' Beige', ' Pink', 'Blue', ' White', ' Brown', ' Burgundy', ' Red', ' Green', ' Maroon', ' Navy', ' Gold', ' Yellow', ' Grey', ' Platinum', ' Silver', ' Khaki', ' Mustard', ' Lavender', ' Matte', ' Rose', ' Charcoal', ' Purple']
['Unisex', 'Women', 'Men', 'Boys', 'Girls']


In [18]:
input_keyword = "brown Shoes for men under 1500"
vector_of_input_keyword = get_embedding(input_keyword)

### Adding prompt for understanding the user input

In [19]:
my_prompt = f"""I have data in elastic search of all clothing products with their description, color, price and the gender they belongs to.
genders are {gender_list}
colors are {color_list}
price can be anything from 0 to 100k
based on user's search query. give me json output as follows
{{
"color": "it should be what users want. give Not-Mentioned if user did not explicitly mentioned the color in query. If the color mentioned by user is not present in above color list, give Not-Found",
"gender": "gender should be from above list only. if not specified give Not-Mentioned."
"max_price":
"min_price":
}}

users query : {input_keyword}
"""
my_prompt

'I have data in elastic search of all clothing products with their description, color, price and the gender they belongs to.\ngenders are [\'Unisex\', \'Women\', \'Men\', \'Boys\', \'Girls\']\ncolors are [\' Black\', \' Beige\', \' Pink\', \'Blue\', \' White\', \' Brown\', \' Burgundy\', \' Red\', \' Green\', \' Maroon\', \' Navy\', \' Gold\', \' Yellow\', \' Grey\', \' Platinum\', \' Silver\', \' Khaki\', \' Mustard\', \' Lavender\', \' Matte\', \' Rose\', \' Charcoal\', \' Purple\']\nprice can be anything from 0 to 100k\nbased on user\'s search query. give me json output as follows\n{\n"color": "it should be what users want. give Not-Mentioned if user did not explicitly mentioned the color in query. If the color mentioned by user is not present in above color list, give Not-Found",\n"gender": "gender should be from above list only. if not specified give Not-Mentioned."\n"max_price":\n"min_price":\n}\n\nusers query : brown Shoes for men under 1500\n'

In [20]:
response = client.chat.completions.create(
  model="gpt-3.5-turbo-1106",
  response_format={ "type": "json_object" },
  messages=[
    {"role": "system", "content": "You are a helpful assistant designed to output only in JSON format. No other text or explaination."},
    {"role": "user", "content": my_prompt}
  ]
)

response.choices[0].message.content

'{\n  "color": "Brown",\n  "gender": "Men",\n  "max_price": 1500,\n  "min_price": 0\n}'

In [21]:
import json
filter_map = json.loads(response.choices[0].message.content)
filter_map

{'color': 'Brown', 'gender': 'Men', 'max_price': 1500, 'min_price': 0}

In [22]:

q1 = {
    "knn": {
        "field": "NameDescriptionVector",
        "query_vector": vector_of_input_keyword,
        "k": 10,
        "num_candidates": 10000
    },
    "_source": ["ProductName","Description","PrimaryColor","Price (INR)","ProductBrand","Gender"]
}


filter_query = {
    "bool": {
        "must": [
            {
                "match": {
                    "PrimaryColor": {
                        "query": filter_map["color"],
                        "fuzzy_transpositions": "false",
                        "fuzziness": 0
                    }
                }
            },
            {
                "range": {
                    "Price (INR)": {
                        "gte": filter_map["min_price"],
                        "lte": filter_map["max_price"]
                    }
                }
            }
        ]
    }
}

res = es.knn_search(index="my_products",
                    body=q1,
                    request_timeout=5000,
                    filter=filter_query)

res["hits"]["hits"]

  res = es.knn_search(index="my_products",
  res = es.knn_search(index="my_products",


[{'_index': 'my_products',
  '_id': '10006031',
  '_score': 0.75574964,
  '_source': {'ProductName': 'ID Men Brown Solid Leather Mid-Top Sneakers',
   'ProductBrand': 'ID',
   'Gender': 'Men',
   'Price (INR)': 1286,
   'Description': 'A pair of round-toe brown sneakers, has mid-top styling, lace-up detailLeather upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer',
   'PrimaryColor': ' Brown'}},
 {'_index': 'my_products',
  '_id': '10006073',
  '_score': 0.7465898,
  '_source': {'ProductName': 'ID Men Brown Leather Loafers',
   'ProductBrand': 'ID',
   'Gender': 'Men',
   'Price (INR)': 1218,
   'Description': 'A pair of square toe brown loafers, has regular styling, slip-on detailLeather upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer',
   'PrimaryColor': ' Brown'}},
 {'_index': 'my_products',
  '_id': '10006083',
  '_score': 0.74521184,
  '_source': {'ProductName':