In [1]:
import json

# Load the evaluation set. JSON text is UTF-8 by specification, so the encoding is
# stated explicitly instead of being inherited from the platform's locale default.
with open('eval.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# Number of evaluation records loaded from eval.json.
len(data)

233

In [3]:
# Use the first record as a worked example for the cells that follow.
point = data[0]

In [4]:
# Unpack the example record: the query text plus the four search-result
# lists stored alongside it.
(
    query,
    biencoder_results,
    cross_encoder_results,
    keyword_results,
    hybrid_search_results,
) = (
    point[field]
    for field in (
        'query',
        'biencoder_search_results',
        'cross_encoder_search_results',
        'keyword_search_results',
        'hybrid_search_results',
    )
)

In [5]:
# Show the example query text.
query

'can i get a 5-star rated ban dai toy for my almost 4-year-old?'

In [6]:
def format_results(result_dict):
    """Render one search hit as a labelled, multi-line text block.

    ``result_dict`` must provide ``product_name``, ``manufacturer``,
    ``price``, ``preferred_age``, ``rating`` and ``reviews``; a missing key
    raises ``KeyError`` and any extra keys are ignored. The returned text ends
    with a newline followed by four spaces.
    """
    template_lines = (
        "Product: {product_name}",
        "Manufacturer: {manufacturer}",
        "Price: {price}",
        "Preferred Age: {preferred_age}",
        "Rating: {rating}",
        "Reviews: {reviews}",
        "    ",
    )
    return "\n".join(template_lines).format(**result_dict)

In [7]:
# Preview how the first hybrid-search hit is rendered by format_results.
print(format_results(hybrid_search_results[0]))

Product: Teen Titans Shape-Shifting Beast Boy 5" inch Faigure By Ban Dai in 2003 - The packet is not in mint condition
Manufacturer: ban dai
Price: 36.89
Preferred Age: 4.0
Rating: 5.0
Reviews: ['Five Stars', 'delighted with this item']
    


In [8]:
# Prompt template for the LLM relevance judge. It is filled in with
# LLM_PROMPT.format(query=..., search_results=...) and asks the model for a
# fenced json block holding "explanation" and "answer" (1 = relevant, 0 = not).
# NOTE(review): "The results will a product description" is missing "be". The text
# is left untouched here because editing it changes what the judge model is sent.
LLM_PROMPT = """
You are a judge for an information retrieval system. 
You will be given a query and a list of results. You need to annotate the results based on the query.
Return a binary answer, where 1 means the result is relevant to the query and 0 means it is not.
The query will be a human-language input to search for a product.
The results will a product description returned by our system.

The result is relevant if it follows the query.
If the query mentions a manufacturer or brand, the result is relevant if it is made by that manufacturer or brand.
If the query mentions a product category, the result is relevant if it is in that category.
If the query mentions a price, the returned result should be within 20 percent of the price.
If the query mentions a preferred age, the returned result should be within the range of the preferred age.
Keep in mind, the query may contain subjective information, so the result should be relevant if they are similar to the query.

Return a binary value, where 1 means the result is relevant to the query and 0 means it is not.
Query: {query}
Results: {search_results}

Make sure to return a binary value, where 1 means the result is relevant to the query and 0 means it is not.
Output the result in JSON format, within ```json and ``` tags.
The JSON should have two keys, explanation and answer.
explanation should be a short explanation of why the result is relevant or not.
answer should be a binary value, where 1 means the result is relevant to the query and 0 means it is not.
"""


In [9]:
import os

from openai import AzureOpenAI

# Azure OpenAI client used as the relevance judge.
# The key and endpoint come from the environment so no secret is stored in the
# notebook; a missing variable fails here with a KeyError instead of on the
# first request.
llm_client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-15-preview",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment="gpt-4o-large",
)

In [10]:
# The example query, for comparison with the first hits printed in the next cell.
print(query)

can i get a 5-star rated ban dai toy for my almost 4-year-old?


In [11]:
# Print the first hit from each of the four result lists, one after another.
for hits in (biencoder_results, cross_encoder_results, keyword_results, hybrid_search_results):
    print(format_results(hits[0]))

Product: Schleich Dragon Knight King on Horse Action Figure
Manufacturer: schleich
Price: 15.98
Preferred Age: 3.0
Rating: 4.7
Reviews: ['Great toy - my 4 year old son and his cousin had lots of fun over Christmas.  Five Stars']
    
Product: Schleich Dragon Knight King on Horse Action Figure
Manufacturer: schleich
Price: 15.98
Preferred Age: 3.0
Rating: 4.7
Reviews: ['Great toy - my 4 year old son and his cousin had lots of fun over Christmas.  Five Stars']
    
Product: Teen Titans Shape-Shifting Beast Boy 5" inch Faigure By Ban Dai in 2003 - The packet is not in mint condition
Manufacturer: ban dai
Price: 36.89
Preferred Age: 4.0
Rating: 5.0
Reviews: ['Five Stars', 'delighted with this item']
    
Product: Teen Titans Shape-Shifting Beast Boy 5" inch Faigure By Ban Dai in 2003 - The packet is not in mint condition
Manufacturer: ban dai
Price: 36.89
Preferred Age: 4.0
Rating: 5.0
Reviews: ['Five Stars', 'delighted with this item']
    


In [12]:
# Ask the judge model about the first bi-encoder hit for the example query.
judge_prompt = LLM_PROMPT.format(
    query=query,
    search_results=format_results(biencoder_results[0]),
)
completion = llm_client.chat.completions.create(
    model="gpt-4o-large",
    messages=[{"role": "user", "content": judge_prompt}],
    temperature=0,
    max_tokens=1000,
)
# Keep only the reply text in `response`.
response = completion.choices[0].message.content

In [13]:
import re
def parse_results(response):
    """Extract the judge's verdict from an LLM reply.

    The reply is expected to contain a JSON object with ``answer`` and
    ``explanation`` keys, normally wrapped in a ```json ... ``` fence as the
    prompt requests. When no fence is present the whole reply is parsed as
    JSON instead of failing on the missing regex match.

    Args:
        response: Text content of the chat completion.

    Returns:
        An ``(answer, explanation)`` tuple taken verbatim from the JSON object.

    Raises:
        ValueError: If ``response`` is empty/None or holds no valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: If the JSON object lacks ``answer`` or ``explanation``.
    """
    if not response:
        raise ValueError("LLM response is empty; nothing to parse")
    fenced = re.search(r'```json\s*([\s\S]*?)\s*```', response)
    # re.search returns None when the model omitted the fence; fall back to the raw text.
    payload = fenced.group(1) if fenced else response.strip()
    result = json.loads(payload)
    return result['answer'], result['explanation']


In [14]:
import re
# Step-by-step version of the parsing: pull the fenced JSON out of the reply,
# decode it, and display the resulting dict.
fenced_json = re.search(r'```json\s*([\s\S]*?)\s*```', response).group(1)
result = json.loads(fenced_json)
result



{'explanation': 'The result is not relevant because the manufacturer is Schleich, not Ban Dai, which was specifically mentioned in the query.',
 'answer': 0}

In [15]:
from tqdm import tqdm
# Annotate every keyword-search hit with the LLM judge and save the labels.
MAX_ATTEMPTS = 3  # attempts per hit before giving up on it

res = []
# (query, formatted hit, last error text) for hits that never got a label.
failed_annotations = []
for item in tqdm(data):
    # Loop-local names: the notebook-level `query`, `biencoder_results`,
    # `result` and `response` from the single-example cells are left untouched.
    item_query = item['query']
    keyword_hits = item['keyword_search_results']
    query_results = []
    for hit in keyword_hits:
        formatted_hit = format_results(hit)
        prompt = LLM_PROMPT.format(query=item_query, search_results=formatted_hit)
        last_error = None
        for _attempt in range(MAX_ATTEMPTS):
            try:
                completion = llm_client.chat.completions.create(
                    model="gpt-4o-large",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                    max_tokens=1000,
                )
                answer, explanation = parse_results(completion.choices[0].message.content)
            except Exception as e:
                # Best-effort: API errors and unparsable replies both trigger another attempt.
                print(e)
                last_error = e
            else:
                query_results.append({"result": formatted_hit, "answer": answer, "explanation": explanation})
                break
        else:
            # Every attempt failed. The hit is still left out of `query_results`,
            # but it is recorded so the gap is visible.
            failed_annotations.append((item_query, formatted_hit, repr(last_error)))
    res.append({"query": item_query, "results": query_results})

if failed_annotations:
    # Missing hits shift the positions of later results for that query.
    print(f"{len(failed_annotations)} hit(s) could not be annotated and are missing from the output")

with open('keyword_results_annotated.json', 'w', encoding='utf-8') as f:
    json.dump(res, f, indent=4)


100%|██████████| 233/233 [29:41<00:00,  7.65s/it]


In [16]:
# Top-level keys of the first annotated record (iterating a dict yields its keys).
list(res[0])

['query', 'results']