In [1]:
import sys
import os

# Absolute path of the notebook's working directory
# (os.path.abspath('') resolves against the current working directory).
current_dir = os.path.abspath('')
# Path of the project root: two levels above the notebook, i.e. the parent of `src`.
project_root = os.path.dirname(os.path.dirname(current_dir))
# Make `src.*` importable. The membership check keeps this cell idempotent:
# re-running it no longer stacks duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)


from src.evaluation.evaluation import *

### Get random properties as test set

In [2]:
# Size of the retrieval test set; change it here rather than in the call below.
N_TEST_PROPERTIES = 50

# get_random_properties (star-imported from src.evaluation.evaluation) returns the
# sampled listings and their ids. Later cells pair the two by position, so they
# are presumably in matching order — TODO confirm.
# NOTE(review): no seed is set here, so each run may draw a different sample.
random_properties, random_property_ids = get_random_properties(N_TEST_PROPERTIES)
random_properties[0], random_property_ids


({'id': 28432923,
  'name': 'Apartment with Balcony , Sultan Ahmet',
  'description': 'Our Suites is situated in the Old City . Blue Mosque is 50 meters from the property. Free WiFi is featured throughout the property. All units feature a seating and dining area. All units have a kitchen equipped with a dishwasher and microwave. A fridge and stove-top are also provided, as well as a kettle. There is a private bathroom with a bath or shower in each unit. Hagia Sophia is 200 meters away. The nearest airport is Istanbul Ataturk Airport, 21 km from Our Suites.'},
 [28432923,
  10082307,
  22323221,
  14490025,
  15719888,
  2856208,
  23493822,
  5192553,
  25086367,
  9422452,
  3261569,
  23249192,
  21992735,
  6485067,
  6055546,
  19961067,
  29932504,
  9334038,
  27935574,
  19779316,
  27540180,
  20049287,
  1206363,
  32774643,
  17693637,
  17416505,
  19050582,
  10240767,
  27588189,
  29674795,
  29697871,
  8606636,
  4449066,
  27805512,
  17584991,
  9174474,
  29455981,
 

### Generate queries for these properties

In [3]:
# Produce a 'generated_query' for every sampled property. Later cells pair the
# queries with random_property_ids purely by position, so generate_queries is
# assumed to preserve both the order and the number of inputs — TODO confirm.
random_properties_and_queries = generate_queries(random_properties)

In [4]:
# Display the sampled properties, each now carrying a 'generated_query' entry.
random_properties_and_queries

[{'id': 28432923,
  'name': 'Apartment with Balcony , Sultan Ahmet',
  'description': 'Our Suites is situated in the Old City . Blue Mosque is 50 meters from the property. Free WiFi is featured throughout the property. All units feature a seating and dining area. All units have a kitchen equipped with a dishwasher and microwave. A fridge and stove-top are also provided, as well as a kettle. There is a private bathroom with a bath or shower in each unit. Hagia Sophia is 200 meters away. The nearest airport is Istanbul Ataturk Airport, 21 km from Our Suites.',
  'generated_query': "I want to stay in a suite in the Old City, close to the Blue Mosque and Hagia Sophia, with free WiFi, a kitchen with modern appliances, and a private bathroom. Ideal for exploring Istanbul's historic sites."},
 {'id': 10082307,
  'name': 'Double Room en-suite (307)',
  'description': 'A standard double room with a queen size double bed and with private bathroom. There is a working table, chair and a shelf. A c

In [5]:
# Keep only the generated query text, in the same order as the sampled properties.
sample_queries = [entry['generated_query'] for entry in random_properties_and_queries]
sample_queries

["I want to stay in a suite in the Old City, close to the Blue Mosque and Hagia Sophia, with free WiFi, a kitchen with modern appliances, and a private bathroom. Ideal for exploring Istanbul's historic sites.",
 'I want to stay in a clean and comfortable double room with a queen size bed, private bathroom, and a workspace including a table and chair. Ideal for a relaxing getaway or a productive business trip.',
 'I want to stay in a centrally located 1 bedroom apartment with a fully equipped modern kitchen, private outdoor courtyard, smart TV, and a comfortable queen size bed. Ideal for business trips or sightseeing, with pet-friendly options available.',
 'Apartamento aconchegante e luminoso em zona tranquila, a 100 metros do Metro e 8 minutos do Aeroporto. Ideal para casais e viajantes a negócios, com garagem, comércio próximo e bela vista. Perfeito para quem busca conforto e privacidade em Porto.',
 'I want to stay in a fully furnished 3-bedroom home near Crown St cafes, with a mode

In [6]:
from typing import Optional
from pydantic import BaseModel, Field
from typing import List, Dict, Any
from src.data_models import Address, ImageDescrib
import pandas as pd 
from src.hybrid_search import *


# Collection handle and the searcher built on top of it. get_collection and
# HybridSearch are not defined in this notebook; presumably they come from
# `from src.hybrid_search import *` above — verify.
# NOTE(review): this likely needs a reachable database — confirm.
collection = get_collection()
hybrid_searcher = HybridSearch(collection)


class SearchResultItem(BaseModel):
    """Validated view of a single hybrid-search hit.

    Instances are built from the raw result mappings via
    ``SearchResultItem(**result)`` and flattened again with ``model_dump()``
    so the hits can be loaded into a DataFrame. Only ``_id``, ``name``,
    ``address`` and ``images`` are required; every other field defaults to
    ``None`` when absent.
    """
    # Read from the raw result's `_id` key (alias); exposed and dumped as `id`.
    id: int = Field(alias='_id')
    name: str
    accommodates: Optional[int] = None
    # Project-defined model from src.data_models.
    address: Address
    summary: Optional[str] = None
    # Used downstream as the retrieved context for the top-ranked hit.
    description: Optional[str] = None
    neighborhood_overview: Optional[str] = None
    notes: Optional[str] = None
    # Project-defined model from src.data_models.
    images: ImageDescrib
    # Presumably the relevance score attached by the search — confirm.
    search_score: Optional[float] = None
    reviews: Optional[List[Dict[str, Any]]] = None

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/qiaolinghe/.local/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/qiaolinghe/.local/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/qiaolinghe/.local/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 73

### Retrieve the top 10 most similar listings from the database for each property's generated query

In [7]:
# One DataFrame of search hits per generated query, in query order.
retrieved_results = []
for query in sample_queries:
    hits = hybrid_searcher.do_search(query)
    # Validate each raw hit through SearchResultItem, then flatten it back to a
    # plain dict so pandas can build the frame.
    hit_rows = [SearchResultItem(**hit).model_dump() for hit in hits]
    retrieved_results.append(pd.DataFrame(hit_rows))

# Peek at the descriptions returned for the first query.
retrieved_results[0]['description']

0    Penthouse with great sea and city view  is loc...
1    The flat has 2 rooms one saloon, a kitchen, a ...
2    This amazing studio is in the heart of the old...
3    Room description. This cosy room features air ...
4     Istanbul is an exciting destination for any t...
5    Big old apartment, having all the comfort for ...
6    2 bedr 2 bath 8 minute walk historical area 6 ...
7    La Sera Pera Taksim Suites are a self-catering...
8    Located in Taksim, Deluxe Room is within a wal...
9    Our fully furnished and decorated apartments a...
Name: description, dtype: object

In [8]:
# Ranked listing ids per query, extracted from each result frame's `id` column.
retrieved_ids = [hits_df['id'].tolist() for hits_df in retrieved_results]

### Check whether each sampled property id appears in the top-k retrieved ids (k = 1, 3, 5, 10)

In [9]:
# Every query was generated from exactly one sampled property, so the result
# lists and the source ids must line up one-to-one. Fail loudly on a length
# mismatch instead of hitting a bare IndexError or silently ignoring extra ids.
assert len(retrieved_ids) == len(random_property_ids), (
    f"{len(retrieved_ids)} result lists vs {len(random_property_ids)} source property ids"
)

# One check_top_k_positions result per query, pairing each ranked id list with
# the id of the property its query was generated from.
top_k_positions = [
    check_top_k_positions(ids_for_query, source_id)
    for ids_for_query, source_id in zip(retrieved_ids, random_property_ids)
]

top_k_positions

[array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 1, 1, 1]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([1, 1, 1, 1]),
 array([0, 1, 1, 1]),
 array([0, 0, 0, 1]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 1, 1, 1]),
 array([0, 0, 0, 0]),
 array([0, 1, 1, 1]),
 array([0, 0, 0, 0]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([1, 1, 1, 1]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 0]),
 array([0, 0, 0, 1]),
 array([0, 0, 0, 1]),
 array([0, 0, 0, 0]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([0, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([0, 0, 0, 0]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([0, 0, 0, 0]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([0, 0, 0, 1]),
 array([0, 0, 0, 0]),
 array([1,

In [11]:
# Tabulate the per-query hit flags, one column per cut-off.
TOP_K_COLUMNS = ['top_1', 'top_3', 'top_5', 'top_10']
top_k_df = pd.DataFrame(top_k_positions, columns=TOP_K_COLUMNS)

# Hit rate per cut-off = number of queries flagged 1 / total number of queries.
n_queries = len(top_k_df)
top_1, top_3, top_5, top_10 = (
    sum(top_k_df[column].tolist()) / n_queries for column in TOP_K_COLUMNS
)

top_1, top_3, top_5, top_10

(0.36, 0.46, 0.46, 0.54)

### Using Ragas to evaluate the LLM-generated responses
- Context recall
- Faithfulness
- Factual correctness

Context Recall measures how many of the relevant documents (or pieces of information) were successfully retrieved. It focuses on not missing important results. Higher recall means fewer relevant documents were left out. In short, recall is about not missing anything important. Since it is about not missing anything, calculating context recall always requires a reference to compare against.

The Faithfulness metric measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.

FactualCorrectness is a metric class that evaluates the factual correctness of responses generated by a language model. It uses claim decomposition and natural language inference (NLI) to verify the claims made in the responses against reference texts.

In [26]:
# Reference text for each query: the description of the property at the same
# position in random_properties.
expected_responses = [
    random_properties[position]['description']
    for position in range(len(sample_queries))
]


In [27]:
# Context for each query = description of the first (presumably best-ranked) hit.
# The first row is taken positionally with .iloc[0], so the lookup does not
# depend on the frame's index labels. The throwaway empty-list initialisation
# is dropped because it was overwritten immediately.
top_1_retrieved_contexts = [hits_df['description'].iloc[0] for hits_df in retrieved_results]
top_1_retrieved_contexts


['Penthouse with great sea and city view  is located in the heart of Istanbul;very close to all city highlights; Galata Bridge and Galata Tower, historical peninsula.  The stylish, comfortable and bright penthouse is available for 5 guests...  The stylish & full decorated penthouse is located in central Istanbul; very close to all city highlights; Galata Bridge; Galata Tower, and historical peninsula. The penthouse has a great sea and city view including the  historical peninsula, Hagia Sophia, Blue Mosque, Galata Tower etc.  The stylish, comfortable and bright apartment has 1 bedroom with a saloon, having also a sofa which can be a double bed. An additional bad can be placed in the living room or bed room.  So we have opportunity for the accommodation of 5 people. There is 24 hours of reception and you can feel the hotel comfort during your stay . We can serve for you breakfast (with an additional fee) downstairs at the lovely cafeteria. Set in a reformed, century-old building, having

In [28]:
def generate_response(query, context):
    """Ask the RAG LLM to answer ``query`` using ``context`` as its only evidence.

    Args:
        query: The user's search request; substituted into the human message.
        context: Retrieved listing text; substituted into the human message.

    Returns:
        Whatever ``rag_llm`` returns for the formatted messages. The calling
        cell reads ``.content`` from it.

    Note:
        ``ChatPromptTemplate`` and ``rag_llm`` are not defined in this notebook;
        presumably they arrive through one of the star imports — verify.
    """
    # System instructions for the recommender persona (text kept verbatim).
    system_message = ("system", """You are an Airbnb listing recommendation system. Please:
                1. Respond in the same language as the user
                2. If the user is asking for property recommendations:
                   - Prioritize results with higher search scores
                   - Include the Airbnb listing URL and image URL
                   - Explain why you chose these properties
                   - Highlight features that match the user's criteria
                3. If the user has provided an image, consider visual similarity in your recommendations
                4. Be friendly and helpful in your responses
                5, answer the question in the same language as the query""")
    # Human turn carrying the two placeholders filled in below.
    human_message = ("human", "Answer this user query: {query} with the following context:\n{context}")

    chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])
    messages = chat_prompt.format_messages(query=query, context=context)
    return rag_llm(messages)

In [29]:
# Build one evaluation sample per query: the query, the single retrieved
# context, the LLM answer generated from that context, and the source
# property's description as the reference. Appending inside the loop keeps the
# samples gathered so far if the LLM calls are interrupted part-way.
dataset = []

for query, reference, context in zip(sample_queries, expected_responses, top_1_retrieved_contexts):
    response = generate_response(query, context)
    sample = dict(
        user_input=query,
        retrieved_contexts=[context],
        response=response.content,
        reference=reference,
    )
    dataset.append(sample)

KeyboardInterrupt: 

In [25]:
# Inspect the assembled evaluation samples.
dataset

[{'user_input': "I want to stay in a suite in the Old City, close to the Blue Mosque and Hagia Sophia, with free WiFi, a kitchen with modern appliances, and a private bathroom. Ideal for exploring Istanbul's historic sites.",
  'retrieved_contexts': ['Penthouse with great sea and city view  is located in the heart of Istanbul;very close to all city highlights; Galata Bridge and Galata Tower, historical peninsula.  The stylish, comfortable and bright penthouse is available for 5 guests...  The stylish & full decorated penthouse is located in central Istanbul; very close to all city highlights; Galata Bridge; Galata Tower, and historical peninsula. The penthouse has a great sea and city view including the  historical peninsula, Hagia Sophia, Blue Mosque, Galata Tower etc.  The stylish, comfortable and bright apartment has 1 bedroom with a saloon, having also a sofa which can be a double bed. An additional bad can be placed in the living room or bed room.  So we have opportunity for the a

In [20]:
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness


# Ragas dataset built from the list of sample dicts.
evaluation_dataset = EvaluationDataset.from_list(dataset)

# Chat model passed to evaluate() as the evaluator LLM, wrapped for Ragas.
# ChatOpenAI is not imported in this notebook; presumably it arrives through a
# star import — verify.
judge_chat_model = ChatOpenAI(model="gpt-4.1-nano")
evaluator_llm = LangchainLLMWrapper(judge_chat_model)

In [21]:
# Metrics to score: context recall, faithfulness and factual correctness.
ragas_metrics = [LLMContextRecall(), Faithfulness(), FactualCorrectness()]

result = evaluate(
    dataset=evaluation_dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
)
result


Evaluating:   5%|▍         | 7/150 [00:06<01:47,  1.32it/s]Exception raised in Job[18]: OutputParserException(Invalid json output: {
  "classifications": [
    {
      "statement": "全新别墅中的全新一套房,带独立卫浴.",
      "reason": "The context mentions a newly renovated apartment with a comfortable bedroom and private amenities, indicating a new and private living space.",
      "attributed": 1
    },
    {
      "statement": "独立门进出.",
      "reason": "The context states the apartment is in a prime location with easy access to public transport and mentions independent access, supporting this statement.",
      "attributed": 1
    },
    {
      "statement": "周边 Bus M54,630均可以到悉尼各个地方和Epping火车站与Eastwood火车站.",
      "reason": "The context specifies nearby bus routes and good transportation links to various parts of Sydney, including mention of buses and train stations.",
      "attributed": 1
    },
    {
      "statement": "拎包即可入住.",
      "reason": "The context describes a fully equipped, ready-to-

{'context_recall': 0.8051, 'faithfulness': 0.6114, 'factual_correctness(mode=f1)': 0.5340}

In [None]:
# Upload the evaluation result to the Ragas hosted service. In the recorded run
# the request to api.ragas.io failed with HTTP 500, so this cell currently ends
# in an UploadException.
result.upload() 

[2025-04-15 11:32:25 - (2025-04-15 15:32:25 UTC)] [ERROR] [ragas.utils] [RagasID: a-d3ee32eef0fe470fa68b390f1d1bfd91, App-Version: 0.2.14] [API_ERROR] Request failed. Status Code: 500, URL: https://api.ragas.io/api/v1/alignment/evaluation, Error Message: 
API Message: An internal server error occured


UploadException: Request failed: 
API Message: An internal server error occured