In [50]:
import sys
import os

# 获取当前notebook的绝对路径
current_dir = os.path.abspath('')
# 获取项目根目录的路径（src的父目录）
project_root = os.path.dirname(os.path.dirname(current_dir))
sys.path.append(project_root)

N_samp = 20
alpha = 0.5

from src.evaluation.evaluation_multimodal import *

### Get random properties as test set

In [51]:
random_properties, random_property_ids = get_random_img_properties(N_samp)
# random_properties[0], random_property_ids


### Generate queries for these properties

In [52]:
random_properties_and_queries = generate_queries(random_properties)

In [53]:
# random_properties_and_queries[0]

In [54]:
sample_queries = [
    {'text': property['generated_query'], 'files': [property['image_path']]}
    for property in random_properties_and_queries
]
# print(sample_queries[0])
# print(len(sample_queries))

In [55]:
from typing import Optional
from pydantic import BaseModel, Field
from typing import List, Dict, Any
from src.data_models import Address, ImageDescrib
import pandas as pd 
from src.multimodal_search import *


collection = get_collection()
hybrid_searcher = MultiModalSearch(collection)


class SearchResultItem(BaseModel):
    id: int = Field(alias='_id')
    name: str
    accommodates: Optional[int] = None
    address: Address
    summary: Optional[str] = None
    description: Optional[str] = None
    neighborhood_overview: Optional[str] = None
    notes: Optional[str] = None
    images: ImageDescrib
    search_score: Optional[float] = None
    reviews: Optional[List[Dict[str, Any]]] = None

### Retrive top 10 most similar listings from database for each property

In [56]:
retrieved_results = []
for query in sample_queries:
    results = hybrid_searcher.do_search(query,alpha_text=alpha)
    search_results_models = [SearchResultItem(**result)  for result in results]
    search_results_df = pd.DataFrame([item.model_dump() for item in search_results_models])
    retrieved_results.append(search_results_df)

    


In [57]:
# (retrieved_results[0]).columns

In [58]:
retrieved_ids = []
for result in retrieved_results:
    retrieved_ids.append(result['id'].tolist())

### Check if random property ids are in the top k retrieved ids

In [59]:
top_k_positions = []
for i in range(len(retrieved_ids)):
    top_k_positions.append(check_top_k_positions(retrieved_ids[i], random_property_ids[i]))
    
# top_k_positions

In [60]:
# Convert top_k_positions list to DataFrame
top_k_df = pd.DataFrame(top_k_positions, columns=['top_1', 'top_3', 'top_5', 'top_10'])
top_1 = sum(top_k_df['top_1'].tolist())/len(top_k_df)
top_3 = sum(top_k_df['top_3'].tolist())/len(top_k_df)
top_5 = sum(top_k_df['top_5'].tolist())/len(top_k_df)
top_10 = sum(top_k_df['top_10'].tolist())/len(top_k_df)

# top_1, top_3, top_5, top_10

### Using Ragas to evaluate the response by LLM
- Context Recall: 
- Faithfulness
- Factual correctness

Context Recall measures how many of the relevant documents (or pieces of information) were successfully retrieved. It focuses on not missing important results. Higher recall means fewer relevant documents were left out. In short, recall is about not missing anything important. Since it is about not missing anything, calculating context recall always requires a reference to compare against.

The Faithfulness metric measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.

FactualCorrectness is a metric class that evaluates the factual correctness of responses generated by a language model. It uses claim decomposition and natural language inference (NLI) to verify the claims made in the responses against reference texts.

In [61]:
expected_responses = []
# for i in range(len(sample_queries)):
#     expected_responses.append(random_properties[i]['description'])

from src.evaluation.evaluation_multimodal import Gen_MultimodalEvlDataset
data_generator = Gen_MultimodalEvlDataset()

expected_responses = data_generator.gen_combined_gt(random_properties)


In [62]:
# generate ground truth image caption
# expected_responses[0]

In [63]:
retrieved_results[1]['images'][0]['picture_url']

'https://a0.muscache.com/im/pictures/6d3c7ee5-228c-4022-bec9-415ea454b9be.jpg?aki_policy=large'

In [64]:
# top_1_retrieved_contexts = []
# top_1_retrieved_contexts = [retrieved_results[i]['description'][0] for i in range(len(retrieved_results))]
# top_1_retrieved_contexts
# retrieved_results[0]['images'][0]['picture_url']

top_1_retrieved_contexts = data_generator.gen_combined_retrieved(retrieved_results)

In [65]:
# top_1_retrieved_contexts[0]

In [66]:
def generate_response(query, context):      
    prompt_template = ChatPromptTemplate.from_messages([
                ("system", """You are an Airbnb listing recommendation system. Please:
                1. Respond in the same language as the user
                2. If the user is asking for property recommendations:
                   - Prioritize results with higher search scores
                   - Include the Airbnb listing URL and image URL
                   - Explain why you chose these properties
                   - Highlight features that match the user's criteria
                3. If the user has provided an image, consider visual similarity in your recommendations
                4. Be friendly and helpful in your responses
                5, answer the question in the same language as the query"""),
                ("human", "Answer this user query: {query} with the following context:\n{context}")
            ])

    formatted_messages = prompt_template.format_messages(query=query, context=context)
    return rag_llm(formatted_messages)  

In [67]:
dataset = []

for query, reference, context in zip(sample_queries, expected_responses, top_1_retrieved_contexts):
    response = generate_response(query, context)
    query_input = data_generator.gen_combined_query(query)
    dataset.append(
        {
            "user_input":query_input,
            "retrieved_contexts": [context],
            "response":response.content,
            "reference":reference
        }
    )

In [68]:
# print('--response: ',response.content)
# print('--ground_truth: ', reference)
# print('--input: ', query_input)
# print('--top1_retrived_contexts: ', context)

In [69]:
# dataset

In [70]:
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness


evaluation_dataset = EvaluationDataset.from_list(dataset)
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-nano"))

In [71]:
result = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],llm=evaluator_llm)
result


Evaluating: 100%|██████████| 60/60 [00:41<00:00,  1.45it/s]


{'context_recall': 0.7126, 'faithfulness': 0.4216, 'factual_correctness(mode=f1)': 0.4880}

In [72]:
top_1, top_3, top_5, top_10

(0.2, 0.25, 0.3, 0.3)

In [73]:
result.upload()

ValueError: RAGAS_APP_TOKEN is not set