In [None]:
# import os
# os.environ["SSL_CERT_FILE"] = "/mnt/d/Travel Assistant/Musafir/Fortinet_CA_SSL(15).cer"
# os.environ["REQUESTS_CA_BUNDLE"] = "/mnt/d/Travel Assistant/Musafir/Fortinet_CA_SSL(15).cer"

In [None]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import os


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open('../data/processed_data/documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)


In [4]:
documents[365]

{'city': 'Rome',
 'section': 'Eat',
 'subsection': 'Pricing',
 'text': 'Chinese restaurants are still quite cheap but other ethnic restaurants (Thai, Indian) are generally expensive (think €30 upwards per person). Sushi is very expensive (€40 minimum per person).',
 'id': '89a82dd4'}

In [5]:
# Ground Truth data
df_gt = pd.read_csv('../data/result/groud-truth-retrieval.csv')

In [6]:
df_gt

Unnamed: 0,id,city,question
0,f7845786,Cairo,What is the name of the oldest known pyramid i...
1,f7845786,Cairo,Which pyramid in Dahshur has an entrance to th...
2,f7845786,Cairo,What is the distinctive feature of the Bent Py...
3,f7845786,Cairo,How many pyramids are mentioned to be in the D...
4,f7845786,Cairo,What is the atmosphere around Dahshur Pyramids...
...,...,...,...
2710,feb11863,Seoul,What is the estimated price range for dining o...
2711,feb11863,Seoul,"Will I need more than ₩35,000 for a meal in Se..."
2712,feb11863,Seoul,"Can I find a meal for under ₩15,000 in Seoul?"
2713,feb11863,Seoul,What is the lowest amount I can expect to spen...


In [7]:
ground_truth = df_gt.to_dict(orient='records')

In [8]:
from elasticsearch import Elasticsearch
import requests


In [9]:
es_client = Elasticsearch('http://localhost:9200')
print(es_client.info())

{'name': '9125275addfa', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'M25oUGE6Qe6ICaWjniVYiw', 'version': {'number': '8.13.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '09df99393193b2c53d92899662a8b8b3c55b45cd', 'build_date': '2024-03-22T03:35:46.757803203Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [10]:
index_name = "travel_assistant"

In [11]:
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)
    print(f"Delete existing index: {index_name}")

Delete existing index: travel_assistant


In [12]:
list(documents[0].keys())

['city', 'section', 'subsection', 'text', 'id']

In [13]:
# Index doc by 'city' and 'section' value and the rest is the text 

index_settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "city":{"type":"keyword"},
            "section":{"type":"text"},
            "subsection":{"type":"text"},
            "text":{"type":"text"},
        }
    }
}

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'travel_assistant'})

In [14]:
# replace none value with null
import math

def clean_doc(doc):
    for k, v in doc.items():
        if isinstance(v, float) and math.isnan(v):
            doc[k] = None  
    return doc

In [15]:
# Set the index for the whole docs

for doc in tqdm(documents):
    es_client.index(index=index_name, document=clean_doc(doc))


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 543/543 [01:18<00:00,  6.94it/s]


## Start the evaluation process

In [16]:
#Hit Rate (HR) or Recall at k
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [17]:
# Mean Reciprocal Rank (MRR)
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [18]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


## Start testing different elastic search method

In [19]:
# Define the basline elastic search 

def elastic_search(query):
    search_query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": query, 
                            "fields": ['city', 'section', 'subsection', 'text'],
                            "type": "best_fields"
                        }
                    }
                ]
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs
    

In [20]:
Basline_es = evaluate(ground_truth, lambda q: elastic_search(q['question']))
Basline_es

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2715/2715 [00:16<00:00, 169.68it/s]


{'hit_rate': 0.8629834254143647, 'mrr': 0.7413546142827905}

In [21]:
def elastic_search_filter(query, city_filter=None):

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ['city', 'section', 'subsection', 'text'],
                            "type": "best_fields"
                        }
                    }
                ]
            }
        }
    }

    # Add filter 
    if city_filter:
        search_query["query"]["bool"]["filter"] = {
            "term": {
                "city": city_filter
            }
        }


    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])     
    
    return result_docs

In [22]:
filter_es = evaluate(ground_truth, lambda q: elastic_search_filter(q['question'], q['city']))
filter_es

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2715/2715 [00:14<00:00, 181.93it/s]


{'hit_rate': 0.8537753222836095, 'mrr': 0.7651995089011661}

Basline elastic search 

{'Recall': 0.8629834254143647,

 'hit_rate': 0.8629834254143647,
 
 'mrr': 0.7413546142827905}

### Semantic Search 

### Pre-trained models 
https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

In [23]:
model_name = "multi-qa-distilbert-cos-v1"
model = SentenceTransformer(model_name)

In [24]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'DistilBertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [25]:
for doc in tqdm(documents):
    city = doc['city']
    section = doc['section']
    text = doc['text']
    cst = city + ' ' + section + ' ' + text 

    doc['city_vector'] = model.encode(city)
    doc['section_vector'] = model.encode(section)
    doc['text_vector'] = model.encode(text)
    doc['all_data_vector'] = model.encode(cst)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 543/543 [01:08<00:00,  7.92it/s]


In [26]:
index_name_vec = "traveller_vector"

In [27]:
index_settings_vect = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "city": {"type": "keyword"},
            "section": {"type": "text"},
            "subsection": {"type": "text"},
            "text": {"type": "text"},
            "id": {"type": "keyword"},
            "city_vector":{
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "section_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            }, 
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"

            },
            "all_data_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            }
            
        }
    }
}

es_client.indices.delete(index=index_name_vec, ignore_unavailable=True)
es_client.indices.create(index=index_name_vec, body=index_settings_vect)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'traveller_vector'})

In [28]:
for doc in tqdm(documents):
    es_client.index(index=index_name_vec, document=doc)



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 543/543 [01:19<00:00,  6.82it/s]


In [29]:
city = "Cairo"
query = "what transport I should use?"

In [30]:
v_q = model.encode(query)

In [31]:
# Vector (kNN) query
knn_query = {
    "field": "text_vector",
    "query_vector": v_q,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5,
    "filter": {
        "term": {
            "city": city
        }
    }
}

In [32]:
# Keyword query
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["city^3", 'section', 'subsection', 'text'], 
                "type": "best_fields",
                "boost": 0.5,
            }
        },
        "filter": {
            "term": {
                "city": city
            }
        }
    }
}

In [33]:
response = es_client.search(
    index=index_name_vec,
    query=keyword_query,
    knn=knn_query,
    size=5
)

### Hybrid search pipeline


In [34]:
def elastic_search_hybrid(field, query, vector, city):
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "city": city
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["city^3", 'section', 'subsection', 'text'],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "city": city
                }
            }
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": 5,
        "_source": ["city", 'section', 'subsection', 'text', "id"]
    }

    es_results = es_client.search(
        index=index_name_vec,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [35]:
def text_hybrid(q):
    question = q['question']
    city = q['city']

    v_q = model.encode(question)

    return elastic_search_hybrid('text_vector', question, v_q, city)


In [36]:
text_hybrid_es = evaluate(ground_truth, text_hybrid)
text_hybrid_es

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2715/2715 [01:17<00:00, 34.89it/s]


{'hit_rate': 0.8714548802946593, 'mrr': 0.7861080417434002}

In [37]:
def city_hybrid(q):
    question = q['question']
    city = q['city']

    v_q = model.encode(question)

    return elastic_search_hybrid('city_vector', question, v_q, city)

In [38]:
city_hybrid_es = evaluate(ground_truth, city_hybrid)
city_hybrid_es

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2715/2715 [01:19<00:00, 34.36it/s]


{'hit_rate': 0.8556169429097605, 'mrr': 0.7665009208103126}

In [44]:
def section_hybrid(q):
    question = q['question']
    city = q['city']

    v_q = model.encode(question)

    return elastic_search_hybrid('section_vector', question, v_q, city)

In [45]:
section_hybrid_es = evaluate(ground_truth, section_hybrid)
section_hybrid_es

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2715/2715 [01:21<00:00, 33.25it/s]


{'hit_rate': 0.8556169429097605, 'mrr': 0.767127071823204}

In [41]:
def all_data_hybrid(q):
    question = q['question']
    city = q['city']

    v_q = model.encode(question)

    return elastic_search_hybrid('all_data_vector', question, v_q, city)



In [42]:
all_data_es = evaluate(ground_truth, all_data_hybrid)
all_data_es

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2715/2715 [01:19<00:00, 33.97it/s]


{'hit_rate': 0.8751381215469614, 'mrr': 0.7886372007366476}

### Reranking

In [56]:
def compute_rrf(rank, k=60):
    """ Our own implementation of the relevance score """
    return 1 / (k + rank)

def elastic_search_hybrid_rrf(field, query, vector, course, k=60):
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "city": city
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["city^3", 'section', 'subsection', 'text'],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "city": city
                }
            }
        }
    }

    knn_results = es_client.search(
        index=index_name_vec, 
        body={
            "knn": knn_query, 
            "size": 10
        }
    )['hits']['hits']
    
    keyword_results = es_client.search(
        index=index_name_vec, 
        body={
            "query": keyword_query, 
            "size": 10
        }
    )['hits']['hits']
    
    rrf_scores = {}
    # Calculate RRF using vector search results
    for rank, hit in enumerate(knn_results):
        doc_id = hit['_id']
        rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    # Adding keyword search result scores
    for rank, hit in enumerate(keyword_results):
        doc_id = hit['_id']
        if doc_id in rrf_scores:
            rrf_scores[doc_id] += compute_rrf(rank + 1, k)
        else:
            rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    # Sort RRF scores in descending order
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Get top-K documents by the score
    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = es_client.get(index=index_name_vec, id=doc_id)
        final_results.append(doc['_source'])
    
    return final_results

In [57]:
def all_data_hybrid_rrf(q):
    question = q['question']
    city = q['city']

    v_q = model.encode(question)

    return elastic_search_hybrid_rrf('all_data_vector', question, v_q, city)



In [58]:
all_data_hybrid_rrf_es = evaluate(ground_truth, all_data_hybrid_rrf)
all_data_hybrid_rrf_es

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2715/2715 [03:40<00:00, 12.34it/s]


{'hit_rate': 0.13996316758747698, 'mrr': 0.12633517495395943}

In [63]:
from tabulate import tabulate

In [64]:
# build table rows
table = [
    ["Baseline", "Basline_es", Basline_es['hit_rate'], Basline_es['mrr']],
    ["Baseline", "Filter_es", filter_es['hit_rate'], filter_es['mrr']],
    ["Semantic Search", "Text_hybrid_es", text_hybrid_es['hit_rate'], text_hybrid_es['mrr']],
    ["Semantic Search", "City_hybrid_es", city_hybrid_es['hit_rate'], city_hybrid_es['mrr']],
    ["Semantic Search", "Section_hybrid_es", section_hybrid_es['hit_rate'], section_hybrid_es['mrr']],
    ["Semantic Search", "All_data_es", all_data_es['hit_rate'], all_data_es['mrr']],
    ["Reranking", "All_data_hybrid_rrf_es", all_data_hybrid_rrf_es['hit_rate'], all_data_hybrid_rrf_es['mrr']],
]

# print table
print(tabulate(table, headers=["Section", "Method", "Hit Rate", "MRR"], tablefmt="github"))

| Section         | Method                 |   Hit Rate |      MRR |
|-----------------|------------------------|------------|----------|
| Baseline        | Basline_es             |   0.862983 | 0.741355 |
| Baseline        | Filter_es              |   0.853775 | 0.7652   |
| Semantic Search | Text_hybrid_es         |   0.871455 | 0.786108 |
| Semantic Search | City_hybrid_es         |   0.855617 | 0.766501 |
| Semantic Search | Section_hybrid_es      |   0.855617 | 0.767127 |
| Semantic Search | All_data_es            |   0.875138 | 0.788637 |
| Reranking       | All_data_hybrid_rrf_es |   0.139963 | 0.126335 |


## RAG Flow 

In [65]:
from mistralai import Mistral
from mistralai.models import UserMessage
import os
from dotenv import load_dotenv

In [84]:
# searching
f_query = "Where should I eat?"
city = "Rome"  

# Encode the query
v_q = model.encode(f_query)

# Run hybrid search on all_data_vector
results = elastic_search_hybrid("all_data_vector", f_query, v_q, city)


In [85]:
# Show results
for i, r in enumerate(results, 1):
    print(f"Result {i}:")
    print(f"  City: {r['city']}")
    print(f"  Section: {r['section']}")
    print(f"  Subsection: {r['subsection']}")
    print(f"  Text: {r['text']}")
    print("-" * 50)

Result 1:
  City: Rome
  Section: Drink
  Subsection: Clubbing &amp; Night Life
  Text: On the other side of the River Tiber (Tevere) is Trastevere district where there are many places to eat and drink. This is also a good place where to enjoy a walk in crowded streets at night. In summer time on Isola Tiberina , the island in the Tiber, temporary bar are built and there are all sorts of things to do.
--------------------------------------------------
Result 2:
  City: Rome
  Section: Eat
  Subsection: None
  Text: Many of the good restaurants in Rome are hard to find, but a good tip is to go where Italians live and eat. The downside is that waitstaff at these restaurants usually do not speak English, so be prepared to have to speak some Italian. On the top of the green, old mountain (Monte Verde Vecchio) there are some trattorias with authentic Italian cuisine at an affordable price. Rome also has many beautiful spots to eat, so buying some delicacies to make up a picnic can be a grea

In [86]:
# loads variables from .env
load_dotenv()  

True

In [87]:
api_key = os.getenv("API_KEY")

In [88]:
client = Mistral(api_key = api_key)

In [99]:
def build_prompt(query, search_results):
    context_template = "Q: {question}\n A: {text}"

    context_parts = []
    for source in search_results:
        context_parts.append(context_template.format(question=source.get('question', ''), text=source.get('text', '')))

    context = "\n\n".join(context_parts)

    prompt_template = """
You're a travel assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}

    """.strip()
    
    prompt = prompt_template.format(question=query, context=context)

    return prompt


In [100]:
def llm(prompt):
    response = client.chat.complete(
        model= "mistral-medium-latest",
        messages=[UserMessage(content=prompt)],
    )


    return response.choices[0].message.content

In [101]:
def rag(query):
    v_q = model.encode(query)
    search_results = elastic_search_hybrid("all_data_vector", query, v_q, city)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)

    return answer

In [102]:
answer = rag(f_query)
print("\n Answer:", answer)


 Answer: Here are some great places to eat in Rome based on the context:

1. **Trastevere District** – Cross the River Tiber to find many authentic and affordable restaurants, especially for pizza (served in the evening). Avoid tourist areas for better quality and prices.

2. **Monte Verde Vecchio** – A green, old mountain with trattorias offering authentic Italian cuisine at reasonable prices.

3. **Via Marmorata (Volpetti’s)** – A famous deli with high-quality cheese, prosciutto, and pastries (though prices are higher). For a budget option, try a local supermarket for fresh picnic foods.

4. **Roman Pizza Tips** – Try fried starters like *baccala* (battered salt cod) followed by a thin-crust pizza. Eat it with a fork and knife, as Romans do.

5. **Vegetarian/Vegan Options** – Many restaurants have buffets with grilled vegetables, and pizzas like *Marinara* (tomato, garlic, oregano) are vegan-friendly.

For a budget meal (€15–20), look for places where Italian office workers eat—thes

In [104]:
answer = rag("Plan two days trip to Roma")
print("\n Answer:", answer)


 Answer: Here’s a well-balanced **2-day Rome itinerary** based on the provided context, optimized for efficiency and cost savings with the **Roma Pass 48-Hours (€36.50)**:

---
### **Day 1: Ancient Rome & Iconic Landmarks**
**Morning:**
- **Colosseum (Colosseo)** – Use your **Roma Pass for free entry + skip-the-line access** (save 1+ hour waiting). Book a timed slot in advance.
- **Palatine Hill (Palatino Hill)** – Included in the same ticket as the Colosseum. Explore the ruins of imperial palaces and the birthplace of Rome.

**Afternoon:**
- **Roman Forum** – Walk through the heart of ancient Rome (entry included with Colosseum ticket).
- **Lunch**: Quick bite near **Monti district** (e.g., *La Carbonara* for authentic pasta).
- **Baths of Caracalla (Terme di Caracalla)** – Use your **second free Roma Pass entry** (massive imperial baths, less crowded than the Forum).

**Evening:**
- **Piazza Venezia & Altare della Patria** – Free panoramic views of Rome from the terrace.
- **Dinner*

In [105]:
answer = rag("Plan three-day itinerary for Dubai")
print("\n Answer:", answer)


 Answer: I’m unable to provide a three-day itinerary for **Dubai** based on the given **CONTEXT**, as the provided information pertains exclusively to **Rome** (e.g., accommodation taxes, museums, water fountains, and shopping malls in Rome).

For a Dubai itinerary, I’d need relevant details about attractions, transportation, cultural norms, and seasonal considerations specific to Dubai.


In [108]:
answer = rag("what trasnportation I should use in Seoul")
print("\n Answer:", answer)


 Answer: I’m sorry, but the provided **CONTEXT** does not contain any information about transportation in **Seoul**.

Would you like me to provide general transportation advice for Seoul based on standard travel knowledge? Let me know!
