In [9]:
import os
os.environ["SSL_CERT_FILE"] = "Fortinet_CA_SSL(15).cer"


In [10]:
import requests
import json
from bs4 import BeautifulSoup


In [21]:
import requests
import json
from bs4 import BeautifulSoup

def fetch_wikivoyage_sections(page: str, user_agent="MyWikiVoyageClient/1.0 (contact: your-email@example.com)"):
    api_url = "https://en.wikivoyage.org/w/api.php"
    headers = {"User-Agent": user_agent}

    # Step 1: Get metadata about all sections (top + sub)
    params = {
        "action": "parse",
        "page": page,
        "format": "json",
        "prop": "sections"
    }
    res = requests.get(api_url, params=params, headers=headers).json()
    sections_meta = res["parse"]["sections"]

    wanted = {"districts", "see", "buy", "eat", "drink", "sleep", "stay safe"}
    result = {}

    # Helper: clean HTML → paragraphs, list items, tables
    def html_to_paragraphs(html: str):
        soup = BeautifulSoup(html, "html.parser")
        paragraphs = []

        for p in soup.find_all("p"):
            text = p.get_text(" ", strip=True)
            if text:
                paragraphs.append(text)

        for li in soup.find_all("li"):
            text = li.get_text(" ", strip=True)
            if text:
                paragraphs.append(text)

        for td in soup.find_all("td"):
            text = td.get_text(" ", strip=True)
            if text:
                paragraphs.append(text)

        return paragraphs

    # Step 2: Grab description (section 0)
    lead_params = {
        "action": "parse",
        "page": page,
        "format": "json",
        "prop": "text",
        "section": 0
    }
    lead = requests.get(api_url, params=lead_params, headers=headers).json()
    lead_html = lead["parse"]["text"]["*"]
    result["description"] = html_to_paragraphs(lead_html)

    # Step 3: Iterate through sections to build nested structure
    for sec in sections_meta:
        title = sec["line"].strip()
        title_lower = title.lower()
        number = sec["number"]
        index = sec["index"]

        if title_lower in wanted and "." not in number:
            sec_params = {
                "action": "parse",
                "page": page,
                "format": "json",
                "prop": "text",
                "section": index
            }
            sec_data = requests.get(api_url, params=sec_params, headers=headers).json()
            html = sec_data["parse"]["text"]["*"]
            result[title] = {"content": html_to_paragraphs(html), "subsections": {}}

        elif "." in number:
            parent_num = number.split(".")[0]
            parent_section = next((s for s in sections_meta if s["number"] == parent_num), None)
            if parent_section and parent_section["line"].strip().lower() in wanted:
                parent_title = parent_section["line"].strip()
                sec_params = {
                    "action": "parse",
                    "page": page,
                    "format": "json",
                    "prop": "text",
                    "section": index
                }
                sec_data = requests.get(api_url, params=sec_params, headers=headers).json()
                html = sec_data["parse"]["text"]["*"]
                if parent_title in result:
                    result[parent_title]["subsections"][title] = html_to_paragraphs(html)

    return result

# === Multiple cities ===
cities = ["London", "Rome", "Seoul", "Cairo"]

for city in cities:
    print(f"📍 Fetching data for {city}...")
    data = fetch_wikivoyage_sections(city)
    # Create the directory structure if it doesn't exist
    os.makedirs("../data/raw_data", exist_ok=True)
    filename = f"../data/raw_data/{city.lower()}_subsections.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {filename}")


📍 Fetching data for London...
✅ Saved ../data/raw_data/london_subsections.json
📍 Fetching data for Rome...
✅ Saved ../data/raw_data/rome_subsections.json
📍 Fetching data for Seoul...
✅ Saved ../data/raw_data/seoul_subsections.json
📍 Fetching data for Cairo...
✅ Saved ../data/raw_data/cairo_subsections.json


In [22]:
import json
import csv
import glob
import re
import os
import requests


In [25]:
# Mapping city to their countries and continent
KNOWN_CITY_MAP = {
    "london": ("United Kingdom", "Europe"),
    "rome": ("Italy", "Europe"),
    "seoul": ("South Korea", "Asia"),
    "cairo": ("Egypt", "Africa"),
    # add more known cities here if you want
}

RAW_FOLDER = "../data/raw_data"
PROCESSED_FOLDER = "../data/processed_data"
os.makedirs(PROCESSED_FOLDER, exist_ok=True)


def convert_json_to_csv(json_file, csv_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Prefer metadata.city if available, else infer from filename
    meta_city = data.get("metadata", {}).get("city")
    if meta_city:
        city = meta_city
    else:
        base = os.path.basename(json_file)
        m = re.match(r'(.+?)_subsections\.json$', base, flags=re.I)
        if m:
            city_raw = m.group(1)
        else:
            city_raw = os.path.splitext(base)[0]
        city = city_raw.replace("_", " ").strip().title()

    # Country + continent from known mapping
    country, continent = KNOWN_CITY_MAP.get(city.lower(), ("", ""))

    rows = []
    # Description rows
    for text in data.get("description", []):
        rows.append([city, country, continent, "description", "", text])

    # Sections
    for section, sec_data in data.items():
        if section in ("description", "metadata"):
            continue
        if not isinstance(sec_data, dict):
            continue
        # Top-level content
        for txt in sec_data.get("content", []):
            rows.append([city, country, continent, section, "", txt])
        # Subsections
        for sub, sub_list in sec_data.get("subsections", {}).items():
            for txt in sub_list:
                rows.append([city, country, continent, section, sub, txt])

    # Write per-file CSV
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["city", "country", "continent", "section", "subsection", "text"])
        writer.writerows(rows)

    return rows


# === main: convert all *_subsections.json files and create combined CSV ===
all_rows = []
files = glob.glob(os.path.join(RAW_FOLDER, "*_subsections.json"))
if not files:
    print("No JSON files found in raw_data folder.")
else:
    for jf in files:
        base = os.path.basename(jf).replace(".json", ".csv")
        csv_name = os.path.join(PROCESSED_FOLDER, base)
        print(f"Converting {jf} -> {csv_name} ...")
        rows = convert_json_to_csv(jf, csv_name)
        all_rows.extend(rows)
        print(f"  rows written: {len(rows)}")

    # Write combined CSV
    combined_name = os.path.join(PROCESSED_FOLDER, "all_cities_combined.csv")
    print(f"Writing combined CSV: {combined_name} (total rows: {len(all_rows)})")
    with open(combined_name, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["city", "country", "continent", "section", "subsection", "text"])
        writer.writerows(all_rows)

    print("✅ Done. Per-file CSVs plus combined CSV created in processed_data folder.")


Converting ../data/raw_data/cairo_subsections.json -> ../data/processed_data/cairo_subsections.csv ...
  rows written: 109
Converting ../data/raw_data/london_subsections.json -> ../data/processed_data/london_subsections.csv ...
  rows written: 468
Converting ../data/raw_data/rome_subsections.json -> ../data/processed_data/rome_subsections.csv ...
  rows written: 252
Converting ../data/raw_data/seoul_subsections.json -> ../data/processed_data/seoul_subsections.csv ...
  rows written: 71
Writing combined CSV: ../data/processed_data/all_cities_combined.csv (total rows: 900)
✅ Done. Per-file CSVs plus combined CSV created in processed_data folder.


In [28]:
df = pd.read_csv("../data/processed_data/all_cities_combined.csv")

In [29]:
df

Unnamed: 0,city,country,continent,section,subsection,text
0,Cairo,Egypt,Africa,description,,Cairo (pronounced KY-roh ; Arabic : القاهرة al...
1,Cairo,Egypt,Africa,description,,"On the Nile river, Cairo is famous for its own..."
2,Cairo,Egypt,Africa,description,,"Though firmly attached to the past, Cairo is a..."
3,Cairo,Egypt,Africa,Districts,,Greater Cairo is vast; with more than 20 milli...
4,Cairo,Egypt,Africa,Districts,,Downtown The modern city centre. Midan Tahrir ...
...,...,...,...,...,...,...
895,Seoul,South Korea,Asia,Stay safe,,Do not try to use drones to take pictures in S...
896,Seoul,South Korea,Asia,Stay safe,,"Unfortunately, crimes by American soldiers aga..."
897,Seoul,South Korea,Asia,Stay safe,,Protesting : Large-scale demonstrations agains...
898,Seoul,South Korea,Asia,Stay safe,,Fake monks have been known to operate in Seoul...


In [30]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py --no-check-certificate

--2025-09-22 13:43:08--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4273 (4.2K) [text/plain]
Saving to: ‘minsearch.py’


2025-09-22 13:43:09 (2.22 MB/s) - ‘minsearch.py’ saved [4273/4273]



In [31]:
import minsearch



In [32]:
df.columns

Index(['city', 'country', 'continent', 'section', 'subsection', 'text'], dtype='object')

In [33]:
documents = df.to_dict(orient='records')

In [34]:
documents[15]

{'city': 'Cairo',
 'country': 'Egypt',
 'continent': 'Africa',
 'section': 'See',
 'subsection': nan,
 'text': 'Cairo has an overwhelming array of attractions, listed under their individual districts along with transport and other practicalities. Some highlights:'}

In [37]:
index = minsearch.Index(
    text_fields=['city', 'country', 'continent', 'section',  'text'],
    keyword_fields=[]
)

In [38]:
index.fit(documents)

<minsearch.Index at 0x7f28e5265070>

In [39]:
query = "give me a three-day trip to London"

In [40]:
index.search(query, num_results=10)

[{'city': 'London',
  'country': 'United Kingdom',
  'continent': 'Europe',
  'section': 'See',
  'subsection': 'London Pass',
  'text': 'Whereas some London museums offer free entry, some other top London attractions are ridiculously expensive. For example, entry to Westminster Abbey costs £20 per person (adult), and entry to the Tower is £21.50 per adult if bought online (2017). These prices can be sometimes mitigated by a purchase of London Pass, which needs to be done at the London Pass website . The pass comes in several varieties and gives access to over 60 attractions, including both Westminster Abbey and the Tower. For example, a day pass costs £62 for an adult (2017). The best strategy, if one wants to visit several expensive high-profile attractions, is to buy a day pass and to try visiting all of them in the same day. This requires some advanced planning and will not give you much time at each place you visit - for example, it can take an hour on public transport to travel b

In [41]:
from mistralai import Mistral
from mistralai.models import UserMessage
import os
from dotenv import load_dotenv

In [42]:
# loads variables from .env
load_dotenv()  

True

In [43]:
api_key = os.getenv("API_KEY")

In [44]:
client = Mistral(api_key = api_key)
response = client.chat.complete(
    model= "mistral-large-2411",
    messages=[UserMessage(content=query)],
)


print(response.choices[0].message.content)

Here's a suggested three-day itinerary for a trip to London, covering major attractions, culture, and food:

**Day 1: Iconic Landmarks**

*Morning:*
- Start your day at the **Tower of London**. Explore the historic castle, see the Crown Jewels, and join a guided tour by the Yeoman Warders.
- Walk across **Tower Bridge** for panoramic views of the River Thames.

*Afternoon:*
- Have lunch at **Borough Market**, London's most renowned food market.
- Visit **St. Paul's Cathedral** and climb to the dome for breathtaking views of the city.

*Evening:*
- Take a stroll along the **South Bank** and enjoy street performers, shops, and restaurants.
- Have dinner at **Dishoom**, a popular Bombay-style café in Covent Garden.

**Day 2: Museums & Parks**

*Morning:*
- Visit the **British Museum** to explore world history and cultures.

*Afternoon:*
- Have lunch at **Neal's Yard** in Covent Garden, offering a variety of eateries in a colorful courtyard.
- Choose between the **Natural History Museum** 

In [47]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [86]:
that prompt 
prompt_template = """
You're a travel assistant. Answer the QUESTION based on the CONTEXT from our travel database.
Use only the facts from the CONTEXT when answering the QUESTION.
QUESTION: {question}
CONTEXT:
{context}
""".strip()

entry_template = """
City: 'city',
 Country: 'country',
 Continent: 'continent',
 Section: 'section',
 Subsection: 'subsection',
 Text:'text'
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [87]:
search_results = search(query)
prompt = build_prompt(query, search_results)

In [88]:
print(prompt)

You're a travel assistant. Answer the QUESTION using ONLY the information from the CONTEXT from our travel database.
When asked for itineraries, organize the places, attractions, and activities mentioned in the CONTEXT into a logical day-by-day plan.
Do not add any information that is not present in the CONTEXT.

QUESTION: give me a three-day trip to London
CONTEXT:
City: 'city',
 Country: 'country',
 Continent: 'continent',
 Section: 'section',
 Subsection: 'subsection',
 Text:'text'

City: 'city',
 Country: 'country',
 Continent: 'continent',
 Section: 'section',
 Subsection: 'subsection',
 Text:'text'

City: 'city',
 Country: 'country',
 Continent: 'continent',
 Section: 'section',
 Subsection: 'subsection',
 Text:'text'

City: 'city',
 Country: 'country',
 Continent: 'continent',
 Section: 'section',
 Subsection: 'subsection',
 Text:'text'

City: 'city',
 Country: 'country',
 Continent: 'continent',
 Section: 'section',
 Subsection: 'subsection',
 Text:'text'

City: 'city',
 Countr

In [89]:
def llm(prompt, model='mistral-large-2411'):
    client = Mistral(api_key = api_key)
    response = client.chat.complete(
        model= model,
        messages=[UserMessage(content=prompt)],
    )    
    return response.choices[0].message.content

In [90]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [91]:
answer = rag(query)
print(answer)

Based on the provided CONTEXT, there is no specific information about attractions, restaurants, or activities for London. Therefore, I cannot create a three-day itinerary for your trip to London.


In [92]:
# Ask another question 
answer = rag('I want some tips before visting cairo')
print(answer)

Based on the provided CONTEXT, there is no specific information available about Cairo. Therefore, I cannot provide any tips or itineraries for visiting Cairo.
