In [1]:
import pymongo
from langchain_groq import ChatGroq
from langchain.schema import SystemMessage, HumanMessage
from dotenv import load_dotenv
from PIL import Image, ImageDraw, ImageFont
import random
import json
import re
import os
from openai import AzureOpenAI

load_dotenv(dotenv_path="../config.env")

True

In [2]:
def create_llm_client():
    """Build the Azure OpenAI client from environment configuration.

    Reads ``AZURE_OPENAI_API_KEY`` and ``AZURE_OPENAI_ENDPOINT`` from the
    environment and pins the API version to ``2025-03-01-preview``.

    Returns:
        A new ``AzureOpenAI`` instance.
    """
    client_settings = {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_version": "2025-03-01-preview",
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    }
    return AzureOpenAI(**client_settings)

def connect_to_db():
    """Create a MongoDB client for the URI stored in ``MONGODB_URI_REMOTE``.

    Returns:
        The ``pymongo.MongoClient`` instance; the caller selects the database.
    """
    mongo_uri = os.getenv("MONGODB_URI_REMOTE")
    return pymongo.MongoClient(mongo_uri)


def extract_locations_data(db):
    """Read every ``Location`` document into a ``{"[<name>]": coordinates}`` dict.

    Args:
        db: Mapping-like database handle; ``db["Location"].find()`` must yield
            documents exposing ``doc["name"]`` and
            ``doc["coordinates"]["coordinates"]``.

    Returns:
        Dict keyed by the location name wrapped in square brackets. When two
        documents share a name, the later document's coordinates are kept.
    """
    coords_by_name = {}
    for location_doc in db["Location"].find():
        name = location_doc["name"]
        coords_by_name[name] = location_doc["coordinates"]["coordinates"]

    # Second pass: wrap each name in brackets for the prompt/lookup format.
    bracketed = {}
    for name, coords in coords_by_name.items():
        bracketed[f"[{name}]"] = coords
    return bracketed

def extract_business_category_metadata(db):
    """List the distinct ``category`` values of ``Business`` documents.

    Args:
        db: Mapping-like database handle; ``db["Business"].find()`` must yield
            documents exposing ``doc["category"]``.

    Returns:
        List of categories in the order they are first encountered.
    """
    seen_categories = []
    for business_doc in db["Business"].find():
        if business_doc["category"] in seen_categories:
            continue
        seen_categories.append(business_doc["category"])
    return seen_categories

def extract_business_data(db):
    """Read every ``Business`` document into a ``{"[<name>]": details}`` dict.

    Args:
        db: Mapping-like database handle; ``db["Business"].find()`` must yield
            documents exposing ``name``, ``address``, ``category``,
            ``coordinates["coordinates"]`` and ``averageRating``.

    Returns:
        Dict keyed by the business name wrapped in square brackets. Each value
        is ``[address, category, coordinates, averageRating]``. When two
        documents share a name, the later document's details are kept.
    """
    details_by_name = {}
    for business_doc in db["Business"].find():
        name = business_doc["name"]
        details_by_name[name] = [
            business_doc["address"],
            business_doc["category"],
            business_doc["coordinates"]["coordinates"],
            business_doc["averageRating"],
        ]

    # Second pass: wrap each name in brackets for the prompt/lookup format.
    bracketed = {}
    for name, details in details_by_name.items():
        bracketed[f"[{name}]"] = details
    return bracketed

def clean_json_response(response):
    """Strip Markdown code fences from an LLM reply, keeping their content.

    Recognises fences tagged ``json`` in any letter case as well as untagged
    fences, tolerates spaces/tabs around the tag and before the closing fence,
    and accepts both ``\\n`` and ``\\r\\n`` line endings. Text outside a fence
    is left as is, and the final result is whitespace-trimmed.

    Args:
        response: Reply text (``str``).

    Returns:
        The text with each recognised fence replaced by its inner content.
    """
    fence_pattern = r"```[ \t]*(?:json)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```"
    unfenced = re.sub(fence_pattern, r"\1", response, flags=re.DOTALL | re.IGNORECASE)
    return unfenced.strip()

# Need bigger model for larger capacity
def trim_business_data(business_data, max_tokens=5000, seed=None):
    """Return a random subset of ``business_data`` that fits a size budget.

    Entries are visited in shuffled order and copied to the result until the
    next entry would push the running total above ``max_tokens``; selection
    stops at that first entry that does not fit. An entry's size is the
    character length of its JSON-serialised value, used as a rough stand-in
    for a token count.

    Args:
        business_data: Dict of business key -> JSON-serialisable details.
        max_tokens: Size budget, in the units described above.
        seed: Optional seed. When given, a private ``random.Random(seed)`` does
            the shuffle so the same input always yields the same subset. When
            ``None`` (default) the module-level ``random`` state is used.

    Returns:
        A new dict holding the selected entries; ``business_data`` is not modified.
    """
    entries = list(business_data.items())
    if seed is None:
        random.shuffle(entries)
    else:
        random.Random(seed).shuffle(entries)

    budget_used = 0
    trimmed = {}
    for business_key, details in entries:
        entry_cost = len(json.dumps(details))
        if budget_used + entry_cost > max_tokens:
            break
        trimmed[business_key] = details
        budget_used += entry_cost

    return trimmed


In [27]:
# Connect to MongoDB and select the application database.
db_client = connect_to_db()
db = db_client["OdysseumDatabase"]

# Azure OpenAI client used by the LLM-based helpers.
llm_client = create_llm_client()

# Load locations, businesses and the distinct business categories from the DB.
locations_data = extract_locations_data(db)
business_data = extract_business_data(db)
business_category_data = extract_business_category_metadata(db)

# Optional: shrink the business payload to fit a smaller model's capacity.
# business_data = trim_business_data(business_data)   # Comment out when we have bigger model

In [48]:
# Show the distinct business categories found in the database.
business_category_data

['Restaurant', 'Hotel', 'Entertainment', 'Other', 'Services']

In [38]:
# Hard-coded sample itinerary used to exercise the recommendation helpers.
itinerary = {
    'id': 'itinerary-1744614246322',
    'createdAt': '2025-04-14T07:04:06.322Z',
    'lastModified': '2025-04-14T07:04:06.322Z',
    'optimization': 'ratings',
    'destinations': [
        {
            'id': '67310369aa977e99fcc2c320',
            'name': 'Murree, Punjab',
            'coordinates': None,
            'address': None,
            'stops': [
                {'id': 'stop-1744611998910-5pe3co075', 'category': 'hotel'},
            ],
        },
        {
            'id': '67310369aa977e99fcc2c321',
            'name': 'Fairy Meadows, Gilgit-Baltistan',
            'coordinates': None,
            'address': None,
            'stops': [
                {'id': 'stop-1744612001080-1wfp9xx80', 'category': 'restaurant'},
                {'id': 'stop-1744614077380-9otzgrkuf', 'category': 'entertainment'},
            ],
        },
    ],
}


In [21]:
def generate_itinerary_data(itinerary_data, business_data, llm_client, model="o3-mini"):
    """Ask the chat model to map each itinerary destination to matching businesses.

    The instructions go in the system message. The itinerary destinations and
    the business dictionary are serialised to JSON once and sent in the user
    message only, so the business payload is not repeated in the prompt.

    Args:
        itinerary_data: Dict with a ``'destinations'`` entry; only that entry
            is sent to the model.
        business_data: JSON-serialisable dictionary of businesses.
        llm_client: Object exposing
            ``chat.completions.create(model=..., messages=...)``.
        model: Value passed as ``model`` to the completions call.

    Returns:
        The reply text after ``clean_json_response``. It is returned as a
        string and is not parsed here.

    Raises:
        KeyError: If ``itinerary_data`` has no ``'destinations'`` key.
    """
    destinations_json = json.dumps(itinerary_data['destinations'], indent=2, ensure_ascii=False)
    businesses_json = json.dumps(business_data, indent=2, ensure_ascii=False)

    # Plain (non f-) string: the braces below are literal JSON braces.
    system_prompt = """
    You are a data mapping assistant. Your task is to connect travel itinerary stops to relevant businesses using the provided business database.

    Input Data (provided in the user message):
    - Itinerary Data (JSON): the itinerary destinations and their stops
    - Business Data (JSON): dictionary of businesses

    Instructions:
    - For each destination, match businesses that are relevant to the stop categories listed under it or if they closely match it in meaning.
    - If a destination has no matching businesses in the database, return an empty list.
    - Consider partial matches (e.g., "food" category may match restaurants, cafes, etc.)
    - Use city names to match locations, and categories to filter businesses.

    Output Format (JSON Dictionary):
    Strictly return a dictionary in this format:
    {
    "Location Name 1": [
        {
        "name": "Business Name",
        "address": "Business Address",
        "category": "Business Category",
        "rating": "Business Rating",
        "coordinates": "Business Coordinates"
        },
        ...
    ],
    "Location Name 2": []
    }

    Return ONLY this output dictionary
    """

    user_prompt = f"""
    Itinerary Data:
    {destinations_json}

    Business Data:
    {businesses_json}

    Generate the mapping as instructed.
    """

    # For Azure OpenAI, use the chat completions API
    response = llm_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
    )
    response_content = response.choices[0].message.content
    return clean_json_response(response_content)

In [29]:
def generate_business_location_data_algorithm(locations_data, business_data):
    """Group businesses under the location that ``find_closest_location`` picks.

    Args:
        locations_data: Dict of bracketed location name -> coordinates.
        business_data: Dict of bracketed business name -> sequence read
            positionally as ``[address, category, coordinates, rating]``.
            Missing trailing items default to ``""``, ``""``, ``None`` and
            ``"N/A"`` respectively.

    Returns:
        Dict of location name (with leading/trailing ``[``/``]`` stripped) ->
        list of business dicts with keys ``name``, ``address``, ``category``,
        ``rating`` and ``coordinates``. Every location appears, possibly with
        an empty list. Businesses without coordinates are skipped.
    """
    grouped = {raw_name.strip("[]"): [] for raw_name in locations_data.keys()}
    location_coords = {
        raw_name.strip("[]"): coords for raw_name, coords in locations_data.items()
    }

    for raw_key, fields in business_data.items():
        name = raw_key.strip("[]")
        # Positional fields; shorter records fall back to placeholders.
        address = fields[0] if len(fields) > 0 else ""
        category = fields[1] if len(fields) > 1 else ""
        coords = fields[2] if len(fields) > 2 else None
        rating = fields[3] if len(fields) > 3 else "N/A"

        if not coords:
            continue

        nearest = find_closest_location(coords, location_coords)
        if not nearest:
            continue

        grouped[nearest].append({
            "name": name,
            "address": address,
            "category": category,
            "rating": rating,
            "coordinates": coords
        })

    return grouped

def find_closest_location(business_coords, location_coords):
    """Return the name of the location with the smallest ``calculate_distance``.

    Args:
        business_coords: Coordinates of the business; a falsy value yields ``None``.
        location_coords: Dict of location name -> coordinates. Entries with
            falsy coordinates are ignored.

    Returns:
        The name of the nearest location (the first one wins on ties), or
        ``None`` when no location has a finite distance.
    """
    if not business_coords:
        return None

    best_name, best_distance = None, float('inf')
    for candidate_name, candidate_coords in location_coords.items():
        if not candidate_coords:
            continue
        candidate_distance = calculate_distance(business_coords, candidate_coords)
        # Strict comparison keeps the earliest candidate on equal distances.
        if candidate_distance < best_distance:
            best_name, best_distance = candidate_name, candidate_distance

    return best_name

def calculate_distance(coords1, coords2):
    """Euclidean distance between the first two components of two coordinates.

    The values are used as given (the original note assumes
    ``[longitude, latitude]`` order); no unit conversion is applied.

    Args:
        coords1: Indexable pair of numbers, or a falsy value.
        coords2: Indexable pair of numbers, or a falsy value.

    Returns:
        The distance as a float, or ``float('inf')`` when either argument is
        falsy or indexing/subtraction raises ``TypeError`` or ``IndexError``.
    """
    unreachable = float('inf')
    if not (coords1 and coords2):
        return unreachable

    try:
        squared_first = (coords1[0] - coords2[0]) ** 2
        squared_second = (coords1[1] - coords2[1]) ** 2
        return (squared_first + squared_second) ** 0.5
    except (TypeError, IndexError):
        return unreachable
    

In [40]:
CATEGORIES = ['restaurant', 'entertainment', 'service', 'hotel', 'other']

# Stored category labels that belong to one of the CATEGORIES buckets under a
# different spelling. The Business collection stores "Services" (plural),
# which would otherwise never match the 'service' bucket.
_CATEGORY_ALIASES = {'services': 'service'}


def _normalize_category(raw_category):
    """Lower-case a stored category label and fold known aliases into a CATEGORIES key."""
    label = str(raw_category or '').lower()
    return _CATEGORY_ALIASES.get(label, label)


def _rating_value(business):
    """Return the business rating as a float; missing or non-numeric ratings count as 0."""
    try:
        return float(business.get('rating') or 0)
    except (TypeError, ValueError):
        return 0.0


def generate_optimized_business_data(locations_data, business_data, optimization, top_n=5):
    """Pick the top businesses per location and category.

    Businesses are first grouped by location with
    ``generate_business_location_data_algorithm``. Within each location they
    are bucketed by category (case-insensitive, with ``'Services'`` folded
    into ``'service'``) and ordered by the chosen criterion.

    Args:
        locations_data: Dict of bracketed location name -> coordinates.
        business_data: Dict of bracketed business name -> details.
        optimization: ``'distance'`` orders by ascending ``calculate_distance``
            to the location's coordinates (from ``find_central_point``). Any
            other value, including ``'ratings'``, orders by descending rating.
            Matching is case-insensitive.
        top_n: Maximum number of businesses kept per category.

    Returns:
        ``{location: {category: [business, ...]}}`` with every entry of
        ``CATEGORIES`` present for every location, possibly as an empty list.
    """
    # First, grouping all businesses by location
    all_businesses_by_location = generate_business_location_data_algorithm(locations_data, business_data)
    sort_by_distance = str(optimization).lower() == 'distance'
    result = {}

    for location, businesses in all_businesses_by_location.items():
        result[location] = {category: [] for category in CATEGORIES}

        if not businesses:
            continue

        # Bucket once per location instead of re-scanning for every category.
        buckets = {category: [] for category in CATEGORIES}
        for business in businesses:
            bucket = buckets.get(_normalize_category(business.get('category')))
            if bucket is not None:
                bucket.append(business)

        if sort_by_distance:
            # The reference point does not depend on the category, so look it up once.
            central_point = find_central_point(location, locations_data)

        for category in CATEGORIES:
            if sort_by_distance:
                ranked = sorted(
                    buckets[category],
                    key=lambda b: calculate_distance(b.get('coordinates'), central_point),
                )
            else:
                ranked = sorted(buckets[category], key=_rating_value, reverse=True)

            result[location][category] = ranked[:top_n]
    return result

def find_central_point(location_name, locations_data):
    """Return the coordinates stored for ``location_name``.

    Args:
        location_name: Location name without surrounding brackets.
        locations_data: Dict of bracketed location name -> coordinates.

    Returns:
        The coordinates of the first entry whose key, with leading/trailing
        ``[``/``]`` stripped, equals ``location_name`` and whose coordinates
        are truthy; ``[0, 0]`` when there is no such entry.
    """
    matching_coords = (
        coords
        for raw_name, coords in locations_data.items()
        if raw_name.strip("[]") == location_name and coords
    )
    return next(matching_coords, [0, 0])

# Rank the businesses of every location by rating (default top_n of 5 per category).
optimized_data = generate_optimized_business_data(locations_data, business_data, 'ratings')

In [46]:
def get_itinerary_recommendations(itinerary, optimized_data):
    """Look up the ranked businesses for every stop of an itinerary.

    Args:
        itinerary: Dict with a ``'destinations'`` list. Each destination has a
            ``'name'`` and a ``'stops'`` list of dicts carrying a ``'category'``.
            A missing or ``None`` ``'destinations'``/``'stops'`` is treated as
            empty.
        optimized_data: ``{location: {category: [business, ...]}}``.

    Returns:
        ``{location: {stop_category: [business, ...]}}``, keyed by the stop
        category exactly as given. A destination or category that is absent
        from ``optimized_data`` yields an empty list instead of raising. The
        category is looked up as given first, then lower-cased.
    """
    recommendations = {}
    for destination in itinerary.get('destinations') or []:
        location = destination.get('name')
        recommendations[location] = {}
        ranked_by_category = optimized_data.get(location) or {}

        for stop in destination.get('stops') or []:
            stop_category = stop.get('category')
            matches = ranked_by_category.get(stop_category)
            if matches is None and isinstance(stop_category, str):
                # Buckets are keyed in lower case; tolerate e.g. "Hotel".
                matches = ranked_by_category.get(stop_category.lower())

            recommendations[location][stop_category] = matches if matches is not None else []
    return recommendations

# Resolve each stop of the itinerary to the ranked businesses for its destination.
itinerary_recommendations = get_itinerary_recommendations(itinerary, optimized_data)

{'Murree, Punjab': {'hotel': [{'name': 'LOKAL X Murree Kashmir Point', 'address': 'Kashmir Point, 25 Viewforth Rd, Murree, Pakistan', 'category': 'Hotel', 'rating': 4.8, 'coordinates': [73.4018359, 33.9094981]}, {'name': 'Jungle resorts murree', 'address': 'main Kuldana Rd, near Halal e Ahmar House, Murree, 47130, Pakistan', 'category': 'Hotel', 'rating': 4.7, 'coordinates': [73.39951359999999, 33.9146293]}, {'name': 'Montaña Villa Murree', 'address': 'Fazal Lodges, Murree, 47140, Pakistan', 'category': 'Hotel', 'rating': 4.7, 'coordinates': [73.4139194, 33.9167741]}, {'name': 'Fiora Hotel', 'address': 'Viewforth Rd, Murree, 47150, Pakistan', 'category': 'Hotel', 'rating': 4.7, 'coordinates': [73.4015612, 33.9073656]}, {'name': 'Cecil by Pearl Continental Hotels & Resorts', 'address': 'Mount View Road, Cecil Apartments Murree, Pakistan', 'category': 'Hotel', 'rating': 4.5, 'coordinates': [73.3897405, 33.9072856]}]}, 'Fairy Meadows, Gilgit-Baltistan': {'restaurant': [{'name': 'Nanga Par

In [None]:
# Full workflow: load data from the database, rank businesses per location,
# then resolve the itinerary's stops. Relies on `db` and `itinerary` being
# defined by earlier cells.

locations_data = extract_locations_data(db)
business_data = extract_business_data(db)
optimization = 'ratings'
optimized_data = generate_optimized_business_data(locations_data, business_data, optimization)
itinerary_recommendations = get_itinerary_recommendations(itinerary, optimized_data)


In [None]:
def generate_business_location_data(locations_data, business_data, llm_client):
    """Ask an LLM to group businesses under their locations.

    Both inputs are serialised to JSON inside the system message; the human
    message only triggers the generation.

    Args:
        locations_data: JSON-serialisable locations structure.
        business_data: JSON-serialisable dictionary of businesses.
        llm_client: Object exposing ``invoke(messages)`` whose result has a
            ``content`` attribute.
            NOTE(review): elsewhere in this notebook the client returned by
            ``create_llm_client`` is used through ``chat.completions.create``;
            confirm the object passed here really provides ``invoke``.

    Returns:
        The reply text after ``clean_json_response``. It is returned as a
        string and is not parsed here.
    """
    system_prompt = f"""
    You are an expert in data extraction. Your task is to group businesses under their respective cities.

    Input Data:
    - List of locations: {json.dumps(locations_data, indent=2, ensure_ascii=False)}
    - Dictionary of businesses: {json.dumps(business_data, indent=2, ensure_ascii=False)}

    Instructions:
    - Match businesses to their corresponding location.
    - If a location does not have any businesses, return an empty list.

    Output Format (Strict dictionary):
    Respond with only a valid dictionary, structured like this:
    {{
        "location_name": [
            {{
                "name": "Business Name",
                "address": "Business Address",
                "category": "Business Category",
                "rating": "Business Rating",
                "coordinates": "Location Coordinates"
            }},
            ...
        ]
    }}
    DO NOT return any code, STRICTLY return a DICTIONARY OBJECT
    """

    user_prompt = "Generate the dictionary as instructed."

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=user_prompt),
    ]

    response = llm_client.invoke(messages).content
    response = clean_json_response(response)
    return response