In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import ast  # For converting stringified lists to actual lists

# Load the small English spaCy pipeline once, at cell execution time.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook —
# confirm it is still needed before paying for this load on every run.
nlp = spacy.load("en_core_web_sm")

In [2]:

## 1. Data Loading and Preprocessing

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, accuracy_score, f1_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from geopy.distance import geodesic
import random

# Download NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Load the dataset
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
df = pd.read_csv(r"C:\Users\harsh\Downloads\merged_dataset.csv")


def _parse_amenities_cell(value):
    """Turn a stringified list such as "['Pool', 'Bar']" into a real list.

    Values that are not strings starting with '[' are returned unchanged.
    The text is parsed with ast.literal_eval, which only accepts Python
    literals, so CSV content is never executed as code. A cell that cannot
    be parsed is returned unchanged instead of aborting the whole load.
    """
    if not (isinstance(value, str) and value.startswith('[')):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return value


# Only object-dtype columns can hold the string representation of a list.
if df['amenities'].dtype == 'object':
    df['amenities'] = df['amenities'].apply(_parse_amenities_cell)

# Clean amenities - ensure it's a list and properly formatted
def clean_amenities(amenities):
    """Normalise one raw `amenities` cell into a list of stripped strings.

    Args:
        amenities: A list, a string containing a Python list literal
            (e.g. "['Pool', 'Bar']"), or any other value such as NaN.

    Returns:
        list: The string items with surrounding whitespace removed; non-string
        items are dropped. An empty list is returned when the value is not a
        list and cannot be parsed into one.
    """
    if isinstance(amenities, str):
        try:
            # literal_eval only accepts literals, so text coming from the
            # dataset can never be executed as code (unlike eval()).
            amenities = ast.literal_eval(amenities)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    if isinstance(amenities, list):
        return [item.strip() for item in amenities if isinstance(item, str)]
    return []

# Derive a clean list of amenities and a comma-separated display string per row.
df['amenities_cleaned'] = df['amenities'].apply(clean_amenities)
df['amenities_str'] = df['amenities_cleaned'].apply(lambda x: ', '.join(x) if x else '')

# Create a combined text field for content-based filtering.
# Each part is NaN-filled and cast to str first: with plain `+`, a single missing
# value (e.g. no subcategories) would turn the whole row's combined text into NaN.
_combined_columns = ['name', 'subcategories', 'amenities_str', 'country', 'city']
_combined_text = df[_combined_columns[0]].fillna('').astype(str)
for _column in _combined_columns[1:]:
    _combined_text = _combined_text + ' ' + df[_column].fillna('').astype(str)
df['combined_features'] = _combined_text

# Clean the text
def clean_text(text):
    """Lower-case a string and reduce it to ASCII letters, digits and single spaces.

    Non-string input (for example NaN) yields an empty string. Every character
    that is not an ASCII letter, digit or whitespace becomes a space, after which
    whitespace runs are collapsed and the ends are trimmed.
    """
    if isinstance(text, str):
        lowered = text.lower()
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
        return re.sub(r'\s+', ' ', alnum_only).strip()
    return ""

# Run both free-text columns through the same cleaner.
for target_col, source_col in (('combined_features', 'combined_features'),
                               ('description_clean', 'description')):
    df[target_col] = df[source_col].apply(clean_text)

# Prices must be numeric; anything unparsable becomes NaN here.
for price_col in ('LowerPrice', 'UpperPrice'):
    df[price_col] = pd.to_numeric(df[price_col], errors='coerce')

# Missing ratings and prices are represented as 0 from here on.
for zero_filled_col in ('rating', 'LowerPrice', 'UpperPrice'):
    df[zero_filled_col] = df[zero_filled_col].fillna(0)

# Coordinates must be numeric too; unparsable values stay NaN.
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

# Quick summary of what was loaded.
print(f"Dataset loaded with {len(df)} entries")
print(f"Attractions: {len(df[df['type'] == 'ATTRACTION'])}")
print(f"Hotels: {len(df[df['type'] == 'HOTEL'])}")
print(f"Dataset columns: {df.columns.tolist()}")


Dataset loaded with 14250 entries
Attractions: 3887
Hotels: 10290
Dataset columns: ['id', 'type', 'subcategories', 'name', 'description', 'rating', 'latitude', 'longitude', 'numberOfReviews', 'amenities', 'LowerPrice', 'UpperPrice', 'Rank', 'Total', 'Location', 'RankingType', 'country', 'city', 'regional_rating', 'image', 'webUrl', 'amenities_cleaned', 'amenities_str', 'combined_features', 'description_clean']


In [3]:
# Preview the first three rows, including the derived columns added by the preprocessing cell.
df.head(3)

Unnamed: 0,id,type,subcategories,name,description,rating,latitude,longitude,numberOfReviews,amenities,...,RankingType,country,city,regional_rating,image,webUrl,amenities_cleaned,amenities_str,combined_features,description_clean
0,7945044,ATTRACTION,Sights & Landmarks,Kuminda Farm,It's a small farm and we are into Argo tourism...,5.0,-20.98,27.25,22,bathroom only,...,things to do,Botswana,Francistown,3.0,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.com/Attraction_Review-...,[],,kuminda farm sights landmarks botswana francis...,it s a small farm and we are into argo tourism...
1,1743605,ATTRACTION,"Casinos & Gambling, Fun & Games",Gaborone Sun,Botswana is where the white-hot Kalahari Deser...,4.0,-24.65,25.93,124,bathroom only,...,things to do,Botswana,Gaborone,6.25,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.com/Attraction_Review-...,[],,gaborone sun casinos gambling fun games botswa...,botswana is where the white hot kalahari deser...
2,4162082,HOTEL,Specialty Lodging,Pelican Lodge & Camping,"The lodge is situated in Nata, Botswana. Clien...",3.0,-20.22,26.23,189,"[Pool, Internet, Free Internet, Free parking, ...",...,Specialty lodging,Botswana,Nata,2.25,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.com/Hotel_Review-g3161...,"[Pool, Internet, Free Internet, Free parking, ...","Pool, Internet, Free Internet, Free parking, R...",pelican lodge camping specialty lodging pool i...,the lodge is situated in nata botswana clients...


In [4]:
## 2. TF-IDF Vectorization and Similarity Computation

# Create TF-IDF vectorizer for the combined features
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

# Create TF-IDF vectorizer for descriptions
tfidf_vectorizer_desc = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix_desc = tfidf_vectorizer_desc.fit_transform(df['description_clean'])

# Compute the pairwise cosine similarity matrices.
# NOTE(review): each result is a dense len(df) x len(df) float array, and none of the
# cells visible in this notebook read them — confirm they are needed before keeping
# two of them in memory.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim_desc = cosine_similarity(tfidf_matrix_desc, tfidf_matrix_desc)

# Map each place name to its row index, keeping the first row for repeated names.
# (Series.drop_duplicates() would compare the *values* — the row indices, which are
# already unique — and therefore never removes a duplicated name.)
indices = pd.Series(df.index, index=df['name'])
indices = indices[~indices.index.duplicated(keep='first')]

print("TF-IDF matrices created successfully")
print(f"Shape of combined features similarity matrix: {cosine_sim.shape}")

TF-IDF matrices created successfully
Shape of combined features similarity matrix: (14250, 14250)


In [5]:
# 3. Entity Extraction Function

# Function to extract entities from a user query
def extract_entities(query, data=None):
    """Pull structured search criteria out of a free-text travel query.

    Args:
        query: The user's request as a string.
        data: Optional DataFrame with 'country' and 'city' columns used as the
            vocabulary of known places. Defaults to the notebook-level `df`.

    Returns:
        dict with the keys 'country', 'city', 'place_type', 'subcategories',
        'amenities' and 'price_preference'. Values that could not be detected
        are None (or an empty list for the two list-valued keys).
    """
    source = df if data is None else data
    query = query.lower()

    def _find_place(column):
        """Return the title-cased known place from `column` mentioned in the query, or None.

        Names must match as whole words/phrases, so "niger" is not detected inside
        "nigeria". Longer names are tried first; sorted() is stable, so names of
        equal length keep their dataset order. Missing and blank names are skipped.
        """
        known = source[column].dropna().astype(str).str.lower().unique().tolist()
        for name in sorted(known, key=len, reverse=True):
            if not name.strip() or name not in query:
                continue  # cheap substring pre-check before the regex
            if re.search(r'(?<!\w)' + re.escape(name) + r'(?!\w)', query):
                return name.title()
        return None

    # Countries and cities come from the actual dataset.
    country = _find_place('country')
    city = _find_place('city')

    # Place type: plain substring checks, so plurals such as "hotels" also match.
    place_type = None
    if any(word in query for word in ['hotel', 'lodging', 'accommodation', 'stay', 'room']):
        place_type = 'HOTEL'
    elif any(word in query for word in ['attraction', 'landmark', 'sight', 'visit', 'see', 'tour']):
        place_type = 'ATTRACTION'

    # Subcategories - add more based on your dataset
    subcategory_keywords = {
        'landmark': 'Sights & Landmarks',
        'sight': 'Sights & Landmarks',
        'casino': 'Casinos & Gambling',
        'gambling': 'Casinos & Gambling',
        'game': 'Fun & Games',
        'specialty': 'Specialty Lodging',
        'bed and breakfast': 'Bed and Breakfast',
        'b&b': 'Bed and Breakfast',
        'tour': 'Tours',
        'shopping': 'Shopping',
        'museum': 'Museums',
        'nature': 'Nature & Parks',
        'restaurant': 'Restaurants',
        'outdoor': 'Outdoor Activities'
    }
    subcategories = [
        subcategory
        for keyword, subcategory in subcategory_keywords.items()
        if keyword in query
    ]

    # Amenities (substring checks, reported title-cased)
    amenity_keywords = ['pool', 'internet', 'wifi', 'parking', 'restaurant', 'bar',
                        'breakfast', 'air conditioning', 'gym', 'fitness', 'spa',
                        'free', 'shuttle', 'airport', 'service']
    amenities = [amenity.title() for amenity in amenity_keywords if amenity in query]

    # Price preference: the first matching tier wins (low, then high, then medium).
    price_preference = None
    if any(term in query for term in ['low price', 'cheap', 'affordable', 'budget', 'inexpensive']):
        price_preference = 'low'
    elif any(term in query for term in ['high price', 'luxury', 'expensive', 'premium', 'high end']):
        price_preference = 'high'
    elif any(term in query for term in ['mid price', 'medium price', 'reasonable', 'average']):
        price_preference = 'medium'

    return {
        'country': country,
        'city': city,
        'place_type': place_type,
        'subcategories': subcategories,
        'amenities': amenities,
        'price_preference': price_preference
    }

In [6]:
# Function to calculate distance between two geo coordinates
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two lat/lon points.

    If any of the four coordinates is missing, infinity is returned so that the
    pair sorts last and never passes a distance threshold.
    """
    coordinates = (lat1, lon1, lat2, lon2)
    if any(pd.isna(value) for value in coordinates):
        return float('inf')

    return geodesic((lat1, lon1), (lat2, lon2)).kilometers

# Function to find nearby attractions based on city coordinates
def find_nearby_attractions(df, city_name, distance_threshold=100, top_n=5):
    """Find the best-rated attractions near a city.

    The city's position is taken from the first row of that city that has both
    coordinates, preferring HOTEL rows over other rows.

    Args:
        df: Dataset with 'type', 'city', 'rating', 'latitude' and 'longitude' columns.
        city_name: City to search around (compared case-insensitively).
        distance_threshold: Maximum distance in kilometres for an attraction to count as nearby.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame of up to `top_n` attractions sorted by rating (descending) and
        then distance (ascending), with an added 'distance' column. When the city
        is unknown or none of its rows has coordinates, the top-rated attractions
        of the whole dataset are returned instead, without a 'distance' column.
    """
    attractions_df = df[df['type'] == 'ATTRACTION'].copy()

    # Only rows with usable coordinates can serve as the city's reference point.
    # Using a row with NaN coordinates would make every distance infinite.
    city_rows = df[df['city'].str.lower() == city_name.lower()]
    located_rows = city_rows.dropna(subset=['latitude', 'longitude'])
    hotel_rows = located_rows[located_rows['type'] == 'HOTEL']
    reference_rows = located_rows if hotel_rows.empty else hotel_rows

    if reference_rows.empty or attractions_df.empty:
        # No reference point (or nothing to measure): fall back to the
        # top-rated attractions across the whole dataset.
        return attractions_df.sort_values('rating', ascending=False).head(top_n)

    city_lat = reference_rows.iloc[0]['latitude']
    city_lon = reference_rows.iloc[0]['longitude']

    # Distance from the city to every attraction
    attractions_df['distance'] = attractions_df.apply(
        lambda row: calculate_distance(city_lat, city_lon, row['latitude'], row['longitude']),
        axis=1
    )

    # Attractions without coordinates get an infinite distance; they can never be "nearby".
    reachable = attractions_df[attractions_df['distance'] < float('inf')]

    nearby_attractions = reachable[reachable['distance'] <= distance_threshold]

    # Nothing inside the radius: take the closest attractions instead.
    if nearby_attractions.empty:
        nearby_attractions = reachable.sort_values('distance').head(top_n)

    return nearby_attractions.sort_values(['rating', 'distance'], ascending=[False, True]).head(top_n)

# Function to recommend attractions and hotels separately
def recommend_attractions_and_hotels(df, query, top_n=5):
    """Recommend attractions and hotels separately for a free-text query.

    The query is parsed with extract_entities(); the detected country, city,
    subcategories, amenities and price preference are then applied as filters
    and sort keys. A filter that would leave nothing (subcategories, amenities)
    is skipped rather than emptying the result.

    Args:
        df: Dataset containing both ATTRACTION and HOTEL rows.
        query: The user's request as a string.
        top_n: Maximum number of rows returned per category.

    Returns:
        tuple (attractions, hotels): two DataFrames with at most `top_n` rows each.
        The attractions frame carries a 'distance' column when the nearby-search
        fallback was used; the hotels frame carries 'AvgPrice' and 'PriceDiff'
        when a medium price preference was applied.
    """
    entities = extract_entities(query)
    country = entities['country'].lower() if entities['country'] else None
    city = entities['city'].lower() if entities['city'] else None

    def _city_contains(frame):
        """Boolean mask of rows whose city contains the requested city.

        The city is matched literally (regex=False), so names containing regex
        metacharacters cannot raise or mis-match, and rows without a city count
        as non-matching (na=False) instead of producing a NaN mask.
        """
        return frame['city'].str.lower().str.contains(city, regex=False, na=False)

    # ------------------------------------------------------------ attractions
    attractions_df = df[df['type'] == 'ATTRACTION'].copy()

    if country:
        attractions_df = attractions_df[attractions_df['country'].str.lower() == country]

    attractions_found = True

    if city:
        # Exact city match first, then partial match.
        city_matches = attractions_df[attractions_df['city'].str.lower() == city]
        if len(city_matches) == 0:
            partial_matches = attractions_df[_city_contains(attractions_df)]
            if len(partial_matches) > 0:
                attractions_df = partial_matches
            else:
                # Nothing in this city: remember to search nearby further down
                # and keep the country-level frame for now.
                attractions_found = False
        else:
            attractions_df = city_matches

    if entities['subcategories']:
        # Keep rows whose subcategory text mentions any requested subcategory.
        mask = attractions_df['subcategories'].apply(
            lambda x: any(sub.lower() in str(x).lower() for sub in entities['subcategories'])
        )
        filtered = attractions_df[mask]
        if not filtered.empty:
            attractions_df = filtered

    attractions_df = attractions_df.sort_values('rating', ascending=False)

    # No attractions in the requested city: replace the list with nearby ones.
    if not attractions_found and entities['city']:
        print(f"\n🔍 No attractions found in {entities['city']}. Finding nearby attractions...")
        attractions_df = find_nearby_attractions(df, entities['city'], distance_threshold=100, top_n=top_n)

    # Too few results: top up with the best-rated attractions of the country.
    if len(attractions_df) < top_n and country:
        remaining_spots = top_n - len(attractions_df)
        country_attractions = df[(df['type'] == 'ATTRACTION') &
                                 (df['country'].str.lower() == country)].sort_values('rating', ascending=False)

        # Do not list the same attraction twice.
        if not attractions_df.empty:
            country_attractions = country_attractions[~country_attractions['id'].isin(attractions_df['id'])]

        attractions_df = pd.concat([attractions_df, country_attractions.head(remaining_spots)])

    # ----------------------------------------------------------------- hotels
    hotels_df = df[df['type'] == 'HOTEL'].copy()

    if country:
        hotels_df = hotels_df[hotels_df['country'].str.lower() == country]

    if city:
        # Exact city match first, otherwise fall back to a partial match
        # (which may legitimately leave no hotels).
        city_matches = hotels_df[hotels_df['city'].str.lower() == city]
        if len(city_matches) == 0:
            hotels_df = hotels_df[_city_contains(hotels_df)]
        else:
            hotels_df = city_matches

    if entities['amenities']:
        def has_amenities(amenities_list, query_amenities):
            """True when any requested amenity occurs in the hotel's amenity text."""
            if not isinstance(amenities_list, list):
                return False

            amenities_str = ' '.join(str(item).lower() for item in amenities_list)
            return any(amenity.lower() in amenities_str for amenity in query_amenities)

        mask = hotels_df['amenities_cleaned'].apply(lambda x: has_amenities(x, entities['amenities']))
        filtered = hotels_df[mask]
        if not filtered.empty:
            hotels_df = filtered

    # Rating is always the primary sort key (high to low).
    hotels_df = hotels_df.sort_values('rating', ascending=False)

    # The price preference only breaks ties between equally rated hotels.
    if entities['price_preference']:
        if entities['price_preference'] == 'low':
            hotels_df = hotels_df.sort_values(['rating', 'LowerPrice'], ascending=[False, True])
        elif entities['price_preference'] == 'high':
            hotels_df = hotels_df.sort_values(['rating', 'UpperPrice'], ascending=[False, False])
        elif entities['price_preference'] == 'medium':
            # Rank by closeness of each hotel's mid-range price to the median one.
            hotels_df['AvgPrice'] = (hotels_df['LowerPrice'] + hotels_df['UpperPrice']) / 2
            median_price = hotels_df['AvgPrice'].median()
            hotels_df['PriceDiff'] = abs(hotels_df['AvgPrice'] - median_price)
            hotels_df = hotels_df.sort_values(['rating', 'PriceDiff'], ascending=[False, True])

    return attractions_df.head(top_n), hotels_df.head(top_n)

In [7]:
def format_price(price):
    """Render a price for display.

    Missing or zero prices become "N/A"; amounts of 1000 and above are shown in
    thousands with one decimal and a "K" suffix; anything else is rounded to a
    whole dollar amount.
    """
    is_missing = pd.isna(price) or price == 0
    if is_missing:
        return "N/A"
    return f"${price/1000:.1f}K" if price >= 1000 else f"${price:.0f}"

def format_amenities(amenities_list):
    """Render a list of amenities as one comma-separated display string.

    Only string items are used; they are stripped, blanks are dropped and
    repeats are removed while keeping first-seen order. Anything that is not a
    non-empty list, or that leaves no usable item, gives "No amenities listed".
    """
    placeholder = "No amenities listed"
    if not (amenities_list and isinstance(amenities_list, list)):
        return placeholder

    stripped = (entry.strip() for entry in amenities_list if isinstance(entry, str))
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_items = list(dict.fromkeys(text for text in stripped if text))

    return ", ".join(unique_items) if unique_items else placeholder

def _print_rating(place):
    """Print the star bar, numeric rating and review count of one result row."""
    whole_stars = int(place['rating'])
    print(f"   Rating: {'⭐' * whole_stars}{' ' * (5 - whole_stars)} {place['rating']}/5 ({place['numberOfReviews']} reviews)")


def _print_description(place):
    """Print the full description of one result row, or a placeholder when it is not text."""
    desc = place['description']
    if isinstance(desc, str):
        print(f"   Description: {desc}")
    else:
        print("   Description: Not available")


def process_user_query(query, df):
    """Parse a natural-language query, print the recommendations and return them.

    Args:
        query: The user's request as a string.
        df: Dataset handed on to recommend_attractions_and_hotels().

    Returns:
        tuple (recommended_attractions, recommended_hotels): the two DataFrames
        produced by recommend_attractions_and_hotels() with top_n=5.
    """
    print(f"Processing query: '{query}'")
    print("-" * 80)

    # Show what was understood from the query (only the detected values).
    entities = extract_entities(query)
    print("📋 Query Analysis:")
    for key, value in entities.items():
        if value:
            print(f"- {key.title()}: {value}")
    print("-" * 80)

    recommended_attractions, recommended_hotels = recommend_attractions_and_hotels(df, query, top_n=5)

    # ------------------------------------------------------------ attractions
    print("🏛️ TOP 5 RECOMMENDED ATTRACTIONS:")
    if not recommended_attractions.empty:
        for i, (_, attraction) in enumerate(recommended_attractions.iterrows(), 1):
            print(f"\n{i}. 🌟 {attraction['name']} ({attraction['subcategories']})")
            _print_rating(attraction)

            # A price range is only shown when at least one bound is a positive number.
            price_info = "Price: "
            if not pd.isna(attraction['LowerPrice']) and not pd.isna(attraction['UpperPrice']) and (attraction['LowerPrice'] > 0 or attraction['UpperPrice'] > 0):
                price_info += f"{format_price(attraction['LowerPrice'])} - {format_price(attraction['UpperPrice'])}"
            else:
                price_info += "Not available"
            print(f"   {price_info}")

            print(f"   Location: {attraction['city']}, {attraction['country']}")

            print(f"   Image: {attraction['image']}")
            print(f"   Web URL: {attraction['webUrl']}")

            # 'distance' only exists when the nearby-attractions fallback produced the row.
            if 'distance' in attraction and not pd.isna(attraction['distance']):
                print(f"   Distance: {attraction['distance']:.1f} km")

            _print_description(attraction)
    else:
        print("No attractions found matching your criteria.")
    print("-" * 80)

    # ----------------------------------------------------------------- hotels
    print("🏨 TOP 5 RECOMMENDED HOTELS:")
    if not recommended_hotels.empty:
        for i, (_, hotel) in enumerate(recommended_hotels.iterrows(), 1):
            print(f"\n{i}. 🌟 {hotel['name']} ({hotel['subcategories']})")
            _print_rating(hotel)

            price_range = f"   Price Range: {format_price(hotel['LowerPrice'])} - {format_price(hotel['UpperPrice'])}"
            print(price_range)

            print(f"   Location: {hotel['city']}, {hotel['country']}")

            formatted_amenities = format_amenities(hotel['amenities_cleaned'])
            print(f"   Amenities: {formatted_amenities}")

            print(f"   Image: {hotel['image']}")
            print(f"   Web URL: {hotel['webUrl']}")

            _print_description(hotel)
    else:
        print("No hotels found matching your criteria.")

    return recommended_attractions, recommended_hotels


In [8]:
def evaluate_recommendation_system(df, expected_results=None):
    """Run the predefined test queries and report precision/accuracy per query.

    Args:
        df: Dataset handed on to process_user_query().
        expected_results: Optional list with one list of relevant place names per
            test query, in the same order as the queries. When omitted, every
            query is evaluated against an empty list, because this notebook
            defines no ground truth for these queries.

    Names are compared case-insensitively and ignoring surrounding whitespace.
    Predicted names are the attraction names followed by the hotel names returned
    for the query; each of them counts as "retrieved".
    """
    test_queries = [
        "Show me attractions in Gaborone, Botswana",
        "I need a hotel in Francistown with a swimming pool",
        "What are the best wildlife attractions in Botswana?",
        "Recommend affordable hotels in Maun",
        "I want to visit historical landmarks in Botswana"
    ]

    if expected_results is None:
        # Previously this name was never defined, so the loop raised NameError.
        expected_results = [[] for _ in test_queries]

    print("\n" + "=" * 80)
    print("📊 SYSTEM EVALUATION")
    print("=" * 80)

    for i, (query, expected) in enumerate(zip(test_queries, expected_results), 1):
        print(f"\nTest Query {i}: '{query}'")
        print("-" * 80)

        # process_user_query returns two DataFrames, not a list of names.
        recommended_attractions, recommended_hotels = process_user_query(query, df)
        predicted = []
        for frame in (recommended_attractions, recommended_hotels):
            predicted.extend(frame['name'].dropna().astype(str).str.strip().str.lower().tolist())

        relevant = {str(item).strip().lower() for item in expected}

        # Binary relevance vectors for calculating metrics
        y_true = [1 if item in relevant else 0 for item in predicted]
        y_pred = [1] * len(predicted)  # all predicted items are considered 'retrieved'

        # Precision: correct results / total predicted
        if y_true:
            precision = precision_score(y_true, y_pred, zero_division=0)
            accuracy = accuracy_score(y_true, y_pred)
            print(f"✅ Precision: {precision:.2f}")
            print(f"✅ Accuracy: {accuracy:.2f}")
        else:
            print("⚠️ No relevant results found. Precision & Accuracy = 0")


In [9]:
def interactive_recommendation(df):
    """Run the console loop of the travel recommender.

    Prints a usage banner, then reads queries until the user types 'exit'.
    Typing 'evaluate' runs the system evaluation; any other input is treated
    as a travel query. Commands are matched case-insensitively after trimming.
    """
    banner_lines = (
        "=" * 80,
        "🌍 TRAVEL RECOMMENDATION SYSTEM",
        "=" * 80,
        "\nThis system provides personalized travel recommendations.",
        "You can ask questions like:",
        "- I want to go to Nata, Botswana. Suggest attractions like landmarks.",
        "- Looking for hotels in Maun with pool and internet. Price should be affordable.",
        "- Recommend top attractions in Gaborone.",
        "\nType 'exit' to quit the system.",
        "Type 'evaluate' to run system evaluation.",
        "\n" + "-" * 80 + "\n",
    )
    for banner_line in banner_lines:
        print(banner_line)

    while True:
        query = input("\nEnter your travel query: ").strip()
        command = query.lower()

        if command == 'exit':
            print("\nThank you for using the travel recommendation system. Goodbye!")
            return

        if command == 'evaluate':
            evaluate_recommendation_system(df)
            continue

        process_user_query(query, df)


In [10]:
def main():
    """Start the interactive recommender on the notebook-level DataFrame `df`."""
    # `df` is not created here: it is read from the enclosing (notebook/module)
    # scope, so the data-loading cell must have been run before calling main().
    
    # Hand the loaded dataset to the interactive console loop.
    interactive_recommendation(df)

# Start the interactive session when this code is executed as the main module.
if __name__ == "__main__":
    main()

🌍 TRAVEL RECOMMENDATION SYSTEM

This system provides personalized travel recommendations.
You can ask questions like:
- I want to go to Nata, Botswana. Suggest attractions like landmarks.
- Looking for hotels in Maun with pool and internet. Price should be affordable.
- Recommend top attractions in Gaborone.

Type 'exit' to quit the system.
Type 'evaluate' to run system evaluation.

--------------------------------------------------------------------------------




Enter your travel query:  I want to go to Nata, Botswana. Suggest attractions like landmarks. - Looking for hotels in Maun with pool and internet. Price should be affordable.


Processing query: 'I want to go to Nata, Botswana. Suggest attractions like landmarks. - Looking for hotels in Maun with pool and internet. Price should be affordable.'
--------------------------------------------------------------------------------
📋 Query Analysis:
- Country: Botswana
- City: Nata
- Place_Type: HOTEL
- Subcategories: ['Sights & Landmarks']
- Amenities: ['Pool', 'Internet']
- Price_Preference: low
--------------------------------------------------------------------------------

🔍 No attractions found in Nata. Finding nearby attractions...
🏛️ TOP 5 RECOMMENDED ATTRACTIONS:

1. 🌟 Safari Addicts (Tours, Other, Transportation, Outdoor Activities)
   Rating: ⭐⭐⭐⭐⭐ 5.0/5 (2 reviews)
   Price: $141 - $281
   Location: Maun, Botswana
   Image: https://media-cdn.tripadvisor.com/media/photo-o/1c/a0/ce/b9/caption.jpg
   Web URL: https://www.tripadvisor.com/Attraction_Review-g317055-d23147974-Reviews-Safari_Addicts-Maun_Ngamiland_East_North_West_District.html
   Distance: 32.1 km


Enter your travel query:  exit



Thank you for using the travel recommendation system. Goodbye!


In [1]:
from sklearn.metrics import precision_score, accuracy_score

def evaluate_recommendation_system(df):
    """Evaluate the attraction recommendations for the predefined test queries.

    For each query, the recommended attraction names are compared
    (case-insensitively, ignoring surrounding whitespace) with the expected
    names, and precision and accuracy are printed. Hotels are not evaluated.
    """
    # Query -> names that count as relevant for it.
    # NOTE(review): the expected names mention Ghana although the query asks for
    # Botswana, and one entry ends in "(Tours, Outdoor Activities)", which looks like
    # the printed "name (subcategories)" format rather than a bare name — confirm
    # this ground truth.
    expected_by_query = {
        "I want to visit historical landmarks in Botswana": [
            "Best Ghana Tours ",
            "Grassroot Tours Ghana",
            "Mole Boy Travel & Safe Guide Tours (Tours, Outdoor Activities)",
            "Larabanga Mosque",
        ],
    }

    print("\n" + "=" * 80)
    print("📊 SYSTEM EVALUATION")
    print("=" * 80)

    for number, (query, raw_expected) in enumerate(expected_by_query.items(), start=1):
        print(f"\nTest Query {number}: '{query}'")
        print("-" * 80)

        attractions, _hotels = process_user_query(query, df)
        normalised_names = attractions['name'].dropna().astype(str).str.strip().str.lower()
        predicted = normalised_names.tolist()
        expected = [entry.strip().lower() for entry in raw_expected]

        if len(predicted) == 0:
            print("❌ No predictions made.")
            continue

        # Relevance of each prediction; every prediction counts as retrieved.
        y_true = [int(name in expected) for name in predicted]
        y_pred = [1 for _ in predicted]

        if not any(y_true):
            print("⚠️ No relevant results found. Precision & Accuracy = 0")
        else:
            precision = precision_score(y_true, y_pred, zero_division=0)
            accuracy = accuracy_score(y_true, y_pred)
            print(f"Precision: {precision:.2f}")
            print(f"Accuracy: {accuracy:.2f}")

        # Debug listing of what was compared
        print(f" Predicted: {predicted}")
        print(f"Expected: {expected}")


In [2]:
# Requires `df` (and process_user_query) from the earlier cells: the execution counter
# restarted at In [1] above, and on that fresh kernel this call fails with the NameError shown below.
evaluate_recommendation_system(df)


NameError: name 'df' is not defined