# Entertainment Recommendations Model

This notebook builds a content-based recommendation system for entertainment venues.

**Approach:** Content-based filtering using:
- Venue features (category, price tier, target audience, popularity)
- User preferences (budget, group type, preferred categories)

**Output:** Top-N recommended venues with relevance scores

In [0]:
import pandas as pd
import numpy as np
# NOTE(review): OneHotEncoder, MinMaxScaler and joblib are not referenced in the
# cells visible here - confirm they are unused before removing them.
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import json
import warnings
# Suppresses every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

print("Libraries loaded successfully")

Libraries loaded successfully


## 1. Load Venue Data

In [0]:
# Load venues data.
# The path is relative, so it is resolved against the kernel's current working
# directory.
venues = pd.read_parquet('entierement data/venues_enriched.parquet')
print(f"Loaded {len(venues)} venues")
print(f"Columns: {list(venues.columns)}")

Loaded 2425 venues
Columns: ['venue_id', 'venue_name', 'city', 'state', 'category', 'subcategory', 'price_tier', 'price_avg', 'is_free', 'rating', 'review_count', 'popularity_score', 'target_audience', 'estimated_duration_hours', 'good_for_tags', 'good_for_kids', 'wheelchair_accessible', 'outdoor_seating', 'has_happy_hour', 'has_parking', 'wifi', 'noise_level', 'ambience', 'activities', 'activity_count', 'topics', 'hours_description', 'has_seasonal_hours', 'entrance_passes', 'has_discount_passes', 'weather_info', 'image_count', 'has_contact_info', 'designation', 'data_source']


In [0]:
# Examine key columns: list the distinct values of the three categorical columns
# that are one-hot encoded during feature engineering.
print("\nCategories:", venues['category'].unique().tolist())
print("\nPrice tiers:", venues['price_tier'].unique().tolist())
print("\nTarget audiences:", venues['target_audience'].unique().tolist())


Categories: ['museum', 'tour', 'sports_recreation', 'performing_arts', 'attraction', 'outdoor_activity', 'theme_park']

Price tiers: ['medium', 'budget', 'expensive', 'luxury']

Target audiences: ['family', 'adults', 'all']


In [0]:
# Preview sample venues (first 10 rows, key descriptive columns only)
venues[['venue_name', 'city', 'state', 'category', 'price_tier', 'target_audience', 'popularity_score', 'rating']].head(10)

Unnamed: 0,venue_name,city,state,category,price_tier,target_audience,popularity_score,rating
0,Edwardsville Children's Museum,Edwardsville,IL,museum,medium,family,68.4,4.5
1,New Orleans Spirit Tours,New Orleans,LA,tour,medium,adults,69.0,4.0
2,Budweiser Brewery Experience,Saint Louis,MO,tour,budget,adults,91.0,4.5
3,David Thomas Trailways,Philadelphia,PA,tour,medium,adults,64.3,4.5
4,Gaylord Opryland Resort & Convention Center,Nashville,TN,museum,expensive,family,76.0,3.0
5,Enjoy The Mountain,Santa Barbara,CA,museum,medium,family,76.4,4.5
6,Watson Adventures Scavenger Hunts,Philadelphia,PA,museum,medium,adults,48.0,3.0
7,Best Tours,Philadelphia,PA,tour,medium,adults,33.0,1.0
8,The Music Box Village,New Orleans,LA,museum,medium,family,77.9,5.0
9,Taste of Nawlins,New Orleans,LA,tour,medium,adults,77.9,4.0


## 2. Create Venue Feature Vectors

In [0]:
# Feature configuration: the vocabularies that drive the one-hot encoding.
CATEGORIES = [
    'museum', 'tour', 'sports_recreation', 'performing_arts',
    'attraction', 'outdoor_activity', 'theme_park',
]
PRICE_TIERS = ['budget', 'medium', 'expensive', 'luxury']
AUDIENCES = ['family', 'adults', 'all']

# Numeric rank of each price tier (cheapest = 0), used for price filtering.
# PRICE_TIERS is already listed from cheapest to most expensive.
PRICE_TIER_ORDER = {tier: rank for rank, tier in enumerate(PRICE_TIERS)}

for _label, _values in (
    ("Categories", CATEGORIES),
    ("Price tiers", PRICE_TIERS),
    ("Audiences", AUDIENCES),
):
    print(f"{_label}: {len(_values)}")

Categories: 7
Price tiers: 4
Audiences: 3


In [0]:
def create_venue_features(venues_df, categories=None, price_tiers=None,
                          audiences=None, price_tier_order=None):
    """Return a copy of ``venues_df`` extended with the model's feature columns.

    The input frame is never modified; all new columns are written to a copy.

    Columns added:
        * ``cat_<c>``, ``price_<t>``, ``aud_<a>``: 1.0/0.0 indicators for each
          value in ``categories``, ``price_tiers`` and ``audiences``.
        * ``popularity_norm``: ``popularity_score / 100`` (missing -> 50).
        * ``rating_norm``: ``rating / 5`` (missing -> 3.5).
        * ``duration_norm``: ``estimated_duration_hours / 8`` (missing -> 2).
          The value is not clipped, so durations above 8 hours exceed 1.0.
        * ``is_kid_friendly``, ``is_accessible``, ``has_parking_flag``: float
          versions of ``good_for_kids``, ``wheelchair_accessible`` and
          ``has_parking`` (missing -> 0.0).
        * ``price_tier_num``: rank of ``price_tier`` according to
          ``price_tier_order``; tiers absent from the mapping become NaN.

    Args:
        venues_df: DataFrame with the raw venue columns listed above.
        categories: Category vocabulary; defaults to the notebook's CATEGORIES.
        price_tiers: Price tier vocabulary; defaults to PRICE_TIERS.
        audiences: Audience vocabulary; defaults to AUDIENCES.
        price_tier_order: Mapping tier name -> rank; defaults to
            PRICE_TIER_ORDER.

    Returns:
        A new DataFrame containing the original columns plus the features.
    """
    # Fall back to the notebook-level vocabularies only when no override is
    # supplied, so the function can also be driven from a saved configuration.
    if categories is None:
        categories = CATEGORIES
    if price_tiers is None:
        price_tiers = PRICE_TIERS
    if audiences is None:
        audiences = AUDIENCES
    if price_tier_order is None:
        price_tier_order = PRICE_TIER_ORDER

    # Work on a copy so the caller's frame is not changed as a side effect.
    features_df = venues_df.copy()

    # One-hot encode categorical features
    for cat in categories:
        features_df[f'cat_{cat}'] = (features_df['category'] == cat).astype(float)

    for tier in price_tiers:
        features_df[f'price_{tier}'] = (features_df['price_tier'] == tier).astype(float)

    for aud in audiences:
        features_df[f'aud_{aud}'] = (features_df['target_audience'] == aud).astype(float)

    # Numeric features (normalized); missing values get a neutral default first
    features_df['popularity_norm'] = features_df['popularity_score'].fillna(50) / 100
    features_df['rating_norm'] = features_df['rating'].fillna(3.5) / 5.0
    features_df['duration_norm'] = features_df['estimated_duration_hours'].fillna(2) / 8.0

    # Boolean features; a missing flag is treated as False
    features_df['is_kid_friendly'] = features_df['good_for_kids'].fillna(False).astype(float)
    features_df['is_accessible'] = features_df['wheelchair_accessible'].fillna(False).astype(float)
    features_df['has_parking_flag'] = features_df['has_parking'].fillna(False).astype(float)

    # Price tier numeric for filtering
    features_df['price_tier_num'] = features_df['price_tier'].map(price_tier_order)

    return features_df

# Apply feature engineering to a copy of the loaded frame, then rebind `venues`
# to the enriched result so later cells see the engineered columns.
venues = create_venue_features(venues.copy())
print(f"Features created. Total columns: {len(venues.columns)}")

Features created. Total columns: 56


In [0]:
# Columns that make up the similarity feature vector, in vector order:
# one-hot category / price / audience indicators first, then the numeric and
# boolean features.
_one_hot_columns = [
    f'{_prefix}_{_value}'
    for _prefix, _values in (('cat', CATEGORIES), ('price', PRICE_TIERS), ('aud', AUDIENCES))
    for _value in _values
]
_scalar_columns = [
    'popularity_norm', 'rating_norm', 'duration_norm',
    'is_kid_friendly', 'is_accessible', 'has_parking_flag',
]
FEATURE_COLUMNS = _one_hot_columns + _scalar_columns

print(f"Feature columns: {len(FEATURE_COLUMNS)}")
print(FEATURE_COLUMNS)

Feature columns: 20
['cat_museum', 'cat_tour', 'cat_sports_recreation', 'cat_performing_arts', 'cat_attraction', 'cat_outdoor_activity', 'cat_theme_park', 'price_budget', 'price_medium', 'price_expensive', 'price_luxury', 'aud_family', 'aud_adults', 'aud_all', 'popularity_norm', 'rating_norm', 'duration_norm', 'is_kid_friendly', 'is_accessible', 'has_parking_flag']


In [0]:
# Create feature matrix: one row per venue, one column per entry of
# FEATURE_COLUMNS.
# NOTE(review): `venue_features` is not referenced again in the visible cells;
# EntertainmentRecommender builds its own matrix from the frame it is given.
venue_features = venues[FEATURE_COLUMNS].values
print(f"Feature matrix shape: {venue_features.shape}")

Feature matrix shape: (2425, 20)


## 3. Build Recommendation Engine

In [0]:
class EntertainmentRecommender:
    """Content-based entertainment venue recommender.

    Every venue is scored by the cosine similarity between its engineered
    feature row and a preference vector built from the user's choices; the
    scored venues are then filtered and ranked.

    Args:
        venues_df: DataFrame holding the venue metadata plus every column named
            in ``feature_columns`` (and ``price_tier_num`` when price filtering
            is used). The frame is copied.
        feature_columns: Ordered iterable of feature column names. Position
            ``i`` of a preference vector corresponds to ``feature_columns[i]``.
        price_tier_order: Optional mapping of price tier name -> numeric rank
            used by ``max_price_tier``. When omitted, the notebook-level
            ``PRICE_TIER_ORDER`` is looked up at call time.
    """

    def __init__(self, venues_df, feature_columns, price_tier_order=None):
        self.venues = venues_df.copy()
        # Materialise as a list: positions are looked up with list.index(), and
        # a private copy keeps later changes to the caller's sequence out.
        self.feature_columns = list(feature_columns)
        self.feature_matrix = self.venues[self.feature_columns].values
        self.price_tier_order = price_tier_order

    def _set_weight(self, vector, column, weight, optional=False):
        """Write ``weight`` into the slot of ``column`` in ``vector``.

        With ``optional=True`` an unknown column is skipped silently (used for
        user-supplied values such as an unrecognised category); otherwise a
        missing column raises ValueError from ``list.index``.
        """
        if optional and column not in self.feature_columns:
            return
        vector[self.feature_columns.index(column)] = weight

    def create_preference_vector(self, preferences):
        """Create a preference vector from user input.

        Args:
            preferences: dict with optional keys ``categories`` (list of names,
                a single name, or None), ``budget_style``, ``group_type``,
                ``popularity_weight``, ``rating_weight``, ``needs_accessible``,
                ``has_kids`` and ``needs_parking``.

        Returns:
            1-D float array aligned with ``self.feature_columns``.
        """
        vector = np.zeros(len(self.feature_columns))

        # Category preferences (can be multiple). None counts as "no
        # preference", and a bare string as a single category rather than a
        # sequence of characters.
        preferred_categories = preferences.get('categories') or []
        if isinstance(preferred_categories, str):
            preferred_categories = [preferred_categories]
        for cat in preferred_categories:
            self._set_weight(vector, f'cat_{cat}', 1.0, optional=True)

        # If no specific categories, weight all category columns equally. The
        # columns come from the feature list itself, so the recommender also
        # works when rebuilt from a saved configuration.
        if not preferred_categories:
            category_columns = [
                col for col in self.feature_columns if col.startswith('cat_')
            ]
            for col in category_columns:
                self._set_weight(vector, col, 1.0 / len(category_columns))

        # Price tier preference
        budget_style = preferences.get('budget_style', 'medium')
        self._set_weight(vector, f'price_{budget_style}', 1.0, optional=True)

        # Audience preference
        group_type = preferences.get('group_type', 'all')
        self._set_weight(vector, f'aud_{group_type}', 1.0, optional=True)
        # Venues aimed at 'all' are also acceptable, at slightly lower weight
        if group_type != 'all':
            self._set_weight(vector, 'aud_all', 0.8)

        # Popularity and rating preferences (default to high)
        self._set_weight(vector, 'popularity_norm',
                         preferences.get('popularity_weight', 0.7))
        self._set_weight(vector, 'rating_norm',
                         preferences.get('rating_weight', 0.8))

        # Optional: accessibility, kid-friendly, parking
        optional_flags = (
            ('needs_accessible', 'is_accessible'),
            ('has_kids', 'is_kid_friendly'),
            ('needs_parking', 'has_parking_flag'),
        )
        for pref_key, column in optional_flags:
            if preferences.get(pref_key, False):
                self._set_weight(vector, column, 1.0)

        return vector

    def recommend(self, preferences, city=None, state=None, top_n=10, max_price_tier=None):
        """Get top-N recommendations based on preferences.

        Args:
            preferences: dict with keys like 'categories', 'budget_style', 'group_type'
            city: filter by city name, case-insensitive (optional; None or a
                blank string disables the filter)
            state: filter by state, case-insensitive (optional; None or a blank
                string disables the filter)
            top_n: number of recommendations to return
            max_price_tier: maximum price tier ('budget', 'medium', 'expensive',
                'luxury'); an unrecognised name is treated as the highest tier

        Returns:
            DataFrame with recommended venues and scores, best match first
        """
        # Create preference vector
        pref_vector = self.create_preference_vector(preferences).reshape(1, -1)

        # Score every venue before filtering
        similarities = cosine_similarity(pref_vector, self.feature_matrix)[0]

        # Create results dataframe
        results = self.venues.copy()
        results['match_score'] = similarities

        # Free-text inputs (e.g. a text widget) often carry stray whitespace;
        # a blank value means "no filter", exactly like None.
        if isinstance(city, str):
            city = city.strip()
        if isinstance(state, str):
            state = state.strip()

        # Apply filters
        if city:
            results = results[results['city'].str.lower() == city.lower()]

        if state:
            results = results[results['state'].str.upper() == state.upper()]

        if max_price_tier:
            tier_order = self.price_tier_order
            if tier_order is None:
                tier_order = PRICE_TIER_ORDER
            highest_rank = max(tier_order.values(), default=3)
            max_tier_num = tier_order.get(max_price_tier, highest_rank)
            # Rows whose price_tier_num is NaN fail the comparison and drop out
            results = results[results['price_tier_num'] <= max_tier_num]

        # Sort by match score and return top N
        results = results.sort_values('match_score', ascending=False).head(top_n)

        # Return relevant columns
        output_columns = [
            'venue_name', 'city', 'state', 'category', 'price_tier','price_avg',
            'target_audience', 'popularity_score', 'rating',
            'estimated_duration_hours', 'match_score'
        ]

        return results[output_columns].reset_index(drop=True)

# Initialize recommender.
# This notebook-level instance is the one the evaluation cell and
# get_recommendations_for_trip() refer to by name.
recommender = EntertainmentRecommender(venues, FEATURE_COLUMNS)
print("Recommender initialized successfully")

Recommender initialized successfully


## 4. Test Recommendations

In [0]:
# Test case 1: Family trip to New York, medium budget, looking for museums
print("TEST 1: Family trip to New York, medium budget, museums")

preferences_1 = {
    'categories': ['museum'],
    'budget_style': 'medium',
    'group_type': 'family',
    'has_kids': True  # puts weight on the is_kid_friendly feature
}

recs_1 = recommender.recommend(
    preferences_1, 
    city='New York',
    state='NY',
    top_n=5
)
# Render through the notebook's display() function: pandas defines no
# DataFrame.display() method, so the method form only works where the runtime
# patches one in.
display(recs_1)

TEST 1: Family trip to New York, medium budget, museums


venue_name,city,state,category,price_tier,price_avg,target_audience,popularity_score,rating,estimated_duration_hours,match_score
Metropolitan Museum of Art,New York,NY,museum,medium,30.0,adults,50.0,,2.5,0.6184036536935725
Museum of Modern Art (MoMA),New York,NY,museum,budget,25.0,adults,50.0,,2.5,0.4058938070634789
African Burial Ground National Monument,New York,NY,outdoor_activity,budget,0.0,all,50.0,,3.0,0.3613743846987601
Castle Clinton National Monument,New York,NY,outdoor_activity,budget,0.0,all,50.0,,3.0,0.3613743846987601
Ellis Island Part of Statue of Liberty National Monument,New York,NY,outdoor_activity,budget,0.0,all,50.0,,3.0,0.3613743846987601


In [0]:
# Test case 2: Adults trip to Los Angeles, expensive style, theme parks and attractions
print("TEST 2: Adults in Los Angeles, expensive, theme parks/attractions")

preferences_2 = {
    'categories': ['theme_park', 'attraction'],
    'budget_style': 'expensive',
    'group_type': 'adults'
}

recs_2 = recommender.recommend(
    preferences_2,
    city='Los Angeles',
    state='CA',
    top_n=5
)
# Render through the notebook's display() function: pandas defines no
# DataFrame.display() method, so the method form only works where the runtime
# patches one in.
display(recs_2)

TEST 2: Adults in Los Angeles, expensive, theme parks/attractions


venue_name,city,state,category,price_tier,price_avg,target_audience,popularity_score,rating,estimated_duration_hours,match_score
Universal Studios Hollywood,Los Angeles,CA,theme_park,expensive,139.0,family,50.0,,10.4,0.5198824627913637
Getty Center,Los Angeles,CA,museum,budget,0.0,adults,50.0,,2.5,0.4058938070634788


In [0]:
# Test case 3: Budget traveler in Florida, outdoor activities
print("TEST 3: Budget traveler in Florida, outdoor activities")

preferences_3 = {
    'categories': ['outdoor_activity'],
    'budget_style': 'budget',
    'group_type': 'all'
}

recs_3 = recommender.recommend(
    preferences_3,
    state='FL',  # no city given, so the whole state is searched
    max_price_tier='medium',  # Don't exceed medium price
    top_n=5
)
# Render through the notebook's display() function: pandas defines no
# DataFrame.display() method, so the method form only works where the runtime
# patches one in.
display(recs_3)

TEST 3: Budget traveler in Florida, outdoor activities


venue_name,city,state,category,price_tier,price_avg,target_audience,popularity_score,rating,estimated_duration_hours,match_score
Big Cypress National Preserve,Ochopee,FL,outdoor_activity,budget,0.0,all,50.0,,3.0,0.9766768480609292
Fort Matanzas National Monument,Saint Augustine,FL,outdoor_activity,budget,0.0,all,50.0,,3.0,0.9766768480609292
Biscayne National Park,Homestead,FL,outdoor_activity,budget,0.0,all,50.0,,3.0,0.9766768480609292
Canaveral National Seashore,New Smyrna Beach,FL,outdoor_activity,budget,25.0,all,50.0,,3.0,0.9766768480609292
Timucuan Ecological & Historic Preserve,Jacksonville,FL,outdoor_activity,budget,0.0,all,50.0,,3.0,0.9766768480609292


In [0]:
# Test case 4: Accessibility needs, any city
print("TEST 4: Accessible venues nationwide, family-friendly")

preferences_4 = {
    'categories': ['museum', 'attraction'],
    'budget_style': 'medium',
    'group_type': 'family',
    'needs_accessible': True,
    'has_kids': True
}

# No city/state filter: every venue in the table is a candidate.
recs_4 = recommender.recommend(
    preferences_4,
    top_n=10
)
# Render through the notebook's display() function: pandas defines no
# DataFrame.display() method, so the method form only works where the runtime
# patches one in.
display(recs_4)

TEST 4: Accessible venues nationwide, family-friendly


venue_name,city,state,category,price_tier,price_avg,target_audience,popularity_score,rating,estimated_duration_hours,match_score
Rhythm! Discovery Center,Indianapolis,IN,museum,medium,50.0,family,69.5,4.0,2.5,0.881217177428105
University Family Fun Center,Philadelphia,PA,museum,medium,50.0,family,69.0,4.0,2.5,0.8812043727278815
Laser Tag & Games,Metairie,LA,museum,medium,50.0,family,68.9,4.0,2.5,0.881201417360436
Santa Barbara Museum of Natural History Sea Center,Santa Barbara,CA,museum,medium,50.0,family,74.4,4.0,2.5,0.8811708515156672
Launch Trampoline Park,Deptford Township,NJ,museum,medium,50.0,family,66.4,4.0,2.5,0.8810845163694434
Ultrazone,Bensalem,PA,museum,medium,50.0,family,66.1,4.0,2.5,0.8810648948603526
One Liberty Observation Deck,Philadelphia,PA,museum,medium,50.0,family,77.3,4.0,2.5,0.880999376418917
Adventure Science Center,Nashville,TN,museum,medium,50.0,family,78.0,4.0,2.5,0.8809422669950381
Tucson Botanical Gardens,Tucson,AZ,museum,medium,50.0,family,78.9,4.0,2.5,0.8808599606333547
Bette's Family Fun Center,Aston,PA,museum,medium,50.0,family,72.4,4.5,2.5,0.8808197680713457


## 5. Evaluate Recommendation Quality

In [0]:
def evaluate_recommendations(recommender, test_cases):
    """Evaluate recommendation quality with basic metrics.

    Args:
        recommender: object exposing ``recommend(preferences, **filters)`` that
            returns a DataFrame with ``match_score``, ``popularity_score``,
            ``rating`` and ``category`` columns.
        test_cases: iterable of ``(name, preferences, filters)`` tuples, where
            ``filters`` are keyword arguments for ``recommend``. Ten results
            are requested per case unless ``filters`` supplies its own
            ``top_n``.

    Returns:
        DataFrame with one row per test case that produced at least one
        recommendation (cases with no results are skipped). The metric columns
        are always present, even when no case produced results.
    """
    metric_columns = [
        'test_case', 'num_results', 'avg_match_score',
        'category_precision', 'avg_popularity', 'avg_rating',
    ]
    results = []

    for name, prefs, filters in test_cases:
        # Merge the default into the filters instead of passing top_n
        # separately, so a case that sets its own top_n does not raise a
        # duplicate-keyword TypeError.
        recs = recommender.recommend(prefs, **{'top_n': 10, **filters})

        if len(recs) == 0:
            continue

        # Category precision (share of results in a preferred category)
        preferred_cats = prefs.get('categories', [])
        if preferred_cats:
            cat_precision = recs['category'].isin(preferred_cats).mean()
        else:
            cat_precision = 1.0  # No preference = all valid

        results.append({
            'test_case': name,
            'num_results': len(recs),
            'avg_match_score': recs['match_score'].mean(),
            'category_precision': cat_precision,
            'avg_popularity': recs['popularity_score'].mean(),
            # NaN when none of the returned venues has a rating
            'avg_rating': recs['rating'].mean()
        })

    # Passing the columns explicitly keeps the schema stable when `results` is
    # empty; otherwise downstream column lookups would raise KeyError.
    return pd.DataFrame(results, columns=metric_columns)

# Evaluation scenarios as (label, preferences, keyword filters for recommend()).
test_cases = [
    (
        'NYC Museums',
        dict(categories=['museum'], budget_style='medium', group_type='family'),
        dict(city='New York', state='NY'),
    ),
    (
        # Filtered by state only, so this covers all of California.
        'LA Theme Parks',
        dict(categories=['theme_park'], budget_style='expensive', group_type='family'),
        dict(state='CA'),
    ),
    (
        'Budget Outdoor',
        dict(categories=['outdoor_activity'], budget_style='budget', group_type='all'),
        dict(max_price_tier='medium'),
    ),
    (
        'Tours Nationwide',
        dict(categories=['tour'], budget_style='medium', group_type='adults'),
        dict(),
    ),
    (
        'Luxury Attractions',
        dict(categories=['attraction'], budget_style='luxury', group_type='adults'),
        dict(),
    ),
]

eval_results = evaluate_recommendations(recommender, test_cases)
print("\nRecommendation Quality Metrics:")
print(eval_results.to_string(index=False))


Recommendation Quality Metrics:
         test_case  num_results  avg_match_score  category_precision  avg_popularity  avg_rating
       NYC Museums           10         0.430619                 0.2           50.00         NaN
    LA Theme Parks           10         0.616126                 0.7           58.69    4.833333
    Budget Outdoor           10         0.976677                 1.0           50.00         NaN
  Tours Nationwide           10         0.915024                 1.0           72.87    4.100000
Luxury Attractions           10         0.598805                 0.0           72.67    4.500000


In [0]:
# Summary statistics: unweighted mean of each per-test-case metric.
# Test cases that returned no recommendations are absent from eval_results, and
# Series.mean() skips NaN, so cases with no rated venues do not contribute to
# the average rating.
print("\nOverall Performance:")
print(f"  Average Match Score: {eval_results['avg_match_score'].mean():.3f}")
print(f"  Average Category Precision: {eval_results['category_precision'].mean():.3f}")
print(f"  Average Popularity: {eval_results['avg_popularity'].mean():.1f}")
print(f"  Average Rating: {eval_results['avg_rating'].mean():.2f}")


Overall Performance:
  Average Match Score: 0.707
  Average Category Precision: 0.580
  Average Popularity: 60.8
  Average Rating: 4.48


## 6. Save Recommender for Deployment

In [0]:
# Output locations for the two deployment artifacts.
venues_with_features_path = '/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement data/venues_with_features.parquet'
config_path = '/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/recommender_config.json'

# 1) The venue table including the engineered feature columns.
venues.to_parquet(venues_with_features_path)
print(f"Venues with features saved to: {venues_with_features_path}")

# 2) The vocabularies and column order needed to rebuild the recommender.
recommender_config = dict(
    feature_columns=FEATURE_COLUMNS,
    categories=CATEGORIES,
    price_tiers=PRICE_TIERS,
    audiences=AUDIENCES,
    price_tier_order=PRICE_TIER_ORDER,
)

with open(config_path, 'w') as f:
    f.write(json.dumps(recommender_config, indent=2))
print(f"Recommender config saved to: {config_path}")

Venues with features saved to: /Workspace/Users/muradrahimli@campus.technion.ac.il/entierement data/venues_with_features.parquet
Recommender config saved to: /Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/recommender_config.json


In [0]:
# Round-trip check: reload both artifacts from disk.
loaded_venues = pd.read_parquet(venues_with_features_path)
with open(config_path, 'r') as f:
    loaded_config = json.loads(f.read())

# Rebuild a recommender purely from the reloaded data.
loaded_feature_columns = loaded_config['feature_columns']
loaded_recommender = EntertainmentRecommender(loaded_venues, loaded_feature_columns)

# Smoke test: three family-oriented, medium-budget museum recommendations.
test_prefs = dict(categories=['museum'], budget_style='medium', group_type='family')
test_recs = loaded_recommender.recommend(test_prefs, top_n=3)

verification_view = test_recs[['venue_name', 'city', 'match_score']]
print("\nVerification - Top 3 museum recommendations:")
print(verification_view.to_string(index=False))
print("\nRecommender saved and verified successfully")


Verification - Top 3 museum recommendations:
         venue_name         city  match_score
The Undercover Unit Philadelphia     0.918886
Gretna Bingo Palace       Gretna     0.918338
        Cactus Bowl       Tucson     0.917609

Recommender saved and verified successfully


## 7. Helper Function for Production Use

In [0]:
def get_recommendations_for_trip(
    city: str,
    state: str,
    budget_style: str = 'medium',
    group_type: str = 'all',
    preferred_categories: list = None,
    has_kids: bool = False,
    needs_accessible: bool = False,
    top_n: int = 10,
    needs_parking: bool = False
):
    """
    Get entertainment recommendations for a trip.

    Thin wrapper around the notebook-level ``recommender``: it assembles the
    preference dict and caps the price tier at the chosen budget style.

    Args:
        city: City name; an empty string or None searches the whole state
        state: State abbreviation (e.g., 'NY', 'CA'); an empty string or None
            disables the state filter
        budget_style: 'budget', 'medium', 'expensive', or 'luxury'. Also used
            as the maximum price tier, except for 'luxury', which applies no
            cap.
        group_type: 'family', 'adults', or 'all'
        preferred_categories: List of preferred categories (None or empty
            means no category preference)
        has_kids: Whether the group includes children
        needs_accessible: Whether wheelchair accessibility is needed
        top_n: Number of recommendations to return
        needs_parking: Whether on-site parking is needed

    Returns:
        DataFrame with recommended venues
    """
    preferences = {
        'categories': preferred_categories or [],
        'budget_style': budget_style,
        'group_type': group_type,
        'has_kids': has_kids,
        'needs_accessible': needs_accessible,
        'needs_parking': needs_parking
    }

    # 'luxury' is the top tier, so capping at it would filter nothing; pass
    # None to skip the price filter altogether.
    price_cap = None if budget_style == 'luxury' else budget_style

    return recommender.recommend(
        preferences,
        city=city,
        state=state,
        max_price_tier=price_cap,
        top_n=top_n
    )

# Example usage
print("Example: Family trip to Chicago with kids, medium budget")
recs = get_recommendations_for_trip(
    city='Chicago',
    state='IL',
    budget_style='medium',
    group_type='family',
    preferred_categories=['museum', 'attraction'],
    has_kids=True,
    top_n=5
)
# Render through the notebook's display() function: pandas defines no
# DataFrame.display() method, so the method form only works where the runtime
# patches one in.
display(recs)

Example: Family trip to Chicago with kids, medium budget


venue_name,city,state,category,price_tier,price_avg,target_audience,popularity_score,rating,estimated_duration_hours,match_score
Art Institute of Chicago,Chicago,IL,museum,budget,25.0,adults,50.0,,2.5,0.3747192441233852
Pullman National Historical Park,Chicago,IL,outdoor_activity,budget,0.0,all,50.0,,3.0,0.3336191238283541


### Widgets for interactive use
**Note:** you must run the widget-removal cell further below to close the widgets, or they will remain on your screen.

In [0]:
# Probe for the Databricks-provided `dbutils` object on its own, so that a
# NameError raised by anything else in this cell (for example `venues` not
# being loaded yet) is not misreported as "not running on Databricks".
try:
    dbutils
except NameError:
    print("Widgets only work in Databricks.")
else:
    # Remove any widgets left over from a previous run.
    dbutils.widgets.removeAll()

    # Get unique values from the data
    states_list = sorted(venues['state'].dropna().unique().tolist())
    categories_list = sorted(venues['category'].dropna().unique().tolist())
    price_tiers_list = ['budget', 'medium', 'expensive', 'luxury']
    audiences_list = ['family', 'adults', 'all']

    # The numeric prefixes in the labels fix the order in which widgets appear.
    dbutils.widgets.text("city", "", "1. City (leave empty for state-wide)")
    dbutils.widgets.dropdown("state", states_list[0], states_list, "2. State")
    dbutils.widgets.dropdown("budget_style", "medium", price_tiers_list, "3. Budget Style")
    dbutils.widgets.dropdown("group_type", "family", audiences_list, "4. Group Type")
    dbutils.widgets.multiselect("categories", categories_list[0], categories_list, "5. Categories")
    dbutils.widgets.dropdown("has_kids", "No", ["Yes", "No"], "6. Traveling with Kids?")
    dbutils.widgets.dropdown("needs_accessible", "No", ["Yes", "No"], "7. Need Wheelchair Access?")
    dbutils.widgets.dropdown("top_n", "10", ["5", "10", "15", "20"], "8. Number of Results")

    print(f"Widgets created from data: {len(states_list)} states, {len(categories_list)} categories")
    print("\n")
    print("widgets ready")
    print("1. Select your preferences in the widgets above")
    print("2. Run the NEXT CELL to get your recommendations")
    # Ends the notebook run here; presumably so that "Run All" does not reach
    # the recommendation cell before the user has picked values.
    dbutils.notebook.exit("Interactive mode - run recommendation cell manually")

After filling in the widgets (a blank field uses its default), run the next cell.

In [0]:
# Get recommendations based on widget values
user_city = dbutils.widgets.get("city")
user_state = dbutils.widgets.get("state")
user_budget = dbutils.widgets.get("budget_style")
user_group = dbutils.widgets.get("group_type")
user_categories = dbutils.widgets.get("categories")
user_has_kids = dbutils.widgets.get("has_kids") == "Yes"
user_accessible = dbutils.widgets.get("needs_accessible") == "Yes"
user_top_n = int(dbutils.widgets.get("top_n"))

category_list = [c.strip() for c in user_categories.split(",") if c.strip()]

print(f"Location: {user_city}, {user_state} | Budget: {user_budget} | Group: {user_group}")                                                      
print(f"Categories: {category_list} | Kids: {user_has_kids} | Accessible: {user_accessible}")         
                                                                                                                                                   
user_recs = get_recommendations_for_trip(
    city=user_city, state=user_state, budget_style=user_budget,
    group_type=user_group, preferred_categories=category_list or None,
    has_kids=user_has_kids, needs_accessible=user_accessible, top_n=user_top_n
  )

if len(user_recs) > 0:
    display(user_recs)
else:
    print(f"No venues found in {user_city}, {user_state}. Try different criteria.")

Location: ,  | Budget: medium | Group: family
Categories: ['attraction'] | Kids: False | Accessible: False


venue_name,city,state,category,price_tier,price_avg,target_audience,popularity_score,rating,estimated_duration_hours,match_score
The Reef Aquarium Shop,Indianapolis,IN,attraction,medium,50.0,family,71.3,4.0,2.0,0.9235509600850422
Animal House Naturals Pet Center,Saint Petersburg,FL,attraction,medium,50.0,family,65.9,4.0,2.0,0.9232750156495824
Pet Supplies Plus - Cherry Hill,Cherry Hill,NJ,attraction,medium,50.0,family,71.0,4.5,2.0,0.9229453444883622
Sea Life Fish & Aquariums,Saint Petersburg,FL,attraction,medium,50.0,family,70.4,4.5,2.0,0.9229210908936438
Reef To Rift Aquarium Store,Hatfield,PA,attraction,medium,50.0,family,62.8,4.0,2.0,0.9228509420374654
Aquatic Sealife,Metairie,LA,attraction,medium,50.0,family,62.4,4.0,2.0,0.9227817240237186
Seascape Studio,Saint Louis,MO,attraction,medium,50.0,family,67.3,4.5,2.0,0.9226866561488336
Current Aquatics,Cherry Hill,NJ,attraction,medium,50.0,family,66.0,4.5,2.0,0.9225330867915972
Aquatic Specialties,Kenner,LA,attraction,medium,50.0,family,59.2,4.0,2.0,0.9221063878541428
Williams Greenbank Aquarium,Wilmington,DE,attraction,medium,50.0,family,58.3,4.0,2.0,0.9218770238157236


### To remove the widgets, uncomment and run this:

In [0]:
#dbutils.widgets.removeAll()

In [0]:
import pandas as pd
import numpy as np
import pickle
import os
import shutil
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import HTML, display

# --- 1. CONFIGURATION ---
# Vocabularies used for one-hot encoding (same values as the feature
# configuration earlier in the notebook).
CATEGORIES = [
    'museum', 'tour', 'sports_recreation', 'performing_arts',
    'attraction', 'outdoor_activity', 'theme_park',
]
PRICE_TIERS = ['budget', 'medium', 'expensive', 'luxury']
AUDIENCES = ['family', 'adults', 'all']
# Tier name -> rank, cheapest first; PRICE_TIERS is already in ascending order.
PRICE_TIER_ORDER = dict(zip(PRICE_TIERS, range(len(PRICE_TIERS))))

# --- 2. PREPROCESSING FUNCTIONS ---
def create_venue_features(venues_df, categories=None, price_tiers=None,
                          audiences=None, price_tier_order=None):
    """Create the engineered feature columns for all venues.

    Works on a copy of ``venues_df``; the input frame is not modified.

    Columns added: ``cat_*`` / ``price_*`` / ``aud_*`` 1.0-or-0.0 indicators,
    ``popularity_norm`` (score / 100, missing -> 50), ``rating_norm``
    (rating / 5, missing -> 3.5), ``duration_norm`` (hours / 8, missing -> 2,
    not clipped), the float flags ``is_kid_friendly``, ``is_accessible`` and
    ``has_parking_flag`` (missing -> 0.0), and ``price_tier_num`` (rank of the
    price tier; NaN for tiers absent from the mapping).

    Args:
        venues_df: DataFrame with the raw venue columns.
        categories: Category vocabulary; defaults to the notebook's CATEGORIES.
        price_tiers: Price tier vocabulary; defaults to PRICE_TIERS.
        audiences: Audience vocabulary; defaults to AUDIENCES.
        price_tier_order: Mapping tier name -> rank; defaults to
            PRICE_TIER_ORDER.

    Returns:
        New DataFrame with the original columns plus the feature columns.
    """
    # Notebook-level constants are only consulted when no override is passed,
    # which lets the function run from a saved configuration as well.
    categories = CATEGORIES if categories is None else categories
    price_tiers = PRICE_TIERS if price_tiers is None else price_tiers
    audiences = AUDIENCES if audiences is None else audiences
    price_tier_order = PRICE_TIER_ORDER if price_tier_order is None else price_tier_order

    df = venues_df.copy()

    # One-hot encode categorical features
    one_hot_specs = (
        ('cat', 'category', categories),
        ('price', 'price_tier', price_tiers),
        ('aud', 'target_audience', audiences),
    )
    for prefix, source_column, vocabulary in one_hot_specs:
        for value in vocabulary:
            df[f'{prefix}_{value}'] = (df[source_column] == value).astype(float)

    # Numeric features (normalized); missing values get a neutral default first
    df['popularity_norm'] = df['popularity_score'].fillna(50) / 100
    df['rating_norm'] = df['rating'].fillna(3.5) / 5.0
    df['duration_norm'] = df['estimated_duration_hours'].fillna(2) / 8.0

    # Boolean features; a missing flag is treated as False
    df['is_kid_friendly'] = df['good_for_kids'].fillna(False).astype(float)
    df['is_accessible'] = df['wheelchair_accessible'].fillna(False).astype(float)
    df['has_parking_flag'] = df['has_parking'].fillna(False).astype(float)

    # Price tier numeric for filtering
    df['price_tier_num'] = df['price_tier'].map(price_tier_order)

    return df

# --- 3. LOAD & PROCESS DATA ---
print("Loading data...")
# Assuming 'venues' is already loaded in your notebook environment.
# If not, uncomment the line below:
# venues = pd.read_parquet('/Workspace/Users/.../venues_enriched.parquet')

print("Engineering features...")
# NOTE(review): if the earlier feature-engineering cell has run, `venues` already
# carries these columns and this call simply recomputes them on a copy.
venues_processed = create_venue_features(venues)

# Define feature columns (order defines the column order of the feature matrix)
FEATURE_COLUMNS = (
    [f'cat_{cat}' for cat in CATEGORIES] +
    [f'price_{tier}' for tier in PRICE_TIERS] +
    [f'aud_{aud}' for aud in AUDIENCES] +
    ['popularity_norm', 'rating_norm', 'duration_norm', 
     'is_kid_friendly', 'is_accessible', 'has_parking_flag']
)

# Create feature matrix
print("Creating feature matrix...")
feature_matrix = venues_processed[FEATURE_COLUMNS].values

# --- 4. PACKAGE THE MODEL ---
# Everything needed to rebuild the recommender is bundled into one dict.
print("Packaging model...")
model_data = {
    'venues_df': venues_processed,      # The DataFrame with engineered features
    'feature_matrix': feature_matrix,   # The pre-calculated numpy matrix
    'feature_columns': FEATURE_COLUMNS, # List of column names used
    'config': {
        'CATEGORIES': CATEGORIES,
        'PRICE_TIERS': PRICE_TIERS,
        'AUDIENCES': AUDIENCES,
        'PRICE_TIER_ORDER': PRICE_TIER_ORDER
    }
}

# --- 5. SAVE LOCALLY (Driver Node) ---
# The file name is relative, so it is written to the current working directory.
local_filename = "recommendation_engine.pkl"
print(f"Saving {local_filename} to driver...")

# SECURITY: unpickling can execute arbitrary code, so consumers should only load
# this artifact from a trusted source.
with open(local_filename, 'wb') as f:
    pickle.dump(model_data, f)

# --- 6. MOVE TO DBFS FILESTORE ---
# This makes the file accessible via a browser URL
dbfs_path = f"/dbfs/FileStore/{local_filename}"

# Create directory if it doesn't exist
os.makedirs("/dbfs/FileStore/", exist_ok=True)

# Copy the file from driver to DBFS (the local copy is left in place)
print(f"Copying to {dbfs_path}...")
shutil.copy(local_filename, dbfs_path)

# --- 7. GENERATE DOWNLOAD BUTTON ---
# Relative URL that the download link in the HTML below points at.
download_url = f"/files/{local_filename}"

html_button = f"""
<div style="text-align:center; margin-top:20px; padding: 20px; background-color: #f8f9fa; border-radius: 10px;">
    <h3 style="color: #2c3e50;">✅ Model Successfully Processed & Saved!</h3>
    <p>Your model includes {len(venues_processed)} venues and {feature_matrix.shape[1]} features.</p>
    
    <a href="{download_url}" target="_blank" download="{local_filename}">
        <button style="
            background-color: #2563eb; 
            border: none; 
            color: white; 
            padding: 15px 32px; 
            text-align: center; 
            text-decoration: none; 
            display: inline-block; 
            font-size: 16px; 
            font-weight: bold;
            margin: 15px 2px; 
            cursor: pointer; 
            border-radius: 8px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            transition: transform 0.2s;">
            ⬇️ Download recommendation_engine.pkl
        </button>
    </a>
    
    <p style="margin-top:10px; color:#666; font-size:0.9em;">
        (Move this file to your local project folder: <code>sub_agents/models/</code>)
    </p>
</div>
"""

# Render the confirmation panel and download link in the cell output.
display(HTML(html_button))

Loading data...
Engineering features...
Creating feature matrix...
Packaging model...
Saving recommendation_engine.pkl to driver...
Copying to /dbfs/FileStore/recommendation_engine.pkl...
