# Entertainment Cost Prediction Model

This notebook trains an ML model to predict entertainment costs based on:
- City profile features (venue mix, pricing, ratings)
- Trip parameters (number of people, days, travel style)

**Target:** Predict total entertainment cost for a trip

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully")

Libraries loaded successfully


## 1. Load Data

In [0]:
# Read the two per-city parquet inputs: venue/profile statistics and the
# per-travel-style entertainment cost table.
city_profiles, entertainment_costs = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'entierement data/city_profiles.parquet',
        'entierement data/entertainment_costs_by_city.parquet',
    )
)

# Print (rows, columns) of each input as a quick sanity check.
print(f"City profiles: {city_profiles.shape}")
print(f"Entertainment costs: {entertainment_costs.shape}")

City profiles: (634, 33)
Entertainment costs: (634, 6)


In [0]:
# Inner-join the profile and cost tables on the (city, state) key, so only
# cities present in both inputs are kept.
df = pd.merge(city_profiles, entertainment_costs, how='inner', on=['city', 'state'])
print(f"Merged data: {df.shape}")
df.head()

Merged data: (634, 37)


Unnamed: 0,city,state,venue_count,avg_price,price_std,price_min,price_max,avg_rating,avg_popularity,avg_duration,pct_attraction,pct_museum,pct_outdoor_activity,pct_performing_arts,pct_sports_recreation,pct_theme_park,pct_tour,pct_audience_adults,pct_audience_all,pct_audience_family,avg_price_museum,avg_price_theme_park,avg_price_outdoor_activity,avg_price_tour,avg_price_attraction,dominant_category,dominant_audience,pct_accessible,pct_kid_friendly,pct_has_deals,pct_has_passes,pct_has_parking,avg_activity_count,cost_budget,cost_medium,cost_expensive,cost_luxury
0,Abington,PA,1,50.0,0.0,50.0,50.0,4.5,65.2,2.5,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,0.0,0.0,museum,family,100.0,100.0,0.0,0.0,100.0,0.0,6.3,50.0,101.76,191.79
1,Accokeek,MD,1,0.0,0.0,0.0,0.0,0.0,50.0,3.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,outdoor_activity,all,0.0,0.0,0.0,0.0,0.0,2.0,6.3,49.76,101.76,191.79
2,Ajo,AZ,1,25.0,0.0,25.0,25.0,0.0,50.0,3.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,25.0,0.0,0.0,outdoor_activity,all,0.0,0.0,0.0,100.0,0.0,29.0,25.0,49.76,101.76,191.79
3,Alamogordo,NM,1,25.0,0.0,25.0,25.0,0.0,50.0,3.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,25.0,0.0,0.0,outdoor_activity,all,0.0,0.0,0.0,100.0,0.0,11.0,25.0,49.76,101.76,191.79
4,Albuquerque,NM,1,0.0,0.0,0.0,0.0,0.0,50.0,3.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,outdoor_activity,all,0.0,0.0,0.0,0.0,0.0,8.0,6.3,49.76,101.76,191.79


## 2. Generate Synthetic Training Data

Generate realistic trip scenarios with varying:
- Number of people (1-8)
- Number of days (1-7)
- Travel style (budget, medium, expensive, luxury)

In [0]:
# Travel style encoding: integer code fed to the model as `travel_style_encoded`
TRAVEL_STYLE_MAP = {
    'budget': 0,
    'medium': 1,
    'expensive': 2,
    'luxury': 3
}

# Cost column mapping: travel style -> column of the merged frame holding its base cost
COST_COLUMNS = {
    'budget': 'cost_budget',
    'medium': 'cost_medium',
    'expensive': 'cost_expensive',
    'luxury': 'cost_luxury'
}

def generate_training_data(df, samples_per_city=50, random_state=42):
    """Generate synthetic trip data for training.

    For every row of ``df`` (one city) this draws ``samples_per_city`` random
    trips: a party size in 1-8, a duration in 1-7 days and a travel style.
    The target is ``base_cost * num_people * num_days * noise``, where
    ``base_cost`` is the city's cost column for the drawn style and ``noise``
    is uniform in [0.85, 1.15).

    Randomness comes from a private ``np.random.RandomState(random_state)``,
    so the call is reproducible without reseeding NumPy's global generator.
    The draw sequence is the one the seeded global generator would produce.

    Args:
        df: Frame with the city feature columns, ``city``, ``state`` and the
            four cost columns named in ``COST_COLUMNS``.
        samples_per_city: Number of synthetic trips per row of ``df``.
        random_state: Seed for the private random generator.

    Returns:
        DataFrame with one row per sampled trip: the city features, the trip
        parameters (``num_people``, ``num_days``, ``travel_style_encoded``),
        the target ``entertainment_cost`` and the metadata columns ``city``,
        ``state`` and ``travel_style``. Empty when ``df`` has no rows.
    """
    rng = np.random.RandomState(random_state)

    # City-level columns copied unchanged from the source row.
    city_features = (
        'venue_count', 'avg_price', 'price_std', 'avg_rating', 'avg_popularity',
        'pct_museum', 'pct_theme_park', 'pct_outdoor_activity', 'pct_tour',
        'pct_attraction', 'pct_audience_family', 'pct_audience_adults',
        'pct_audience_all', 'pct_accessible', 'pct_kid_friendly',
    )
    travel_styles = list(COST_COLUMNS)  # budget, medium, expensive, luxury

    training_records = []

    for _, row in df.iterrows():
        for _ in range(samples_per_city):
            # Random trip parameters (upper bounds are exclusive)
            num_people = rng.randint(1, 9)  # 1-8 people
            num_days = rng.randint(1, 8)    # 1-7 days
            travel_style = rng.choice(travel_styles)

            # Base cost for the drawn travel style
            base_cost = row[COST_COLUMNS[travel_style]]

            # Total cost with multiplicative noise of about +-15%
            noise = rng.uniform(0.85, 1.15)
            entertainment_cost = base_cost * num_people * num_days * noise

            record = {name: row[name] for name in city_features}
            record.update({
                # Trip parameters
                'num_people': num_people,
                'num_days': num_days,
                'travel_style_encoded': TRAVEL_STYLE_MAP[travel_style],
                # Target
                'entertainment_cost': entertainment_cost,
                # Metadata (for analysis)
                'city': row['city'],
                'state': row['state'],
                'travel_style': travel_style,
            })
            training_records.append(record)

    return pd.DataFrame(training_records)

# Synthesize 50 random trips for every merged city row, then preview the result.
training_df = generate_training_data(df=df, samples_per_city=50, random_state=42)
print(f"Training data generated: {training_df.shape}")
training_df.head()

Training data generated: (31700, 22)


Unnamed: 0,venue_count,avg_price,price_std,avg_rating,avg_popularity,pct_museum,pct_theme_park,pct_outdoor_activity,pct_tour,pct_attraction,pct_audience_family,pct_audience_adults,pct_audience_all,pct_accessible,pct_kid_friendly,num_people,num_days,travel_style_encoded,entertainment_cost,city,state,travel_style
0,1,50.0,0.0,4.5,65.2,100.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,100.0,7,4,0,159.647369,Abington,PA,budget
1,1,50.0,0.0,4.5,65.2,100.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,100.0,8,5,0,225.995009,Abington,PA,budget
2,1,50.0,0.0,4.5,65.2,100.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,100.0,3,7,2,2110.834954,Abington,PA,expensive
3,1,50.0,0.0,4.5,65.2,100.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,100.0,5,4,3,4075.237438,Abington,PA,luxury
4,1,50.0,0.0,4.5,65.2,100.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,100.0,100.0,6,5,1,1599.899448,Abington,PA,medium


In [0]:
# Summary statistics of the synthetic target (count, mean, spread, quartiles).
print("Entertainment cost statistics:")
print(training_df.entertainment_cost.describe())

Entertainment cost statistics:
count    31700.000000
mean      1566.128482
std       1955.995040
min          5.362121
25%        217.833782
50%        815.392820
75%       2128.346124
max      12292.613071
Name: entertainment_cost, dtype: float64


## 3. Feature Engineering

In [0]:
# Model inputs: 15 city-level descriptors followed by the 3 trip parameters.
FEATURE_COLUMNS = [
    # City features
    'venue_count',
    'avg_price',
    'price_std',
    'avg_rating',
    'avg_popularity',
    'pct_museum',
    'pct_theme_park',
    'pct_outdoor_activity',
    'pct_tour',
    'pct_attraction',
    'pct_audience_family',
    'pct_audience_adults',
    'pct_audience_all',
    'pct_accessible',
    'pct_kid_friendly',
    # Trip parameters
    'num_people',
    'num_days',
    'travel_style_encoded',
]

TARGET_COLUMN = 'entertainment_cost'

# Feature matrix with missing values imputed as 0, plus an independent copy of the target.
X = training_df.loc[:, FEATURE_COLUMNS].fillna(0)
y = training_df.loc[:, TARGET_COLUMN].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (31700, 18)
Target shape: (31700,)


## 4. Train/Test Split

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 25360 samples
Test set: 6340 samples


## 5. Train Models

In [0]:
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted regressor on held-out data and print a short report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out feature matrix.
        y_test: Held-out target values.
        model_name: Label used in the printed report and the returned dict.

    Returns:
        Dict with keys ``'model'``, ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    predicted = model.predict(X_test)

    r2 = r2_score(y_test, predicted)
    mae = mean_absolute_error(y_test, predicted)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predicted))

    report_lines = (
        f"\n{model_name}:",
        f"  RMSE: ${rmse:.2f}",
        f"  MAE:  ${mae:.2f}",
        f"  R2:   {r2:.4f}",
    )
    print("\n".join(report_lines))

    return dict(model=model_name, rmse=rmse, mae=mae, r2=r2)

In [0]:
# Model 1: ordinary least squares, the baseline the tree ensembles are compared against
print("Training Linear Regression as baseline")
lr_model = LinearRegression().fit(X_train, y_train)
lr_results = evaluate_model(lr_model, X_test, y_test, model_name="Linear Regression")

Training Linear Regression as baseline

Linear Regression:
  RMSE: $1083.40
  MAE:  $772.06
  R2:   0.7088


In [0]:
# Model 2: random forest (100 trees, depth capped at 15, all CPU cores)
print("Training Random Forest")
rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=15, min_samples_split=5,
    random_state=42, n_jobs=-1,
).fit(X_train, y_train)
rf_results = evaluate_model(rf_model, X_test, y_test, model_name="Random Forest")

Training Random Forest

Random Forest:
  RMSE: $241.94
  MAE:  $128.88
  R2:   0.9855


In [0]:
# Model 3: gradient-boosted trees (100 stages, depth 6, learning rate 0.1)
print("Training Gradient Boosting")
gb_model = GradientBoostingRegressor(
    n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42,
).fit(X_train, y_train)
gb_results = evaluate_model(gb_model, X_test, y_test, model_name="Gradient Boosting")

Training Gradient Boosting

Gradient Boosting:
  RMSE: $227.20
  MAE:  $124.62
  R2:   0.9872


In [0]:
# Parallel lists: index i refers to the same model in all three.
all_results = [lr_results, rf_results, gb_results]
all_models = [lr_model, rf_model, gb_model]
model_names = ['linear_regression', 'random_forest', 'gradient_boosting']

# Tabulate the held-out metrics side by side.
results_df = pd.DataFrame(all_results)
print("MODEL COMPARISON")
print(results_df.to_string(index=False))

# Index of the highest test R2 (the first one wins on a tie).
best_idx = max(enumerate(all_results), key=lambda pair: pair[1]['r2'])[0]
best_model = all_models[best_idx]
best_model_name = model_names[best_idx]
best_r2 = all_results[best_idx]['r2']

print(f"BEST MODEL: {all_results[best_idx]['model']} (R2: {best_r2:.4f})")


MODEL COMPARISON
            model        rmse        mae       r2
Linear Regression 1083.395481 772.057391 0.708777
    Random Forest  241.942367 128.876774 0.985476
Gradient Boosting  227.195887 124.618414 0.987193
BEST MODEL: Gradient Boosting (R2: 0.9872)


## 6. Feature Importance Analysis

In [0]:
# Rank the inputs by the selected model's importances, when it exposes them.
if not hasattr(best_model, 'feature_importances_'):
    print("Feature importance not available for Linear Regression")
else:
    feature_importance = pd.DataFrame({
        'feature': FEATURE_COLUMNS,
        'importance': best_model.feature_importances_
    })
    # Most influential feature first.
    feature_importance = feature_importance.sort_values(by='importance', ascending=False)

    print(f"Feature Importance ({all_results[best_idx]['model']}):")
    print(feature_importance.to_string(index=False))

Feature Importance (Gradient Boosting):
             feature  importance
travel_style_encoded    0.395052
            num_days    0.311619
          num_people    0.290891
           avg_price    0.000956
           price_std    0.000260
      avg_popularity    0.000239
         venue_count    0.000178
pct_outdoor_activity    0.000164
      pct_theme_park    0.000106
          avg_rating    0.000093
 pct_audience_family    0.000087
      pct_accessible    0.000085
 pct_audience_adults    0.000061
          pct_museum    0.000056
            pct_tour    0.000055
    pct_audience_all    0.000048
      pct_attraction    0.000037
    pct_kid_friendly    0.000012


## 7. Test Predictions on Sample Inputs

In [0]:
def predict_entertainment_cost(model, city_row, num_people, num_days, travel_style):
    """Make a prediction for a specific trip scenario.

    The single-row input is assembled in ``FEATURE_COLUMNS`` order, so it always
    matches the layout the model was fitted on. Missing city features are
    imputed with 0, which mirrors the ``fillna(0)`` applied to the training
    matrix.

    Args:
        model: Fitted regressor exposing ``predict``.
        city_row: Mapping or Series holding the city-level feature columns.
        num_people: Number of travellers.
        num_days: Trip length in days.
        travel_style: Key of ``TRAVEL_STYLE_MAP``.

    Returns:
        The model's predicted total entertainment cost for the trip.

    Raises:
        KeyError: If ``travel_style`` is not in ``TRAVEL_STYLE_MAP`` or
            ``city_row`` lacks a required city feature.
    """
    trip_features = {
        'num_people': num_people,
        'num_days': num_days,
        'travel_style_encoded': TRAVEL_STYLE_MAP[travel_style],
    }

    # Trip parameters come from the arguments; everything else from the city row.
    features = {
        column: trip_features[column] if column in trip_features else city_row[column]
        for column in FEATURE_COLUMNS
    }

    # Same missing-value policy as training, so NaN never reaches the model.
    X_pred = pd.DataFrame([features], columns=FEATURE_COLUMNS).fillna(0)
    return model.predict(X_pred)[0]

# Test predictions for major cities
test_cities = ['New York', 'Los Angeles', 'Chicago', 'Miami', 'Las Vegas']

print(f"Sample Predictions using {all_results[best_idx]['model']} (4 people, 3 days, medium style):")
for city_name in test_cities:
    city_data = df[df['city'] == city_name]
    if city_data.empty:
        # Say so explicitly instead of silently dropping the city from the report.
        print(f"{city_name}: not found in dataset")
        continue

    # The lookup is by city name only, so the first matching row is used.
    city_row = city_data.iloc[0]
    predicted_cost = predict_entertainment_cost(
        best_model, city_row,
        num_people=4, num_days=3, travel_style='medium'
    )
    print(f"{city_name}: ${predicted_cost:.2f}")

Sample Predictions using Gradient Boosting (4 people, 3 days, medium style):
New York: $384.97
Los Angeles: $796.17
Chicago: $578.09
Las Vegas: $576.78


In [0]:
# Hold the city and trip size fixed and vary only the travel style.
city_row = df.loc[df['city'] == 'New York'].iloc[0]

print(f"\nNew York - 4 people, 3 days, different styles ({all_results[best_idx]['model']}):")
for style in ('budget', 'medium', 'expensive', 'luxury'):
    predicted_cost = predict_entertainment_cost(
        best_model, city_row, num_people=4, num_days=3, travel_style=style,
    )
    print(f"{style.capitalize():12}: ${predicted_cost:.2f}")


New York - 4 people, 3 days, different styles (Gradient Boosting):
Budget      : $15.80
Medium      : $384.97
Expensive   : $1221.03
Luxury      : $2292.31


## 8. Save Model for Deployment

In [0]:
 # Select best model based on R2 score
import json

# Pick the model with the highest test R2 (same rule as the comparison step).
all_results = [lr_results, rf_results, gb_results]
all_models = [lr_model, rf_model, gb_model]
model_names = ['linear_regression', 'random_forest', 'gradient_boosting']

best_idx = max(enumerate(all_results), key=lambda pair: pair[1]['r2'])[0]
best_model = all_models[best_idx]
best_model_name = model_names[best_idx]
best_r2 = all_results[best_idx]['r2']

print(f"Best model: {all_results[best_idx]['model']} (R2: {best_r2:.4f})")

# Persist the fitted estimator with joblib.
model_path = '/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/entertainment_cost_model.joblib'
joblib.dump(best_model, model_path)
print(f"Model saved to: {model_path}")
print(f"Model type: {all_results[best_idx]['model']}")

# Side-car JSON: feature order and style encoding needed to call the model.
feature_config = dict(
    feature_columns=FEATURE_COLUMNS,
    travel_style_map=TRAVEL_STYLE_MAP,
    model_type=best_model_name,
)

config_path = '/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/entertainment_cost_model_config.json'
with open(config_path, 'w') as f:
    f.write(json.dumps(feature_config, indent=2))
print(f"Config saved to: {config_path}")

Best model: Gradient Boosting (R2: 0.9872)
Model saved to: /Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/entertainment_cost_model.joblib
Model type: Gradient Boosting
Config saved to: /Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/entertainment_cost_model_config.json


In [0]:
# Verify saved model loads correctly
loaded_model = joblib.load(model_path)

# Predict the same scenario with the reloaded and the in-memory model.
city_row = df[df['city'] == 'New York'].iloc[0]
test_pred = predict_entertainment_cost(
    loaded_model, city_row,
    num_people=4, num_days=3, travel_style='medium'
)
reference_pred = predict_entertainment_cost(
    best_model, city_row,
    num_people=4, num_days=3, travel_style='medium'
)
print(f"Verification - New York prediction: ${test_pred:.2f}")

# Only claim success when the round-tripped model reproduces the original.
if not np.isclose(test_pred, reference_pred):
    raise RuntimeError(
        f"Reloaded model predicts {test_pred!r} but the in-memory model "
        f"predicts {reference_pred!r}"
    )
print("\nModel saved and verified successfully")

Verification - New York prediction: $384.97

Model saved and verified successfully


In [0]:
import pickle
import shutil
import os
import json

# --- 1. SAVE AS PICKLE (.pkl) ---
# Define local paths in the workspace
pkl_model_path = '/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/entertainment_cost_model.pkl'
config_path = '/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement notebooks/Models/entertainment_cost_model_config.json'

print(f"Saving best model ({best_model_name}) to .pkl...")

# Save Model using Pickle
with open(pkl_model_path, 'wb') as f:
    pickle.dump(best_model, f)

# Save Config (JSON)
# NOTE: this rewrites the config file at the same path as the joblib save step,
# with the feature list stored under 'features' rather than 'feature_columns'.
feature_config = {
    'features': FEATURE_COLUMNS,  # Renamed key to match your Agent expectations
    'travel_style_map': TRAVEL_STYLE_MAP,
    'model_type': best_model_name
}
with open(config_path, 'w') as f:
    json.dump(feature_config, f, indent=2)

print("‚úÖ Model and Config saved locally.")

# --- 2. PREPARE FOR DOWNLOAD ---
# Copy to FileStore (Publicly accessible folder in Databricks)
dest_model = '/dbfs/FileStore/entertainment_cost_model.pkl'
dest_config = '/dbfs/FileStore/entertainment_cost_model_config.json'

# Best effort: a failed copy is reported rather than raised, and remembered so
# that no download links are offered for files that were not published.
try:
    shutil.copy(pkl_model_path, dest_model)
    shutil.copy(config_path, dest_config)
except OSError as e:
    files_published = False
    print(f"‚ùå Error moving files: {e}")
else:
    files_published = True
    print("‚úÖ Files moved to FileStore.")

# --- 3. GENERATE DOWNLOAD BUTTONS ---
download_link_model = "/files/entertainment_cost_model.pkl"
download_link_config = "/files/entertainment_cost_model_config.json"

if files_published:
    displayHTML(f'''
<div style="font-family: sans-serif; padding: 15px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9; display: inline-block;">
    <h3 style="margin-top:0;">‚¨áÔ∏è Download Agent Artifacts</h3>
    <p>Click below to download the files to your computer:</p>
    
    <a href="{download_link_model}" download="entertainment_cost_model.pkl">
        <button style="background-color: #28a745; color: white; padding: 12px 24px; border: none; border-radius: 5px; cursor: pointer; font-size: 14px; margin-right: 10px; font-weight: bold;">
            üíæ Download Model (.pkl)
        </button>
    </a>
    
    <a href="{download_link_config}" download="entertainment_cost_model_config.json">
        <button style="background-color: #17a2b8; color: white; padding: 12px 24px; border: none; border-radius: 5px; cursor: pointer; font-size: 14px; font-weight: bold;">
            ‚öôÔ∏è Download Config (.json)
        </button>
    </a>
</div>
''')

Saving best model (gradient_boosting) to .pkl...
‚úÖ Model and Config saved locally.
‚úÖ Files moved to FileStore.


In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import shutil
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# --- 1. CONFIGURATION & DATA LOADING ---
print("--- 1. Loading Data ---")
# Both parquet inputs live under the same workspace folder.
base_path = '/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement data'
city_profiles = pd.read_parquet(f'{base_path}/city_profiles.parquet')
entertainment_costs = pd.read_parquet(f'{base_path}/entertainment_costs_by_city.parquet')

# Inner join on the (city, state) key keeps only cities present in both tables.
df = pd.merge(city_profiles, entertainment_costs, how='inner', on=['city', 'state'])
print(f"Merged Data Shape: {df.shape}")

# Maps
# Travel style -> integer code used as the `travel_style_encoded` feature
TRAVEL_STYLE_MAP = {'budget': 0, 'medium': 1, 'expensive': 2, 'luxury': 3}
# Travel style -> column of the merged frame holding its base cost
COST_COLUMNS = {'budget': 'cost_budget', 'medium': 'cost_medium', 'expensive': 'cost_expensive', 'luxury': 'cost_luxury'}

# --- 2. GENERATE SYNTHETIC TRAINING DATA ---
print("--- 2. Generating Synthetic Training Data ---")
def generate_training_data(df, samples_per_city=50, random_state=42):
    """Generate synthetic trip records for training.

    For every row of ``df`` (one city) this draws ``samples_per_city`` random
    trips: a party size in 1-8, a duration in 1-7 days and a travel style.
    The target is ``base_cost * num_people * num_days * noise``, where
    ``base_cost`` is the city's cost column for the drawn style and ``noise``
    is uniform in [0.85, 1.15).

    Randomness comes from a private ``np.random.RandomState(random_state)``,
    so the call is reproducible without reseeding NumPy's global generator.
    The draw sequence is the one the seeded global generator would produce.

    Args:
        df: Frame with the city feature columns and the four cost columns
            named in ``COST_COLUMNS``.
        samples_per_city: Number of synthetic trips per row of ``df``.
        random_state: Seed for the private random generator.

    Returns:
        DataFrame with one row per sampled trip: the city features, the trip
        parameters and the target ``entertainment_cost``. Empty when ``df``
        has no rows.
    """
    rng = np.random.RandomState(random_state)

    # City-level columns copied unchanged from the source row.
    city_features = (
        'venue_count', 'avg_price', 'price_std', 'avg_rating', 'avg_popularity',
        'pct_museum', 'pct_theme_park', 'pct_outdoor_activity', 'pct_tour',
        'pct_attraction', 'pct_audience_family', 'pct_audience_adults',
        'pct_audience_all', 'pct_accessible', 'pct_kid_friendly',
    )
    travel_styles = list(COST_COLUMNS)  # budget, medium, expensive, luxury

    training_records = []

    for _, row in df.iterrows():
        for _ in range(samples_per_city):
            # Upper bounds are exclusive: 1-8 people, 1-7 days.
            num_people = rng.randint(1, 9)
            num_days = rng.randint(1, 8)
            travel_style = rng.choice(travel_styles)

            base_cost = row[COST_COLUMNS[travel_style]]
            noise = rng.uniform(0.85, 1.15)
            entertainment_cost = base_cost * num_people * num_days * noise

            # City stats
            record = {name: row[name] for name in city_features}
            record.update({
                # Trip Params
                'num_people': num_people, 'num_days': num_days,
                'travel_style_encoded': TRAVEL_STYLE_MAP[travel_style],
                # Target
                'entertainment_cost': entertainment_cost,
            })
            training_records.append(record)
    return pd.DataFrame(training_records)

# Build the synthetic trips from the freshly merged frame.
training_df = generate_training_data(df=df, samples_per_city=50)

# Model inputs: 15 city descriptors followed by the 3 trip parameters.
FEATURE_COLUMNS = [
    'venue_count',
    'avg_price',
    'price_std',
    'avg_rating',
    'avg_popularity',
    'pct_museum',
    'pct_theme_park',
    'pct_outdoor_activity',
    'pct_tour',
    'pct_attraction',
    'pct_audience_family',
    'pct_audience_adults',
    'pct_audience_all',
    'pct_accessible',
    'pct_kid_friendly',
    'num_people',
    'num_days',
    'travel_style_encoded',
]
# Missing feature values are imputed as 0.
X = training_df.loc[:, FEATURE_COLUMNS].fillna(0)
y = training_df['entertainment_cost']

# --- 3. TRAIN XGBOOST MODEL ---
print("--- 3. Training XGBoost Model ---")
# 80/20 row split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# XGBRegressor, chosen by the author as more portable than sklearn GradientBoosting.
model = xgb.XGBRegressor(
    n_estimators=100, max_depth=6, learning_rate=0.1,
    random_state=42, n_jobs=-1,
)

model.fit(X_train, y_train)

# Held-out R2 of the fitted model.
preds = model.predict(X_test)
r2 = r2_score(y_test, preds)
print(f"‚úÖ Model Trained. R2 Score: {r2:.4f}")

# --- 4. PACKAGE ARTIFACTS ---
print("--- 4. Saving Artifacts ---")

# We include the model AND the config in one pickle file for the agent
artifacts = {
    "model": model,
    "features": FEATURE_COLUMNS,
    "travel_style_map": TRAVEL_STYLE_MAP,
    "model_type": "xgboost",
    "metadata": {"r2_score": r2}
}

pkl_filename = "entertainment_cost_model.pkl"
local_path = f"/Workspace/Users/muradrahimli@campus.technion.ac.il/entierement data/{pkl_filename}"

with open(local_path, 'wb') as f:
    pickle.dump(artifacts, f)

# --- 5. GENERATE DOWNLOAD ---
# NOTE: the FileStore target has the same name as the bare-model pickle copied
# by the earlier export step, so it is replaced by this dict bundle. Consumers
# must read artifacts["model"] rather than treat the unpickled object as a model.
filestore_path = f"/dbfs/FileStore/{pkl_filename}"

# Best effort: a failed copy is reported rather than raised, and remembered so
# that no download link is offered for a file that was not published.
try:
    shutil.copy(local_path, filestore_path)
except OSError as e:
    copied_to_filestore = False
    print(f"‚ùå Error copying: {e}")
else:
    copied_to_filestore = True
    print("‚úÖ Copied to FileStore.")

download_url = f"/files/{pkl_filename}"

if copied_to_filestore:
    displayHTML(f'''
<div style="text-align: center; margin-top: 20px;">
  <a href="{download_url}" download="{pkl_filename}">
    <button style="background-color: #28a745; color: white; padding: 15px 32px; 
                   border: none; border-radius: 8px; font-size: 16px; cursor: pointer; font-weight: bold;">
      üíæ Download entertainment_cost_model.pkl
    </button>
  </a>
</div>
''')

--- 1. Loading Data ---
Merged Data Shape: (634, 37)
--- 2. Generating Synthetic Training Data ---
--- 3. Training XGBoost Model ---


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

üèÉ View run painted-squid-9 at: https://adb-983293358114278.18.azuredatabricks.net/ml/experiments/1072358835251820/runs/cc2c26af32d241f2b54d6b81ac17fbb3
üß™ View experiment at: https://adb-983293358114278.18.azuredatabricks.net/ml/experiments/1072358835251820
‚úÖ Model Trained. R2 Score: 0.9871
--- 4. Saving Artifacts ---
‚úÖ Copied to FileStore.


## 9. Widgets for predicting
### Note: you must run the last cell to remove the widgets, or they will remain on your screen

In [0]:
# Probe for `dbutils` on its own, so that only its absence is reported as
# "not on Databricks". Any other NameError (for example `df` not being defined
# because earlier cells were not run) surfaces as a real error.
try:
    dbutils
except NameError:
    print("Widgets only work in Databricks")
else:
    dbutils.widgets.removeAll()
    # Get cities from data
    cities_list = sorted(df['city'].dropna().unique().tolist())
    dbutils.widgets.combobox("city", "New York", cities_list, "1. City")
    dbutils.widgets.text("num_people", "2", "2. Number of People")
    dbutils.widgets.text("num_days", "3", "3. Number of Days")
    dbutils.widgets.dropdown("travel_style", "medium", ["budget","medium","expensive","luxury"], "4. Travel Style")

    print(f"Widgets created with {len(cities_list)} cities")
    print("widgets ready")
    print("1. Select your preferences in the widgets above")
    print("2. Run the NEXT CELL to get your prediction")
    # Ends the notebook run here; the prediction cell is meant to be run by hand.
    dbutils.notebook.exit("Interactive mode - run prediction cell manually")

After filling in the widgets (a blank field uses its default), run the next cell.

In [0]:
def _widget_int(name, default):
    """Read a positive integer from a text widget; a blank value yields `default`.

    Raises:
        ValueError: If the text is not an integer or is smaller than 1.
    """
    raw = dbutils.widgets.get(name).strip()
    if not raw:
        return default
    value = int(raw)
    if value < 1:
        raise ValueError(f"{name} must be a positive integer, got {value}")
    return value


# Blank fields fall back to the defaults the widgets were created with.
user_city = dbutils.widgets.get("city").strip() or "New York"
user_people = _widget_int("num_people", 2)
user_days = _widget_int("num_days", 3)
user_style = dbutils.widgets.get("travel_style")

# The training data only covers 1-8 people and 1-7 days, so inputs outside
# those ranges are extrapolations.
city_data = df[df['city'] == user_city]
if len(city_data) > 0:
    prediction = predict_entertainment_cost(
        best_model, city_data.iloc[0],
        user_people, user_days, user_style
    )
    print(f"Estimated entertainment cost for {user_city}:")
    print(f"  {user_people} people, {user_days} days, {user_style} style")
    print(f"  Total: ${prediction:.2f}")
else:
    print(f"City '{user_city}' not found")

### To remove the widgets, uncomment and run this:

In [0]:
#dbutils.widgets.removeAll()