In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Add
from tensorflow.keras.regularizers import l2
from geopy.distance import geodesic

In [2]:
# Read the three source tables (places, ratings, users) from the data directory
place, ratings, user = map(
    pd.read_csv,
    ('data/tourism_with_id.csv', 'data/tourism_rating.csv', 'data/user.csv'),
)

In [3]:
# Print the per-column count of missing values for each table, in load order
for frame in (place, ratings, user):
    print(frame.isnull().sum())

Place_Id          0
Place_Name        0
Description       0
Category          0
City              0
Price             0
Rating            0
Time_Minutes    232
Coordinate        0
Lat               0
Long              0
Unnamed: 11     437
Unnamed: 12       0
dtype: int64
User_Id          0
Place_Id         0
Place_Ratings    0
dtype: int64
User_Id     0
Location    0
Age         0
dtype: int64


In [4]:
# Drop the columns that are not used.
# errors='ignore' keeps this cell re-runnable: once the columns are gone,
# running it again is a no-op instead of raising KeyError.
place = place.drop(columns=['Unnamed: 11', 'Unnamed: 12'], errors='ignore')

In [5]:
# Calculate the median 'Time_Minutes' for each city
city_median = place.groupby('City')['Time_Minutes'].median()

# Fill each missing 'Time_Minutes' with the median of the place's city.
# Vectorised: map every row's city to its median, then use that value only
# where the original entry is missing (no row-wise Python lambda needed).
place['Time_Minutes'] = place['Time_Minutes'].fillna(place['City'].map(city_median))

In [6]:
# Print how many fully duplicated rows each table contains, in load order
for table in (place, ratings, user):
    print(table.duplicated().sum())

0
79
0


In [7]:
# Remove the duplicated rating rows
ratings = ratings.drop_duplicates()

In [8]:
# Attach the descriptive place columns to every rating, then collapse to one
# row per place: the average of its ratings plus its (constant) attributes.
merge_data = (
    ratings
    .merge(
        place[['Place_Id', 'Place_Name', 'Category', 'City', 'Price', 'Time_Minutes']],
        on='Place_Id',
        how='left',
    )
    .groupby('Place_Id')
    .agg(**{
        'Mean_Rating': ('Place_Ratings', 'mean'),
        'Place_Name': ('Place_Name', 'first'),
        'Category': ('Category', 'first'),
        'City': ('City', 'first'),
        'Price': ('Price', 'first'),
        'Time_Minutes': ('Time_Minutes', 'first'),
    })
    .reset_index()
)

# Display the table ordered by Place_Id (the result is shown, not stored)
merge_data.sort_values('Place_Id')

Unnamed: 0,Place_Id,Mean_Rating,Place_Name,Category,City,Price,Time_Minutes
0,1,3.722222,Monumen Nasional,Budaya,Jakarta,20000,15.0
1,2,2.840000,Kota Tua,Budaya,Jakarta,0,90.0
2,3,2.526316,Dunia Fantasi,Taman Hiburan,Jakarta,270000,360.0
3,4,2.857143,Taman Mini Indonesia Indah (TMII),Taman Hiburan,Jakarta,10000,90.0
4,5,3.520000,Atlantis Water Adventure,Taman Hiburan,Jakarta,94000,60.0
...,...,...,...,...,...,...,...
432,433,3.304348,Museum Mpu Tantular,Budaya,Surabaya,2000,45.0
433,434,2.400000,Taman Bungkul,Taman Hiburan,Surabaya,0,45.0
434,435,3.000000,Taman Air Mancur Menari Kenjeran,Taman Hiburan,Surabaya,0,45.0
435,436,3.090909,Taman Flora Bratang Surabaya,Taman Hiburan,Surabaya,0,45.0


In [9]:
# Distinct values found in the 'Category' column
pd.unique(merge_data['Category'])

array(['Budaya', 'Taman Hiburan', 'Cagar Alam', 'Bahari',
       'Pusat Perbelanjaan', 'Tempat Ibadah'], dtype=object)

In [10]:
# Distinct values found in the 'City' column
pd.unique(merge_data['City'])

array(['Jakarta', 'Yogyakarta', 'Bandung', 'Semarang', 'Surabaya'],
      dtype=object)

Itinerary

In [24]:
def _build_cf_model(num_users, num_places, embedding_size=50):
    """
    Build and compile the collaborative-filtering network.

    The prediction is the dot product of a user embedding and a place embedding
    plus a learned per-user bias and per-place bias.

    Args:
        num_users (int): Number of distinct users (size of the user vocabulary).
        num_places (int): Number of distinct places (size of the place vocabulary).
        embedding_size (int): Dimension of the user and place embeddings.

    Returns:
        Model: Keras model compiled with Adam and mean-squared-error loss, taking
        ``[user_index, place_index]`` as inputs.
    """
    # User branch: latent vector + scalar bias
    user_input = Input(shape=(1,))
    user_embedding = Embedding(num_users, embedding_size, embeddings_regularizer=l2(1e-6))(user_input)
    user_vec = Flatten()(user_embedding)
    user_bias = Flatten()(Embedding(num_users, 1)(user_input))

    # Place branch: latent vector + scalar bias
    place_input = Input(shape=(1,))
    place_embedding = Embedding(num_places, embedding_size, embeddings_regularizer=l2(1e-6))(place_input)
    place_vec = Flatten()(place_embedding)
    place_bias = Flatten()(Embedding(num_places, 1)(place_input))

    # Dot product of user and place embeddings, plus both bias terms
    dot_product = Dot(axes=1)([user_vec, place_vec])
    prediction = Add()([dot_product, user_bias, place_bias])

    model = Model([user_input, place_input], prediction)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


def recommend_places_with_time_and_budget_unique(
    city=None, categories=None, place_name=None, days=1, max_hours_per_day=8, start_lat=None, start_long=None, ratings=None, places=None, budget=None
):
    """
    Recommend places across multiple days based on city, multiple categories, and budget,
    ensuring unique recommendations for each day.

    Candidates are visited greedily in order of highest 'Rating', then shortest
    distance from the day's reference point, as long as the daily time limit and
    the trip budget allow. A collaborative-filtering (CF) model is then trained on
    ``ratings`` to attach a predicted rating to every recommended place.

    Neither ``ratings`` nor ``places`` is modified; all work happens on copies.

    Args:
        city (str, optional): City to filter places.
        categories (list or str, optional): Category or list of categories to filter places.
        place_name (str, optional): Reference place for recommendations (used only as
            the starting point for distance calculations).
        days (int): Number of days for the trip.
        max_hours_per_day (int): Maximum hours available per day (default is 8 hours).
        start_lat (float, optional): Starting latitude for the trip.
        start_long (float, optional): Starting longitude for the trip.
        ratings (DataFrame): User ratings with columns 'User_Id', 'Place_Id', 'Place_Ratings'.
        places (DataFrame): Places with columns 'Place_Id', 'Place_Name', 'Category', 'City',
            'Price', 'Rating', 'Time_Minutes', 'Lat', 'Long'.
        budget (float, optional): Maximum budget available for the whole trip
            (cumulative over all days). ``None`` means no budget limit.

    Returns:
        DataFrame: Recommended places across days, with 'Distance', 'Day' and
            'CF_Predicted_Rating' columns added. Empty if nothing fits.
        dict: Money spent per day, keyed by day number.
        dict: Total hours spent per day (visit + travel), keyed by day number.
        float: Mean Absolute Error between the CF-predicted mean rating and the
            observed mean rating of each recommended place, on the original rating
            scale. NaN when no recommended place has ratings. Note that this is
            measured on the data the model was trained on (no hold-out split).

    Raises:
        ValueError: If none of 'city', 'categories', 'place_name' is given, if
            'places' or 'ratings' is missing, or if 'place_name' is not in 'places'.
    """
    if not any([city, categories, place_name]):
        raise ValueError("At least one of 'city', 'categories', or 'place_name' must be provided.")
    if places is None or ratings is None:
        raise ValueError("Both 'places' and 'ratings' DataFrames must be provided.")

    # Accept a single category given as a plain string
    if isinstance(categories, str):
        categories = [categories]

    # Step 1: Filter by city
    filtered_data = places.copy()
    if city:
        filtered_data = filtered_data[filtered_data['City'] == city]

    # Step 2: Filter by multiple categories
    if categories:
        filtered_data = filtered_data[filtered_data['Category'].isin(categories)]

    # Step 3: Use place_name or starting coordinates as reference if provided
    if place_name:
        if place_name not in places['Place_Name'].values:
            raise ValueError(f"'{place_name}' not found in the dataset.")
        ref_place = places[places['Place_Name'] == place_name].iloc[0]
        ref_coords = (ref_place['Lat'], ref_place['Long'])
    elif start_lat is not None and start_long is not None:
        ref_coords = (start_lat, start_long)
    else:
        # Default to central coordinates of filtered data
        ref_coords = (filtered_data['Lat'].mean(), filtered_data['Long'].mean())

    # Step 4: Adjust recommendations per day based on time and budget
    average_speed_kmh = 40  # assumed average travel speed
    max_minutes_per_day = max_hours_per_day * 60
    all_recommendations = []
    visited_places = set()
    daily_prices = {}
    daily_hours = {}
    trip_money_spent = 0  # cumulative spend: the budget applies to the whole trip

    for day in range(1, days + 1):
        day_recs = []
        time_spent = 0
        money_spent = 0

        # Exclude places already visited and create a copy to avoid SettingWithCopyWarning
        day_data = filtered_data[~filtered_data['Place_Id'].isin(visited_places)].copy()

        # Subsequent days start from the first location of the previous day;
        # if the previous day ended up empty, keep the current reference point.
        if day > 1 and all_recommendations and not all_recommendations[-1].empty:
            previous_day_first_location = all_recommendations[-1].iloc[0]
            ref_coords = (previous_day_first_location['Lat'], previous_day_first_location['Long'])

        # Distances from the current reference point (also safe when no candidates remain)
        day_data['Distance'] = [
            geodesic((lat, lon), ref_coords).kilometers
            for lat, lon in zip(day_data['Lat'], day_data['Long'])
        ]
        day_data = day_data.sort_values(by=['Rating', 'Distance'], ascending=[False, True])

        for _, row in day_data.iterrows():
            # Round-trip travel time in minutes from the reference point
            travel_time = (row['Distance'] / average_speed_kmh) * 60 * 2
            total_time = travel_time + row['Time_Minutes']
            total_cost = row['Price']

            fits_time = time_spent + total_time <= max_minutes_per_day
            fits_budget = budget is None or trip_money_spent + total_cost <= budget
            if fits_time and fits_budget:
                chosen = row.copy()
                chosen['Day'] = day
                day_recs.append(chosen)
                time_spent += total_time
                money_spent += total_cost
                trip_money_spent += total_cost
                visited_places.add(row['Place_Id'])

        # Store daily recommendations
        daily_prices[day] = money_spent
        daily_hours[day] = time_spent / 60
        all_recommendations.append(pd.DataFrame(day_recs))

    # Combine recommendations across days (skip days on which nothing fitted)
    non_empty_days = [day_df for day_df in all_recommendations if not day_df.empty]
    if not non_empty_days:
        # Nothing to recommend: return a well-formed empty result without training
        empty_columns = list(places.columns) + ['Distance', 'Day', 'CF_Predicted_Rating']
        return pd.DataFrame(columns=empty_columns), daily_prices, daily_hours, float('nan')
    final_recommendations = pd.concat(non_empty_days).reset_index(drop=True)

    if ratings.empty:
        # No rating data to learn from: keep the "unknown place" sentinel of 0
        final_recommendations['CF_Predicted_Rating'] = 0.0
        return final_recommendations, daily_prices, daily_hours, float('nan')

    # Step 5: Normalize ratings to [0, 1] on a private copy (caller's frame stays intact)
    cf_ratings = ratings[['User_Id', 'Place_Id', 'Place_Ratings']].copy()
    min_rating = cf_ratings['Place_Ratings'].min()
    max_rating = cf_ratings['Place_Ratings'].max()
    rating_span = max_rating - min_rating
    if rating_span == 0:
        rating_span = 1  # all ratings identical: avoid dividing by zero
    cf_ratings['Place_Ratings'] = (cf_ratings['Place_Ratings'] - min_rating) / rating_span

    # Step 6: Collaborative Filtering (CF) model
    user_ids = cf_ratings['User_Id'].unique().tolist()
    place_ids = cf_ratings['Place_Id'].unique().tolist()

    user_id_to_index = {x: i for i, x in enumerate(user_ids)}
    place_id_to_index = {x: i for i, x in enumerate(place_ids)}

    num_users = len(user_ids)
    num_places = len(place_ids)
    embedding_size = 50

    cf_model = _build_cf_model(num_users, num_places, embedding_size)

    # Train the CF model
    cf_model.fit(
        [
            cf_ratings['User_Id'].map(user_id_to_index).to_numpy(),
            cf_ratings['Place_Id'].map(place_id_to_index).to_numpy(),
        ],
        cf_ratings['Place_Ratings'].to_numpy(),
        epochs=20,  # Increase epochs if necessary
        verbose=1
    )

    # Step 7: Integrate CF predictions and reverse normalization.
    # Each place's score is the mean prediction over all known users. Places
    # without any rating keep the sentinel value 0.
    cf_predictions = np.zeros(len(final_recommendations), dtype=float)
    known = [
        (position, place_id_to_index[place_id])
        for position, place_id in enumerate(final_recommendations['Place_Id'])
        if place_id in place_id_to_index
    ]
    if known:
        positions = np.array([position for position, _ in known])
        place_indices = np.array([index for _, index in known])
        # One batched call: every (user, place) pair, grouped place by place
        user_grid = np.tile(np.arange(num_users), len(place_indices))
        place_grid = np.repeat(place_indices, num_users)
        scores = cf_model.predict([user_grid, place_grid], verbose=0)
        scores = np.asarray(scores).reshape(len(place_indices), num_users)
        cf_predictions[positions] = scores.mean(axis=1) * rating_span + min_rating

    final_recommendations['CF_Predicted_Rating'] = cf_predictions

    # Calculate Mean Absolute Error (MAE): per recommended place, compare the
    # observed mean rating with the CF-predicted mean rating (both on the
    # original rating scale, paired by Place_Id).
    observed_mean = ratings.groupby('Place_Id')['Place_Ratings'].mean()
    has_ratings = final_recommendations['Place_Id'].isin(observed_mean.index)
    if has_ratings.any():
        actual_ratings = final_recommendations.loc[has_ratings, 'Place_Id'].map(observed_mean).astype(float)
        predicted_ratings = final_recommendations.loc[has_ratings, 'CF_Predicted_Rating'].astype(float)
        mae = mean_absolute_error(actual_ratings, predicted_ratings)
    else:
        mae = float('nan')

    return final_recommendations, daily_prices, daily_hours, mae

In [27]:
# Define the test function
def test_recommendation_system():
    """
    Smoke-test the itinerary recommender on a two-day Jakarta trip and print
    the recommendations, per-day cost, per-day hours and the MAE.
    """
    # Call the recommendation function.
    # Copies are passed so the notebook-level frames can never be altered by
    # the call, which keeps this cell safe to re-run.
    final_recommendations, daily_prices, daily_hours, mae = recommend_places_with_time_and_budget_unique(
        city = 'Jakarta',
        categories = ['Taman Hiburan', 'Budaya'],
        days = 2,
        max_hours_per_day = 8,
        start_lat = -6.122128,
        start_long = 106.836431,
        ratings = ratings.copy(),
        places = place.copy(),
        budget = 20000
    )

    # Print the results
    print("Final Recommendations:")
    print(final_recommendations)
    print("\nDaily Prices:")
    print(daily_prices)
    print("\nDaily Hours:")
    print(daily_hours)
    print("\nMean Absolute Error (MAE):")
    print(mae)

# Run the test
test_recommendation_system()

Epoch 1/20




[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 908us/step - loss: 0.3576
Epoch 2/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 862us/step - loss: 0.2274
Epoch 3/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step - loss: 0.1282
Epoch 4/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 907us/step - loss: 0.0902
Epoch 5/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0694
Epoch 6/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 929us/step - loss: 0.0508
Epoch 7/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 856us/step - loss: 0.0348
Epoch 8/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 894us/step - loss: 0.0241
Epoch 9/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 855us/step - loss: 0.0167
Epoch 10/20
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 871us/st