**Travel Recommendation System + Itinerary Generator**

This dataset contains several tourist attractions in 5 major cities in Indonesia, namely:
**Jakarta, Yogyakarta, Semarang, Bandung, Surabaya**  

The dataset consists of 4 files:

- __tourism_with_id.csv__: information on ~400 tourist attractions across the 5 major Indonesian cities.
- __user.csv__: dummy user data, used to build user-based recommendation features.
- __tourism_rating.csv__: 3 columns (the user, the place, and the rating given), used to build a rating-based recommendation system.
- __package_tourism.csv__: recommended groups of nearby places, based on time, cost, and rating.

In [7]:
# Preparing the dataset
# Import necessary libraries
# Data Processing
import warnings
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd


In [9]:
# Phase 1: Data Integration — Combine All Datasets
# Goal: Create a unified dataset of users, their rated places, and place details.
# Steps:
#   1. Merge tourism_rating.csv with tourism_with_id.csv → match ratings to place info
#   2. Join with user.csv → add user location and age
#   3. Clean the result for ML or recommendation use

# Step 1: Read the data
# The dataset folder is defined once so the notebook can be pointed at another
# location by editing a single line. The default is the original local folder.
DATA_DIR = Path("/Users/roshan_thakur/Desktop/Intro to Artificial Intelligence/Module-7-Project/Dataset-Vacation-Planner/Indonesia-Tourism")

users = pd.read_csv(DATA_DIR / "user.csv")
tourism_places_info = pd.read_csv(DATA_DIR / "tourism_with_id.csv")
tourism_rating = pd.read_csv(DATA_DIR / "tourism_rating.csv")
package_tourism = pd.read_csv(DATA_DIR / "package_tourism.csv")

In [11]:
# Preview the first rows of the user table (head() defaults to 5 rows).
users.head()

Unnamed: 0,User_Id,Location,Age
0,1,"Semarang, Jawa Tengah",20
1,2,"Bekasi, Jawa Barat",21
2,3,"Cirebon, Jawa Barat",23
3,4,"Bekasi, Jawa Barat",21
4,5,"Lampung, Sumatera Selatan",20


In [13]:
# Preview the first rows of the places table (head() defaults to 5 rows).
tourism_places_info.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Unnamed: 11,Unnamed: 12
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153,,1
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125,,2
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538,,3
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156,,4
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134,,5


In [15]:
# Drop the two unnamed trailing columns that are not used in the analysis.
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to the same name, re-running the cell would otherwise raise KeyError
# once the columns are already gone.
tourism_places_info = tourism_places_info.drop(
    columns=['Unnamed: 11', 'Unnamed: 12'],
    errors='ignore',
)
tourism_places_info.head(2)

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125


In [17]:
# Preview the first rows of the ratings table (head() defaults to 5 rows).
tourism_rating.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


In [19]:
# Preview the first rows of the package table (head() defaults to 5 rows).
package_tourism.head()

Unnamed: 0,Package,City,Place_Tourism1,Place_Tourism2,Place_Tourism3,Place_Tourism4,Place_Tourism5
0,1,Jakarta,Pasar Tanah Abang,Taman Ayodya,Museum Tekstil,,
1,2,Jakarta,Pasar Tanah Abang,Pasar Taman Puring,Pasar Petak Sembilan,,
2,3,Jakarta,Perpustakaan Nasional,Monas,Masjid Istiqlal,,
3,4,Jakarta,Pulau Tidung,Pulau Bidadari,Pulau Pari,Pulau Pramuka,Pulau Pelangi
4,5,Jakarta,Museum Satria Mandala,Museum Wayang,Museum Bahari Jakarta,Museum Macan (Modern and Contemporary Art in N...,


In [21]:
# Headline sizes: distinct places, distinct users, and total rating rows.
# dropna=False makes nunique() count exactly what len(unique()) counts.
print(f"Number of places in the datasets : {tourism_places_info['Place_Id'].nunique(dropna=False)}")
print(f"Number of users : {users['User_Id'].nunique(dropna=False)}")
print(f"The number of ratings given by the user to the dataset : {len(tourism_rating)}")

Number of places in the datasets : 437
Number of users : 300
The number of ratings given by the user to the dataset : 10000


In [23]:
# Exploratory Data Analysis
# Column dtypes and non-null counts for the places table; this is where
# partially filled columns show up.
tourism_places_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Place_Id      437 non-null    int64  
 1   Place_Name    437 non-null    object 
 2   Description   437 non-null    object 
 3   Category      437 non-null    object 
 4   City          437 non-null    object 
 5   Price         437 non-null    int64  
 6   Rating        437 non-null    float64
 7   Time_Minutes  205 non-null    float64
 8   Coordinate    437 non-null    object 
 9   Lat           437 non-null    float64
 10  Long          437 non-null    float64
dtypes: float64(4), int64(2), object(5)
memory usage: 37.7+ KB


In [25]:
# Missing values per column in the places table.
tourism_places_info.isna().sum()

Place_Id          0
Place_Name        0
Description       0
Category          0
City              0
Price             0
Rating            0
Time_Minutes    232
Coordinate        0
Lat               0
Long              0
dtype: int64

In [27]:
# Distinct category labels present in the places table.
tourism_places_info["Category"].unique()

array(['Budaya', 'Taman Hiburan', 'Cagar Alam', 'Bahari',
       'Pusat Perbelanjaan', 'Tempat Ibadah'], dtype=object)

In [29]:
# Column dtypes and non-null counts for the ratings table.
tourism_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   User_Id        10000 non-null  int64
 1   Place_Id       10000 non-null  int64
 2   Place_Ratings  10000 non-null  int64
dtypes: int64(3)
memory usage: 234.5 KB


In [31]:
# Missing values per column in the ratings table.
tourism_rating.isna().sum()

User_Id          0
Place_Id         0
Place_Ratings    0
dtype: int64

In [33]:
# Column dtypes and non-null counts for the user table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   User_Id   300 non-null    int64 
 1   Location  300 non-null    object
 2   Age       300 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.2+ KB


In [35]:
# Missing values per column in the user table.
users.isna().sum()

User_Id     0
Location    0
Age         0
dtype: int64

In [37]:
# Column dtypes and non-null counts for the package table.
package_tourism.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Package         100 non-null    int64 
 1   City            100 non-null    object
 2   Place_Tourism1  100 non-null    object
 3   Place_Tourism2  100 non-null    object
 4   Place_Tourism3  100 non-null    object
 5   Place_Tourism4  66 non-null     object
 6   Place_Tourism5  39 non-null     object
dtypes: int64(1), object(6)
memory usage: 5.6+ KB


In [39]:
# Data Preprocessing
# tourism_all: sorted array of every unique Place_Id that appears in either
# tourism_places_info or tourism_rating.
# np.union1d already returns the sorted, de-duplicated union of its inputs, so
# no separate unique/concatenate/sort steps are needed. numpy is imported once
# in the imports cell at the top of the notebook.
tourism_all = np.union1d(
    tourism_places_info.Place_Id.to_numpy(),
    tourism_rating.Place_Id.to_numpy(),
)

print(f"Total number of tourism: {len(tourism_all)}")


Total number of tourism: 437


In [41]:
# Take an independent copy of the raw ratings. A plain assignment would only
# create a second name for the same DataFrame, so any later in-place change to
# all_tourism_ratings would also alter tourism_rating.
all_tourism_ratings = tourism_rating.copy()
all_tourism_ratings

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4
...,...,...,...
9995,300,425,2
9996,300,64,4
9997,300,311,3
9998,300,279,4


In [43]:
#Merge tourism_rating.csv with tourism_places_info.csv → match ratings to place info
#Join with user.csv → add user location and age

In [45]:
# Attach place details to every rating row.
# how='left' keeps every rating even if its Place_Id had no match.
# validate='many_to_one' makes pandas raise MergeError if Place_Id is ever
# duplicated in the places table, instead of silently multiplying rating rows.
tourism_combined = pd.merge(
    all_tourism_ratings,
    tourism_places_info[["Place_Id", "Place_Name", "Description", "City", "Category", "Price"]],
    on='Place_Id',
    how='left',
    validate='many_to_one',
)
tourism_combined

Unnamed: 0,User_Id,Place_Id,Place_Ratings,Place_Name,Description,City,Category,Price
0,1,179,3,Candi Ratu Boko,Situs Ratu Baka atau Candi Boko (Hanacaraka:ꦕꦤ...,Yogyakarta,Budaya,75000
1,1,344,2,Pantai Marina,"Pantai Marina (bahasa Jawa: ꦥꦱꦶꦱꦶꦂ​ꦩꦫꦶꦤ, trans...",Semarang,Bahari,3000
2,1,5,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Jakarta,Taman Hiburan,94000
3,1,373,3,Museum Kereta Ambarawa,Museum Kereta Api Ambarawa (bahasa Inggris: In...,Semarang,Budaya,10000
4,1,101,4,Kampung Wisata Sosro Menduran,Kampung wisata Sosromenduran merupakan kampung...,Yogyakarta,Budaya,0
...,...,...,...,...,...,...,...,...
9995,300,425,2,Waterpark Kenjeran Surabaya,Waterpark Kenjeran Surabaya merupakan wisata k...,Surabaya,Taman Hiburan,35000
9996,300,64,4,Museum Sasmita Loka Ahmad Yani,Museum Sasmita Loka Ahmad Yani adalah salah sa...,Jakarta,Budaya,2000
9997,300,311,3,The Lodge Maribaya,The Lodge Maribaya adalah salah satu tempat wi...,Bandung,Cagar Alam,25000
9998,300,279,4,Masjid Agung Trans Studio Bandung,Masjid Agung Trans Studio Bandung (TSB) berdir...,Bandung,Tempat Ibadah,0


In [47]:
# Add a 'city_category' column to tourism_combined by joining the 'City' and
# 'Category' values with a single space (e.g. "Jakarta Budaya").
# It is useful for grouping, visualization, or text-based filtering, for
# example as input to a TF-IDF model or as tags for search / recommendation.
# Vectorized string concatenation replaces the row-by-row ' '.join; a row with
# a missing City or Category now yields NaN instead of raising TypeError.
tourism_combined['city_category'] = tourism_combined['City'] + ' ' + tourism_combined['Category']

In [49]:
# Display the merged table, now including the city_category column.
tourism_combined

Unnamed: 0,User_Id,Place_Id,Place_Ratings,Place_Name,Description,City,Category,Price,city_category
0,1,179,3,Candi Ratu Boko,Situs Ratu Baka atau Candi Boko (Hanacaraka:ꦕꦤ...,Yogyakarta,Budaya,75000,Yogyakarta Budaya
1,1,344,2,Pantai Marina,"Pantai Marina (bahasa Jawa: ꦥꦱꦶꦱꦶꦂ​ꦩꦫꦶꦤ, trans...",Semarang,Bahari,3000,Semarang Bahari
2,1,5,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Jakarta,Taman Hiburan,94000,Jakarta Taman Hiburan
3,1,373,3,Museum Kereta Ambarawa,Museum Kereta Api Ambarawa (bahasa Inggris: In...,Semarang,Budaya,10000,Semarang Budaya
4,1,101,4,Kampung Wisata Sosro Menduran,Kampung wisata Sosromenduran merupakan kampung...,Yogyakarta,Budaya,0,Yogyakarta Budaya
...,...,...,...,...,...,...,...,...,...
9995,300,425,2,Waterpark Kenjeran Surabaya,Waterpark Kenjeran Surabaya merupakan wisata k...,Surabaya,Taman Hiburan,35000,Surabaya Taman Hiburan
9996,300,64,4,Museum Sasmita Loka Ahmad Yani,Museum Sasmita Loka Ahmad Yani adalah salah sa...,Jakarta,Budaya,2000,Jakarta Budaya
9997,300,311,3,The Lodge Maribaya,The Lodge Maribaya adalah salah satu tempat wi...,Bandung,Cagar Alam,25000,Bandung Cagar Alam
9998,300,279,4,Masjid Agung Trans Studio Bandung,Masjid Agung Trans Studio Bandung (TSB) berdir...,Bandung,Tempat Ibadah,0,Bandung Tempat Ibadah


In [51]:
## Data Preparation
## Count missing values per column of the merged ratings/places table

tourism_combined.isna().sum()

User_Id          0
Place_Id         0
Place_Ratings    0
Place_Name       0
Description      0
City             0
Category         0
Price            0
city_category    0
dtype: int64

In [53]:
# Reduce the rating-level table to one row per place. The first row seen for
# each Place_Id is kept, so User_Id and Place_Ratings in the result come from
# that single rating row.
tourism_cleaned = tourism_combined.drop_duplicates(subset="Place_Id", keep="first")
tourism_cleaned

Unnamed: 0,User_Id,Place_Id,Place_Ratings,Place_Name,Description,City,Category,Price,city_category
0,1,179,3,Candi Ratu Boko,Situs Ratu Baka atau Candi Boko (Hanacaraka:ꦕꦤ...,Yogyakarta,Budaya,75000,Yogyakarta Budaya
1,1,344,2,Pantai Marina,"Pantai Marina (bahasa Jawa: ꦥꦱꦶꦱꦶꦂ​ꦩꦫꦶꦤ, trans...",Semarang,Bahari,3000,Semarang Bahari
2,1,5,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Jakarta,Taman Hiburan,94000,Jakarta Taman Hiburan
3,1,373,3,Museum Kereta Ambarawa,Museum Kereta Api Ambarawa (bahasa Inggris: In...,Semarang,Budaya,10000,Semarang Budaya
4,1,101,4,Kampung Wisata Sosro Menduran,Kampung wisata Sosromenduran merupakan kampung...,Yogyakarta,Budaya,0,Yogyakarta Budaya
...,...,...,...,...,...,...,...,...,...
2008,62,370,1,Benteng Pendem,Benteng Pendem Cilacap (bahasa Belanda: Kustba...,Semarang,Budaya,5000,Semarang Budaya
2399,74,350,4,Pantai Cipta,Pantai Cipta juga dikenal sebagai Pantai Petik...,Semarang,Bahari,5000,Semarang Bahari
2448,75,10,2,Pulau Tidung,Pulau Tidung adalah salah satu kelurahan di ke...,Jakarta,Bahari,150000,Jakarta Bahari
2534,78,7,4,Kebun Binatang Ragunan,Kebun Binatang Ragunan adalah sebuah kebun bin...,Jakarta,Cagar Alam,4000,Jakarta Cagar Alam


In [55]:
# Pull each place-level column of tourism_cleaned out as a plain Python list.
# The target names on the left line up one-to-one with the column names below.
(
    place_id,
    place_name,
    place_category,
    place_desc,
    place_city,
    city_category,
    price,
) = (
    tourism_cleaned[column].tolist()
    for column in (
        "Place_Id",
        "Place_Name",
        "Category",
        "Description",
        "City",
        "city_category",
        "Price",
    )
)


In [57]:
# Assemble the place-level table from the extracted lists, using short
# lowercase column names. Keyword order fixes the column order.
tourism_new = pd.DataFrame(
    dict(
        id=place_id,
        name=place_name,
        category=place_category,
        description=place_desc,
        city=place_city,
        city_category=city_category,
        price=price,
    )
)

tourism_new

Unnamed: 0,id,name,category,description,city,city_category,price
0,179,Candi Ratu Boko,Budaya,Situs Ratu Baka atau Candi Boko (Hanacaraka:ꦕꦤ...,Yogyakarta,Yogyakarta Budaya,75000
1,344,Pantai Marina,Bahari,"Pantai Marina (bahasa Jawa: ꦥꦱꦶꦱꦶꦂ​ꦩꦫꦶꦤ, trans...",Semarang,Semarang Bahari,3000
2,5,Atlantis Water Adventure,Taman Hiburan,Atlantis Water Adventure atau dikenal dengan A...,Jakarta,Jakarta Taman Hiburan,94000
3,373,Museum Kereta Ambarawa,Budaya,Museum Kereta Api Ambarawa (bahasa Inggris: In...,Semarang,Semarang Budaya,10000
4,101,Kampung Wisata Sosro Menduran,Budaya,Kampung wisata Sosromenduran merupakan kampung...,Yogyakarta,Yogyakarta Budaya,0
...,...,...,...,...,...,...,...
432,370,Benteng Pendem,Budaya,Benteng Pendem Cilacap (bahasa Belanda: Kustba...,Semarang,Semarang Budaya,5000
433,350,Pantai Cipta,Bahari,Pantai Cipta juga dikenal sebagai Pantai Petik...,Semarang,Semarang Bahari,5000
434,10,Pulau Tidung,Bahari,Pulau Tidung adalah salah satu kelurahan di ke...,Jakarta,Jakarta Bahari,150000
435,7,Kebun Binatang Ragunan,Cagar Alam,Kebun Binatang Ragunan adalah sebuah kebun bin...,Jakarta,Jakarta Cagar Alam,4000


In [59]:
# Distinct category labels in the place-level table, before translation.
tourism_new["category"].unique()

array(['Budaya', 'Bahari', 'Taman Hiburan', 'Cagar Alam',
       'Pusat Perbelanjaan', 'Tempat Ibadah'], dtype=object)

In [61]:
# Translate the Indonesian category labels into English.
#
# This is done with a single mapping and Series.replace, assigned back to the
# column. The earlier form, `tourism_new.category[mask] = value`, is chained
# assignment: pandas warns about it, and under Copy-on-Write it does not
# update tourism_new at all. The blanket warnings.filterwarnings('ignore')
# that hid that warning is no longer set here.
#
# 'Taman Hiburan' was listed twice with different targets; the first target is
# kept because it is the one that took effect (the second could never match).
#
# NOTE(review): 'Tempat Ibadah' has no English entry and is left unchanged, as
# before. Add it here if later cells do not rely on the Indonesian label.
# NOTE(review): the city_category column still holds the Indonesian labels.
CATEGORY_TRANSLATIONS = {
    'Taman Hiburan': 'Amusement Park & Downtown Attractions',
    'Budaya': 'Culture',
    'Cagar Alam': 'National Park',
    'Bahari': 'Marine Tourism',
    'Pusat Perbelanjaan': 'Shopping Center',
}

# Labels that are not keys of the mapping pass through untouched, so running
# this cell again leaves the column as it is.
tourism_new['category'] = tourism_new['category'].replace(CATEGORY_TRANSLATIONS)

In [64]:
# Display the place-level table after the category labels were translated.
tourism_new

Unnamed: 0,id,name,category,description,city,city_category,price
0,179,Candi Ratu Boko,Culture,Situs Ratu Baka atau Candi Boko (Hanacaraka:ꦕꦤ...,Yogyakarta,Yogyakarta Budaya,75000
1,344,Pantai Marina,Marine Tourism,"Pantai Marina (bahasa Jawa: ꦥꦱꦶꦱꦶꦂ​ꦩꦫꦶꦤ, trans...",Semarang,Semarang Bahari,3000
2,5,Atlantis Water Adventure,Amusement Park & Downtown Attractions,Atlantis Water Adventure atau dikenal dengan A...,Jakarta,Jakarta Taman Hiburan,94000
3,373,Museum Kereta Ambarawa,Culture,Museum Kereta Api Ambarawa (bahasa Inggris: In...,Semarang,Semarang Budaya,10000
4,101,Kampung Wisata Sosro Menduran,Culture,Kampung wisata Sosromenduran merupakan kampung...,Yogyakarta,Yogyakarta Budaya,0
...,...,...,...,...,...,...,...
432,370,Benteng Pendem,Culture,Benteng Pendem Cilacap (bahasa Belanda: Kustba...,Semarang,Semarang Budaya,5000
433,350,Pantai Cipta,Marine Tourism,Pantai Cipta juga dikenal sebagai Pantai Petik...,Semarang,Semarang Bahari,5000
434,10,Pulau Tidung,Marine Tourism,Pulau Tidung adalah salah satu kelurahan di ke...,Jakarta,Jakarta Bahari,150000
435,7,Kebun Binatang Ragunan,National Park,Kebun Binatang Ragunan adalah sebuah kebun bin...,Jakarta,Jakarta Cagar Alam,4000
