In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/Rekomendasi_Pariwisata

/content/drive/MyDrive/Rekomendasi_Pariwisata


In [None]:
# Install required packages
!pip install sastrawi tensorflowjs



In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, losses, optimizers
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import random


In [None]:

# ===============================
df = pd.read_csv('dataset_jogja_pre.csv')
df

Unnamed: 0,no,nama,vote_average,vote_count,htm_weekday,htm_weekend,latitude,longitude,type_clean_Agrowisata,type_clean_Alam,...,type_clean_Desa Wisata,type_clean_Kuliner,type_clean_Minat Khusus,type_clean_Museum,type_clean_Pantai,type_clean_Pendidikan,type_clean_Religi,type_clean_Seni,type_clean_Wisata Air,description_clean
0,9,Candi Borobudur,4.7,81922,50000.0,50000.0,-7.607087,110.203623,0,0,...,0,0,0,0,0,0,0,0,0,candi yang pernah masuk sebagai salah satu dar...
1,10,Candi Prambanan,4.7,71751,50000.0,50000.0,-7.751835,110.491532,0,0,...,0,0,0,0,0,0,0,0,0,candi prambanan adalah kompleks candi hindu te...
2,24,Tebing Breksi,4.4,51431,10000.0,10000.0,-7.781477,110.504576,0,1,...,0,0,0,0,0,0,0,0,0,tebing breksi merupakan tempat wisata yang ber...
3,343,Gembira Loka Zoo,4.5,36337,20000.0,25000.0,-7.806234,110.396798,0,0,...,0,0,0,0,0,0,0,0,0,gambira loka adalah kebun binatang yang berada...
4,346,The Palace of Yogyakarta (Keraton Yogyakarta),4.6,30091,8000.0,8000.0,-7.805284,110.364203,0,0,...,0,0,0,0,0,0,0,0,0,kompleks keraton merupakan museum yang menyimp...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,139,Pemancingan Adi Winata,5.0,1,0.0,0.0,-7.704577,110.512011,0,0,...,0,0,0,0,0,0,0,0,1,tempat rekreasi keluarga yang menawarkan fasil...
472,105,Ruang Perawatan Jenderal Soedirman,5.0,1,4000.0,4000.0,-7.776474,110.376744,0,0,...,0,0,0,0,0,0,0,0,0,situs sejarah berupa ruangan tempat jenderal s...
473,110,Situs Gedong Pusoko,5.0,1,15000.0,15000.0,-7.807846,110.403758,0,0,...,0,0,0,0,0,0,0,0,0,situs arkeologi yang menyimpan tinggalan sejar...
474,164,Taman Edukasi dan Outbound Sunan Kalijaga,5.0,1,0.0,0.0,-7.809207,110.413252,0,0,...,0,0,0,0,0,0,0,0,0,area edukatif yang menyediakan kegiatan outbou...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   no                             476 non-null    int64  
 1   nama                           476 non-null    object 
 2   vote_average                   476 non-null    float64
 3   vote_count                     476 non-null    int64  
 4   htm_weekday                    476 non-null    float64
 5   htm_weekend                    476 non-null    float64
 6   latitude                       476 non-null    float64
 7   longitude                      476 non-null    float64
 8   type_clean_Agrowisata          476 non-null    int64  
 9   type_clean_Alam                476 non-null    int64  
 10  type_clean_Buatan              476 non-null    int64  
 11  type_clean_Budaya Dan Sejarah  476 non-null    int64  
 12  type_clean_Desa Wisata         476 non-null    int

In [None]:
# Preprocessing teks
descriptions = df['description_clean'].astype(str).tolist()
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(descriptions)
sequences = tokenizer.texts_to_sequences(descriptions)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=100)

In [None]:
# Buat pasangan data mirip dan tidak mirip
def create_pairs(data):
    pairs = []
    labels = []
    n = len(data)
    for i in range(n):
        # pasangan mirip (label=1)
        j = random.choice([x for x in range(n) if x != i])
        pairs.append([data[i], data[j]])
        labels.append(1 if df.iloc[i]['nama'][:4] == df.iloc[j]['nama'][:4] else 0)  # asumsi mirip jika nama mirip
    return np.array(pairs), np.array(labels)

pairs, labels = create_pairs(padded_sequences)

In [None]:
# ===============================
# 4. FEATURE ENGINEERING
# ===============================
tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['description_clean'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

type_columns = [col for col in df.columns if col.startswith('type_clean_')]
type_df = df[type_columns]

numeric_features = ['vote_average', 'vote_count', 'htm_weekday', 'htm_weekend']
numeric_df = pd.DataFrame(StandardScaler().fit_transform(df[numeric_features]), columns=numeric_features)

def get_tourism_types(row):
    return [col.replace('type_clean_', '').replace('_', ' ') for col in type_columns if row[col] == 1]

df['tourism_types'] = df.apply(get_tourism_types, axis=1)

all_features = pd.concat([numeric_df, type_df, tfidf_df], axis=1)
feature_dim = all_features.shape[1]

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(pairs, labels, test_size=0.2, random_state=42)

In [None]:
# Model base embedding
def build_base_network(input_shape):
    input = tf.keras.Input(shape=input_shape)
    x = layers.Embedding(input_dim=5000, output_dim=64, input_length=input_shape[0])(input)
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(64)(x)
    model = models.Model(input, x)
    return model

In [None]:
# Jarak cosine sebagai metrik kesamaan
def cosine_distance(vectors):
    x, y = vectors
    x = tf.math.l2_normalize(x, axis=1)
    y = tf.math.l2_normalize(y, axis=1)
    return 1 - tf.reduce_sum(x * y, axis=1, keepdims=True)


In [None]:
# Jaringan siamese
input_shape = (100,)
base_network = build_base_network(input_shape)

input_a = tf.keras.Input(shape=input_shape)
input_b = tf.keras.Input(shape=input_shape)

processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = layers.Lambda(cosine_distance)([processed_a, processed_b])
model = models.Model([input_a, input_b], distance)



In [None]:
# Loss dan compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'mae'])


# Latih model
model.fit([X_train[:, 0], X_train[:, 1]], y_train, batch_size=32, epochs=10, validation_split=0.2)


Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 283ms/step - accuracy: 0.9786 - loss: 0.1150 - mae: 0.0561 - val_accuracy: 0.9737 - val_loss: 0.1061 - val_mae: 0.0395
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step - accuracy: 0.9820 - loss: 0.0486 - mae: 0.0263 - val_accuracy: 0.9737 - val_loss: 0.0965 - val_mae: 0.0426
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - accuracy: 0.9864 - loss: 0.0266 - mae: 0.0214 - val_accuracy: 0.9737 - val_loss: 0.0912 - val_mae: 0.0534
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 203ms/step - accuracy: 1.0000 - loss: 0.0305 - mae: 0.0271 - val_accuracy: 0.9737 - val_loss: 0.0994 - val_mae: 0.0682
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 176ms/step - accuracy: 1.0000 - loss: 0.0198 - mae: 0.0203 - val_accuracy: 0.9737 - val_loss: 0.0938 - val_mae: 0.0538
Epoch 6/10
[1m10/10[0m [32m

<keras.src.callbacks.history.History at 0x7ddde39e6010>

In [None]:
# Evaluasi
loss, acc, mae = model.evaluate([X_test[:, 0], X_test[:, 1]], y_test)
print(f"Loss: {loss}, Accuracy: {acc},mae :{mae}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.9727 - loss: 0.0859 - mae: 0.0569
Loss: 0.10642115026712418, Accuracy: 0.96875,mae :0.05853477120399475


In [None]:
# Dapatkan embedding semua deskripsi dari model
embedding_model = models.Model(inputs=base_network.input, outputs=base_network.output)
all_embeddings = embedding_model.predict(padded_sequences)

# Fungsi rekomendasi berdasarkan cosine similarity antar embedding
def get_recommendations(place_name, top_n=5):
    if place_name not in df['nama'].values:
        print("Nama tempat tidak ditemukan.")
        return None

    idx = df[df['nama'] == place_name].index[0]
    place_embedding = all_embeddings[idx].reshape(1, -1)

    similarities = cosine_similarity(place_embedding, all_embeddings)[0]
    sim_indices = similarities.argsort()[::-1][1:top_n+1]

    # Ambil semua kolom yang merupakan tipe
    type_columns = [col for col in df.columns if col.startswith('type_clean_')]

    # Ambil kolom yang ingin ditampilkan
    hasil = df.iloc[sim_indices][['nama', 'description_clean', 'htm_weekday', 'htm_weekend'] + type_columns].copy()
    hasil['similarity'] = similarities[sim_indices]

    # Buat kolom type sebagai list dari type_columns yang bernilai 1
    def ambil_tipe(row):
        return [col.replace('type_clean_', '').replace('_', ' ') for col in type_columns if row[col] == 1]

    hasil['type'] = hasil.apply(ambil_tipe, axis=1)
    hasil = hasil[['nama', 'description_clean', 'type', 'htm_weekday', 'htm_weekend', 'similarity']]

    return hasil.reset_index(drop=True)


# Contoh penggunaan
print(get_recommendations('Candi Prambanan'))


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
                       nama  \
0           Pantai Ngandong   
1   Kampung wisata wonorejo   
2     Monumen Yogya Kembali   
3      Desa Wisata Pulesari   
4  Ijo Temple Wooden Mosque   

                                   description_clean           type  \
0  pantai ini memiliki garis pantai yang panjang ...       [Pantai]   
1  desa wisata yang menyuguhkan budaya lokal kera...   [Agrowisata]   
2  museum monumen jogja kembali adalah sebuah mus...       [Museum]   
3  suasana khas pedesaan akan segera menyambut pa...  [Desa Wisata]   
4  pasoedjoedan al liwung atau yang dikenal denga...       [Religi]   

   htm_weekday  htm_weekend  similarity  
0      10000.0      10000.0    0.984847  
1      20000.0      20000.0    0.984444  
2      15000.0      15000.0    0.982801  
3      50000.0      50000.0    0.980051  
4          0.0          0.0    0.979913  


In [None]:
def get_recommendations_by_keyword(keyword, top_n=5):
    # Tokenisasi dan padding keyword
    keyword_seq = tokenizer.texts_to_sequences([keyword])
    keyword_pad = pad_sequences(keyword_seq, maxlen=padded_sequences.shape[1])

    # Dapatkan embedding keyword dari model
    keyword_embedding = embedding_model.predict(keyword_pad)

    # Hitung cosine similarity antara keyword embedding dan semua deskripsi
    similarities = cosine_similarity(keyword_embedding, all_embeddings)[0]

    # Urutkan berdasarkan similarity tertinggi
    sim_indices = similarities.argsort()[::-1][:top_n]

    # Ambil kolom tipe
    type_columns = [col for col in df.columns if col.startswith('type_clean_')]

    # Ambil hasil teratas
    hasil = df.iloc[sim_indices][['nama', 'description_clean', 'htm_weekday', 'htm_weekend'] + type_columns].copy()
    hasil['similarity'] = similarities[sim_indices]

    # Ubah kolom tipe menjadi list
    def ambil_tipe(row):
        return [col.replace('type_clean_', '').replace('_', ' ') for col in type_columns if row[col] == 1]

    hasil['type'] = hasil.apply(ambil_tipe, axis=1)

    # Pilih kolom akhir
    hasil = hasil[['nama', 'description_clean', 'type', 'htm_weekday', 'htm_weekend', 'similarity']]

    return hasil.reset_index(drop=True)


In [None]:
get_recommendations_by_keyword("nyaman", top_n=5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step


Unnamed: 0,nama,description_clean,type,htm_weekday,htm_weekend,similarity
0,Ruang Perawatan Jenderal Soedirman,situs sejarah berupa ruangan tempat jenderal s...,[Budaya Dan Sejarah],4000.0,4000.0,0.770215
1,Pantai Kuwaru,pantai kuwaru terkenal dengan wisata yang terk...,[Pantai],10000.0,10000.0,0.769571
2,MUSEUM GARUDA,museum yang menampilkan koleksi beragam tentan...,[Budaya Dan Sejarah],0.0,0.0,0.747598
3,Kori Agung Masjid Gedhe Mataram Kotagede,gerbang agung penuh nilai sejarah yang menjadi...,[Religi],0.0,0.0,0.746033
4,Curug Banyunibo,air terjun tersembunyi yang berada tak jauh da...,[Wisata Air],0.0,0.0,0.740923


In [None]:
def get_recommendations_by_category_and_rating(category_keyword, min_rating=4.0, top_n=3):
    # Rentang rating
    max_rating = min_rating + 0.5

    # Normalisasi nama kolom untuk pencocokan longgar
    def normalize(text):
        return text.strip().lower().replace("_", " ").replace("-", " ")

    normalized_input = normalize(category_keyword)

    # Temukan kolom yang cocok
    category_col = None
    for col in df.columns:
        if col.startswith("type_clean_"):
            col_name = col.replace("type_clean_", "")
            if normalize(col_name) == normalized_input:
                category_col = col
                break

    if category_col is None:
        print(f"Kategori '{category_keyword}' tidak ditemukan.")
        return None

    # Filter berdasarkan kategori dan rating rentang
    filtered_df = df[
        (df[category_col] == 1) &
        (df['vote_average'] >= min_rating) &
        (df['vote_average'] < max_rating)
    ]

    if filtered_df.empty:
        print("Tidak ada wisata yang sesuai dengan kategori dan rentang rating.")
        return None

    # Ambil wisata terbaik
    hasil = filtered_df.sort_values(by='vote_average', ascending=False).head(top_n)

    # Kolom tipe
    type_columns = [col for col in df.columns if col.startswith('type_clean_')]

    def ambil_tipe(row):
        return [col.replace('type_clean_', '').replace('_', ' ') for col in type_columns if row[col] == 1]

    hasil['type'] = hasil.apply(ambil_tipe, axis=1)

    # Kolom akhir
    hasil = hasil[['nama', 'description_clean', 'type', 'vote_average', 'vote_count', 'htm_weekday', 'htm_weekend']]

    return hasil.reset_index(drop=True)


In [None]:
get_recommendations_by_category_and_rating("Wisata air", min_rating=5.0)

Unnamed: 0,nama,description_clean,type,vote_average,vote_count,htm_weekday,htm_weekend
0,"Desa Wisata ""Bedog Ilir""",desa wisata yang menggabungkan nuansa pedesaan...,[Wisata Air],5.0,9,0.0,0.0
1,Desa Wisata Kembang Wonderful,desa wisata dengan keindahan alam dan kegiatan...,[Wisata Air],5.0,8,0.0,0.0
2,Wisata Jaga Bendung,wisata air dengan suasana alami di sekitar ben...,[Wisata Air],5.0,3,0.0,0.0
