## Import library

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

## Read dataset

In [None]:
df = pd.read_csv('/content/list_wisata_bangkit.csv')
df.head()

Unnamed: 0,jenis,nama,rating,deskripsi
0,rekreasi,Pulau kemaro,4.5,Pulau yang dipenuhi pohon hijau di dalam delta...
1,rekreasi,Benteng kuto besak,4.5,"Benteng, museum bersejarah, & tempat hangout d..."
2,rekreasi,Keliling sungai musi,4.6,Keliling sungai musi menggunakan perahu ketek ...
3,kuliner,Warung terapung,4.3,Tempat makan terapung yang berada di sungai mu...
4,kuliner,Riverside,4.6,Restaurant yang berlokasi di monpera berada di...


## Data cleaning

In [None]:
def remove_excessive_spaces(value):
    if isinstance(value, str):
        return ' '.join(value.split())
    else:
        return value

In [None]:
# Apply the function to all elements in the DataFrame
df_cleaned = df.applymap(remove_excessive_spaces)

In [None]:
df_cleaned.head()

Unnamed: 0,jenis,nama,rating,deskripsi
0,rekreasi,Pulau kemaro,4.5,Pulau yang dipenuhi pohon hijau di dalam delta...
1,rekreasi,Benteng kuto besak,4.5,"Benteng, museum bersejarah, & tempat hangout d..."
2,rekreasi,Keliling sungai musi,4.6,Keliling sungai musi menggunakan perahu ketek ...
3,kuliner,Warung terapung,4.3,Tempat makan terapung yang berada di sungai mu...
4,kuliner,Riverside,4.6,Restaurant yang berlokasi di monpera berada di...


## Encoding the type

In [None]:
label_encoder = LabelEncoder()
df['jenis_encoded'] = label_encoder.fit_transform(df['jenis'])

In [None]:
df.head()

Unnamed: 0,jenis,nama,rating,deskripsi,jenis_encoded
0,rekreasi,Pulau kemaro,4.5,Pulau yang dipenuhi pohon hijau di dalam delta...,3
1,rekreasi,Benteng kuto besak,4.5,"Benteng, museum bersejarah, & tempat hangout d...",3
2,rekreasi,Keliling sungai musi,4.6,Keliling sungai musi menggunakan perahu ketek ...,3
3,kuliner,Warung terapung,4.3,Tempat makan terapung yang berada di sungai mu...,2
4,kuliner,Riverside,4.6,Restaurant yang berlokasi di monpera berada di...,2


## Make Vectorization

In [None]:
# TF-IDF Vectorization for 'deskripsi'
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_deskripsi = tfidf_vectorizer.fit_transform(df['deskripsi'])

In [None]:
# Combine TF-IDF features with 'type_encoded'
combined_features = pd.concat([pd.DataFrame(tfidf_matrix_deskripsi.toarray()), df['jenis_encoded']], axis=1)

## Compute the cosine similarity

In [None]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(combined_features, combined_features)
cosine_sim[0]

array([1.        , 0.91693916, 0.91529779, 0.88078422, 0.88044036,
       0.85706787, 0.67858943, 0.68144308, 0.90627572, 0.90116352,
       0.90096357, 0.90715014, 0.9011444 , 0.90661798, 0.90139116,
       0.01452646, 0.00364177, 0.00635016, 0.00627871])

### Save the cosine matrix to csv

In [None]:
cos_sim_df = pd.DataFrame(cosine_sim)
cos_sim_df.to_csv("destination_similarities.csv")
df_cleaned.to_csv('destination_cleaned.csv')

In [None]:
cosine_matrix = pd.read_csv('/content/destination_similarities.csv', header=0, index_col=0)
cosine_matrix.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.0,0.916939,0.915298,0.880784,0.88044,0.857068,0.678589,0.681443,0.906276,0.901164,0.900964,0.90715,0.901144,0.906618,0.901391,0.014526,0.003642,0.00635,0.006279
1,0.916939,1.0,0.910528,0.896653,0.872808,0.859403,0.693972,0.713994,0.905043,0.9,0.9,0.906137,0.9,0.9,0.9,0.021104,0.0,0.02015,0.006369
2,0.915298,0.910528,1.0,0.886432,0.883144,0.855098,0.676797,0.683621,0.906195,0.902114,0.901751,0.906654,0.90208,0.902629,0.902528,0.015302,0.006618,0.004885,0.00483
3,0.880784,0.896653,0.886432,1.0,0.908457,0.833011,0.645058,0.692589,0.872254,0.852986,0.85222,0.862558,0.852913,0.854072,0.853858,0.055224,0.013953,0.070101,0.010185
4,0.88044,0.872808,0.883144,0.908457,1.0,0.826491,0.63606,0.666008,0.867477,0.851956,0.851367,0.857592,0.8519,0.852792,0.852627,0.016514,0.01073,0.0,0.0


In [None]:
cosine_matrix = np.array(cosine_matrix)

In [None]:
cosine_matrix[0]

array([1.        , 0.91693916, 0.91529779, 0.88078422, 0.88044036,
       0.85706787, 0.67858943, 0.68144308, 0.90627572, 0.90116352,
       0.90096357, 0.90715014, 0.9011444 , 0.90661798, 0.90139116,
       0.01452646, 0.00364177, 0.00635016, 0.00627871])

## Make function recommendation

In [None]:
def get_recommendations(user_preferences, cosine_sim_matrix, df):
    # Filter destinations based on user preferences
    filtered_df = df[df['jenis'].isin(user_preferences)]

    if filtered_df.empty:
        return []  # If no destinations match the user preferences, return an empty list

    # Get the indices of the filtered destinations
    indices = filtered_df.index.tolist()

    # Get recommendations based on the first destination in the filtered dataset
    idx = indices[0]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]  # Get the top 3 similar destinations (excluding itself)

    destination_indices = [i[0] for i in sim_scores]
    return df['nama'].iloc[destination_indices].tolist()

## Testing the reccomendation

In [None]:
preference = ['rekreasi']

In [None]:
# Example: Get recommendations for 'Pulau kemaro'
recommendations = get_recommendations(preference, cosine_matrix, df_cleaned)
print(recommendations)

['Benteng kuto besak', 'Keliling sungai musi', 'Taman Kambang Iwak Besak']
