In [22]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [23]:
# LOADING THE TOURISM DATASET FROM A CSV FILE INTO A DATAFRAME  
Tourism_df = pd.read_csv("F:/DS/4th project Tourism Analysis/DATA/Final_Tourim_Table.csv")

In [24]:
Tourism_df.head()

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,ContinentId,RegionId,CountryId,...,Continent,Region,Country,CityName,Attraction,AttractionAddress,AttractionTypeId,AttractionCityId,AttractionType,VisitModeName
0,5661,14,2018,12,4,640,4,5,20,155,...,Europe,Southern Europe,Portugal,Lagos,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,1,Nature & Wildlife Areas,Friends
1,67652,14,2018,12,4,748,5,5,20,155,...,Europe,Southern Europe,Portugal,Lagos,Tegalalang Rice Terrace,"Jalan Raya Ceking, Tegalalang 80517 Indonesia",72,1,Points of Interest & Landmarks,Friends
2,68777,14,2018,12,4,748,5,5,20,155,...,Europe,Southern Europe,Portugal,Lagos,Tegalalang Rice Terrace,"Jalan Raya Ceking, Tegalalang 80517 Indonesia",72,1,Points of Interest & Landmarks,Friends
3,4735,16,2018,4,3,640,5,3,14,101,...,Asia,South East Asia,Indonesia,Jakarta,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,1,Nature & Wildlife Areas,Family
4,5318,16,2017,12,4,640,5,3,14,101,...,Asia,South East Asia,Indonesia,Jakarta,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,1,Nature & Wildlife Areas,Friends


In [25]:
Tourism_df.duplicated().sum()

np.int64(0)

In [26]:
Tourism_df.isnull().sum()

TransactionId         0
UserId                0
VisitYear             0
VisitMonth            0
VisitModeId           0
AttractionId          0
Rating                0
ContinentId           0
RegionId              0
CountryId             0
CityId                0
Continent             0
Region               23
Country               0
CityName              0
Attraction            0
AttractionAddress     0
AttractionTypeId      0
AttractionCityId      0
AttractionType        0
VisitModeName         0
dtype: int64

In [27]:
Tourism_df['Region'].fillna(Tourism_df['Region'].mode()[0],inplace=True)

print('After handling missing value')
Tourism_df.isnull().sum()

After handling missing value


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Tourism_df['Region'].fillna(Tourism_df['Region'].mode()[0],inplace=True)


TransactionId        0
UserId               0
VisitYear            0
VisitMonth           0
VisitModeId          0
AttractionId         0
Rating               0
ContinentId          0
RegionId             0
CountryId            0
CityId               0
Continent            0
Region               0
Country              0
CityName             0
Attraction           0
AttractionAddress    0
AttractionTypeId     0
AttractionCityId     0
AttractionType       0
VisitModeName        0
dtype: int64

In [28]:
# Normalize Ratings
Tourism_df["Rating"] = MinMaxScaler().fit_transform(Tourism_df[["Rating"]])

# Create User-Attraction Matrix
user_attraction_matrix = Tourism_df.pivot_table(index="UserId", columns="AttractionId", values="Rating", fill_value=0)

In [32]:
# Apply Truncated SVD for Dimensionality Reduction
n_components = min(25, user_attraction_matrix.shape[1])  # Increased components for better reconstruction
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_attraction_matrix_reduced = svd.fit_transform(user_attraction_matrix)

# Train KNN Model with Cosine Similarity
knn_model = NearestNeighbors(n_neighbors=7, metric='cosine', algorithm='auto')  # Increased neighbors for better diversity
knn_model.fit(user_attraction_matrix_reduced)

In [33]:
# Function to Recommend Attractions
def recommend_attractions(user_id, num_recommendations=5):
    if user_id not in user_attraction_matrix.index:
        return "User ID not found! Try with a different ID."

    user_idx = user_attraction_matrix.index.get_loc(user_id)
    distances, indices = knn_model.kneighbors([user_attraction_matrix_reduced[user_idx]], n_neighbors=7)
    
    similar_users = user_attraction_matrix.index[indices.flatten()[1:]]  # Exclude self

    user_ratings = user_attraction_matrix.loc[user_id]
    unseen_attractions = user_ratings[user_ratings == 0].index  # Attractions user has not rated

    attraction_scores = {}
    for sim_user, dist in zip(similar_users, distances.flatten()[1:]):  # Weighted Similarity
        for attraction in unseen_attractions:
            attraction_scores[attraction] = attraction_scores.get(attraction, 0) + (
                user_attraction_matrix.loc[sim_user, attraction] * (1 - dist)
            )

    recommended_attractions = sorted(attraction_scores, key=attraction_scores.get, reverse=True)[:num_recommendations]
    
    return recommended_attractions if recommended_attractions else "No new recommendations found."

In [34]:
# Convert Attraction ID to Names
def map_attractions(recommended_ids):
    attraction_mapping = dict(zip(Tourism_df["AttractionId"], Tourism_df["Attraction"]))
    return [attraction_mapping[att_id] for att_id in recommended_ids if att_id in attraction_mapping]


In [35]:
# Get Recommendations
user_id = 16
recommended_ids = recommend_attractions(user_id)
recommended_names = map_attractions(recommended_ids)

print(f"Recommended Attractions for User {user_id}: {recommended_names}")

Recommended Attractions for User 16: ['Tanah Lot Temple', 'Tegenungan Waterfall', 'Sanur Beach', 'Kuta Beach   Bali', 'Seminyak Beach']


In [36]:
# Evaluate SVD Reconstruction Quality
reconstructed_matrix = svd.inverse_transform(user_attraction_matrix_reduced)
original = user_attraction_matrix.to_numpy()
reconstructed = reconstructed_matrix

rmse = np.sqrt(mean_squared_error(original, reconstructed))
mse = mean_squared_error(original, reconstructed)
mae = mean_absolute_error(original, reconstructed)
r2 = r2_score(original, reconstructed)

print(f"Reconstruction RMSE: {rmse:.4f}")
print(f"Reconstruction MSE: {mse:.4f}")
print(f"Reconstruction MAE: {mae:.4f}")
print(f"Reconstruction R² Score: {r2:.4f}")

Reconstruction RMSE: 0.0107
Reconstruction MSE: 0.0001
Reconstruction MAE: 0.0004
Reconstruction R² Score: 0.8490


In [37]:
import joblib
joblib.dump(svd,"svd.plk")
joblib.dump(user_attraction_matrix_reduced,'user attraction matrix reduce.plk')
joblib.dump(knn_model,'KNN.plk')
joblib.dump(user_attraction_matrix,'use attraction matrix')
print('All models has been saved')

All models has been saved
