In [10]:
%pip install pandas scikit-learn numpy matplotlib seaborn openpyxl
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

Note: you may need to restart the kernel to use updated packages.


In [11]:
# Function to safely evaluate strings that are lists (for original data inspection)
def safely_evaluate(data):
    try:
        evaluated_data = literal_eval(data)
        if isinstance(evaluated_data, list) and all(isinstance(item, str) for item in evaluated_data):
            return ' '.join(evaluated_data)
        else:
            return ''
    except (ValueError, SyntaxError):
        return ''

# Load the datasets as excel files .This is for use in a kaggle enviorenment.
"""
places_df = pd.read_excel("/kaggle/input/travelsenseds/Places Dataset.xlsx")
preferences_df = pd.read_excel("/kaggle/input/travelsenseds/Visitors Preference Dataset.xlsx") """

#Load datasets in a local enviorenment.Please copy dataset file paths and paste here accordingly 
places_df = pd.read_excel("Places Dataset.xlsx")
preferences_df = pd.read_excel("Visitors Preference Dataset.xlsx")

# Displayed the first few rows of the places and preferences datasets to see if it was imported
print("First few rows of Places Dataset:")
print(places_df.head())

print("First few rows of Preferences Dataset:")
print(preferences_df.head())

# Visualization of original data to get an idea about what kind of output we can expect
# Visualizing the distribution of ratings in the places dataset
plt.figure(figsize=(10, 5))
sns.histplot(places_df['rating'].dropna(), bins=10, kde=True, color='blue')
plt.title('Distribution of Ratings in Places Dataset')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

# Visualizing the count of preferred activities in the preferences dataset
plt.figure(figsize=(10, 5))
preferences_df['Preferred Activities'].apply(lambda x: safely_evaluate(x)).value_counts().head(10).plot(kind='bar', color='green')
plt.title('Top 10 Preferred Activities in Preferences Dataset')
plt.xlabel('Activity')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()

ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [4]:
# Function to safely evaluate strings that are lists (for original data inspection)
def safely_evaluate(data):
    try:
        evaluated_data = literal_eval(data)
        if isinstance(evaluated_data, list) and all(isinstance(item, str) for item in evaluated_data):
            return ' '.join(evaluated_data)
        else:
            return ''
    except (ValueError, SyntaxError):
        return ''

# Application of TF-IDF vector

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Vectorize the original places data
tfidf_matrix = tfidf_vectorizer.fit_transform(places_df['latest_reviews'].apply(lambda x: safely_evaluate(x)))
# Vectorize user preferences
user_preferences_vector = tfidf_vectorizer.transform(preferences_df['Preferred Activities'].apply(lambda x: safely_evaluate(x)) + ' ' + preferences_df['Bucket list destinations Sri Lanka'].apply(lambda x: safely_evaluate(x)))
# Compute similarity scores between user preferences and places
similarity_scores = cosine_similarity(user_preferences_vector, tfidf_matrix)
# If similarity scores are calculated, add them to the DataFrame (average of all user preferences)
if not similarity_scores.size == 0:
    places_df['similarity'] = similarity_scores.mean(axis=0)
    # Sort the places DataFrame by similarity score to get recommendations
    recommended_places = places_df.sort_values(by='similarity', ascending=False)
    
    # Display the top 10 recommended places
    print(recommended_places[['name', 'similarity']].head(10))
    
    # Visualization of similarity scores
    plt.figure(figsize=(12, 6))
    sns.histplot(places_df['similarity'].dropna(), bins=30, kde=True, color='purple')
    plt.title('Distribution of Similarity Scores')
    plt.xlabel('Similarity Score')
    plt.ylabel('Frequency')
    plt.show()
    
    # Visualization of top 10 recommended places
    plt.figure(figsize=(12, 6))
    sns.barplot(data=recommended_places.head(10), x='name', y='similarity', palette='viridis')
    plt.title('Top 10 Recommended Places')
    plt.xlabel('Place Name')
    plt.ylabel('Similarity Score')
    plt.xticks(rotation=45, ha='right')
    plt.show()

else:
    print("No similarity scores were computed.")


NameError: name 'places_df' is not defined

In [5]:
# Function to get recommendations for a user by user ID to get
def get_recommendations(user_id, top_n=10):#<--------------------------------------------- Specify how many recommendations are needed by top_n
    # Check if the user exists in the dataset(max id is 1000)
    if user_id not in preferences_df['User ID'].values:
        return f"User ID {user_id} not found."
    
    # Get the index of the user
    user_index = preferences_df[preferences_df['User ID'] == user_id].index[0]
    
    # Get the user's name
    user_name = preferences_df.loc[preferences_df['User ID'] == user_id, 'Name'].values[0]

    # Get the similarity scores for that user
    user_scores = similarity_scores[user_index]
    
    # Sort places by similarity score for this user
    recommended_indices = user_scores.argsort()[-top_n:][::-1]
    
    # Get the top n recommended places
    recommended_places = places_df.iloc[recommended_indices].copy()
    
    # Add a column for the user's name in the recommendations
    recommended_places['User Name'] = user_name
    
    # Reset index for readability (1-based index)
    recommended_places.index = recommended_places.index + 1
    
    # Sort recommendations by rating in descending order
    recommendations_sorted = recommended_places.sort_values(by='rating', ascending=False)
    
    return user_name, recommendations_sorted[['name', 'formatted_address', 'rating', 'latest_reviews']]

# Function to plot the recommendations as a bar chart
def plot_recommendations(recommendations_sorted, user_name):
    plt.figure(figsize=(20, 5))
    sns.barplot(x='rating', y='name', data=recommendations_sorted, palette='viridis')
    plt.title(f"Top Recommendations for {user_name} Based on Ratings")
    plt.xlabel('Rating')
    plt.ylabel('Place')
    plt.show()

# Example: Get top recommendations for a user and plot them
user_id = 80  # Example user ID, adjust based on your data <-----------------------CHANGE THE ID HERE TO TEST THE MODEL and SEE. MAX is 1000
user_name, recommendations_sorted = get_recommendations(user_id)

print(f"Top Recommendations for {user_name}")

# Display the recommendations in a table
display(recommendations_sorted)

# Plot the recommendations for the user
plot_recommendations(recommendations_sorted.head(10), user_name)  #<---------------change the number within head(#) to change the number of datapoints in graph


NameError: name 'preferences_df' is not defined

In [6]:
#SAVING the model

# Created a dictionary to store all components together to avoide it being messy
model_components = {
    'tfidf_vectorizer': tfidf_vectorizer,
    'similarity_scores': similarity_scores,
    'places_df': places_df,
    'preferences_df': preferences_df
}

# And saved all components into a single .pkl file
with open('combined_model.pkl', 'wb') as f:
    pickle.dump(model_components, f)

print("Model components saved successfully in 'combined_model.pkl'.")


NameError: name 'similarity_scores' is not defined

In [None]:
#LOADING the model


with open('combined_model.pkl', 'rb') as f:
    model_components = pickle.load(f)

#Or can access individual components 
tfidf_vectorizer = model_components['tfidf_vectorizer']
similarity_scores = model_components['similarity_scores']
places_df = model_components['places_df']
preferences_df = model_components['preferences_df']

print("Model components loaded successfully from 'combined_model.pkl'.")
