<a href="https://colab.research.google.com/github/Marxie-ops/Recommendation-systems/blob/main/Building_a_Hybrid_Recommendation_systems_For_Travel_Agencies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Creating Synthetic Data, Data Cleaning & EDA**

## Users Data

In [None]:
import random
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, MinMaxScaler, LabelEncoder
from scipy.sparse import csr_matrix


# 1. Generate Users
import random

# 1. Generate Users with Enhanced Features
# Build all 100,000 synthetic user records in one comprehension. Every field is
# drawn independently from `random` (no seed is set, so values differ per run).
users = [
    {
        'user_id': uid,
        'age': random.randint(18, 65),
        'location': random.choice(
            ['Kisii', 'Nairobi', 'Mombasa', 'Kisumu', 'Eldoret', 'Nakuru', 'Naivasha', 'Narok']
        ),
        # 0 (rare traveler) to 1 (frequent traveler)
        'frequent_traveler_score': round(random.uniform(0, 1), 2),
        # 1 if enrolled in a loyalty program, else 0
        'loyalty_membership': random.choice([0, 1]),
        'total_bookings': random.randint(1, 10),
        'preferred_hotel_type': random.choice(
            ['Luxury', 'Budget', 'Business', 'Boutique', 'Resort']
        ),
        # -1 (negative) to 1 (positive)
        'past_review_sentiment': round(random.uniform(-1, 1), 2),
        'preferences': {
            # Four distinct amenities sampled without replacement from the five options
            'amenities': random.sample(['Pool', 'Gym', 'Free WiFi', 'Spa', 'conference'], 4)
        },
    }
    for uid in range(1, 100001)  # user ids 1..100,000 inclusive
]



In [None]:
# Turn the list of user dicts into a DataFrame (this rebinds the name `users`).
users = pd.DataFrame(users)
# Per-column count of missing values.
print(users.isna().sum())
# Preview the first rows; `preferences` is still a nested dict at this point.
users.head()

user_id                    0
age                        0
location                   0
frequent_traveler_score    0
loyalty_membership         0
total_bookings             0
preferred_hotel_type       0
past_review_sentiment      0
preferences                0
dtype: int64


Unnamed: 0,user_id,age,location,frequent_traveler_score,loyalty_membership,total_bookings,preferred_hotel_type,past_review_sentiment,preferences
0,1,43,Eldoret,0.17,1,5,Budget,-0.17,"{'amenities': ['conference', 'Pool', 'Free WiF..."
1,2,56,Eldoret,0.59,0,8,Luxury,0.74,"{'amenities': ['Free WiFi', 'conference', 'Spa..."
2,3,63,Eldoret,0.43,1,5,Luxury,0.49,"{'amenities': ['Gym', 'Pool', 'Spa', 'conferen..."
3,4,39,Eldoret,0.15,1,2,Boutique,-0.49,"{'amenities': ['conference', 'Spa', 'Free WiFi..."
4,5,60,Naivasha,0.9,1,3,Budget,0.95,"{'amenities': ['Pool', 'conference', 'Free WiF..."


## ***Data Cleaning & EDA for Users Data***



In [None]:
# Flatten the nested `preferences` dict into a plain `amenities` column and
# drop the original. The guard makes the cell idempotent: once `preferences`
# has been removed, running the cell again is a no-op instead of a KeyError.
if 'preferences' in users.columns:
    users = (
        users
        .assign(amenities=users['preferences'].map(lambda prefs: prefs['amenities']))
        .drop(columns='preferences')
    )

In [None]:
# Preview the users table after `preferences` was flattened into `amenities`.
users.head()

Unnamed: 0,user_id,age,location,frequent_traveler_score,loyalty_membership,total_bookings,preferred_hotel_type,past_review_sentiment,amenities
0,1,41,Kisii,0.32,1,6,Boutique,-0.67,"[Pool, conference, Spa, Gym]"
1,2,34,Eldoret,0.78,0,2,Budget,-0.26,"[Free WiFi, conference, Gym, Spa]"
2,3,34,Nairobi,0.72,0,4,Boutique,0.1,"[Spa, Pool, Gym, conference]"
3,4,31,Eldoret,0.68,1,5,Luxury,-0.46,"[Pool, conference, Gym, Free WiFi]"
4,5,36,Kisii,0.88,1,5,Budget,-0.53,"[Gym, conference, Spa, Pool]"


In [None]:
# Store each amenities list as its string representation.
users = users.astype({'amenities': str})

In [None]:
# Preview the users table now that `amenities` holds strings rather than lists.
users.head()

Unnamed: 0,user_id,age,location,frequent_traveler_score,loyalty_membership,total_bookings,preferred_hotel_type,past_review_sentiment,amenities
0,1,64,Mombasa,0.96,1,2,Budget,0.09,"['Free WiFi', 'Gym', 'Pool', 'Spa']"
1,2,53,Nairobi,0.13,0,5,Luxury,0.53,"['Spa', 'Pool', 'Gym', 'Free WiFi']"
2,3,65,Nakuru,0.68,1,3,Budget,0.53,"['Spa', 'conference', 'Gym', 'Pool']"
3,4,52,Nakuru,0.22,1,2,Boutique,0.79,"['conference', 'Free WiFi', 'Pool', 'Gym']"
4,5,48,Nakuru,0.92,0,3,Business,0.54,"['Pool', 'Spa', 'Gym', 'conference']"


## ***Save Synthesized Users Data to a CSV***

In [None]:
# Persist the synthetic users table as UTF-8 CSV without the index column.
# The path is handed to pandas directly: a handle from open(..., 'w') without
# newline='' makes pandas' own line terminators get translated a second time on
# Windows, producing doubled line endings.
users.to_csv('users.csv', index=False, encoding='utf-8')

## Hotels Data

In [None]:
# Predefined hotel data
import random
import pandas as pd

# Generate Hotel Data
# Value pools that each synthetic hotel draws from.
locations = ['Kisii', 'Nairobi', 'Mombasa', 'Kisumu', 'Eldoret', 'Nakuru', 'Naivasha', 'Narok']
amenities_list = ['Pool', 'Gym', 'Free WiFi', 'Spa', 'Bar', 'Restaurant', 'Parking', 'Conference Room']
room_types = ['Standard', 'Deluxe', 'Suite']
cancellation_policies = ['Flexible', 'Moderate', 'Strict']
# 'HOTEL A' .. 'HOTEL U' (21 names shared among all hotels, so names repeat).
hotel_names = [f'HOTEL {letter}' for letter in 'ABCDEFGHIJKLMNOPQRSTU']

# 2,500 hotel records; every field is an independent random draw.
hotels = [
    {
        'hotel_id': hid,
        'name': random.choice(hotel_names),
        'location': random.choice(locations),
        # Prices between $30 and $500
        'price_per_night': round(random.uniform(30, 500), 2),
        # Between 3 and 6 distinct amenities
        'amenities': random.sample(amenities_list, random.randint(3, 6)),
        # Ratings from 2.5 to 5.0
        'rating': round(random.uniform(2.5, 5.0), 1),
        # Distance in km
        'distance_from_city_center': round(random.uniform(0.5, 15), 1),
        'cancellation_policy': random.choice(cancellation_policies),
        # Simulated hotel occupancy %
        'occupancy_rate': round(random.uniform(30, 100), 1),
        # Rooms still available
        'available_rooms': random.randint(5, 100),
        'room_type': random.choice(room_types),
        # Number of customer reviews
        'customer_reviews_count': random.randint(10, 5000),
        # Availability status
        'available_for_booking': random.choice([True, False]),
        # Family-friendly indicator
        'family_friendly': random.choice(['Yes', 'No']),
    }
    for hid in range(1, 2501)
]

# One row per hotel, columns in the order the dict keys were written.
hotels_df = pd.DataFrame(hotels)

# Display first few rows
hotels_df.head()



Unnamed: 0,hotel_id,name,location,price_per_night,amenities,rating,distance_from_city_center,cancellation_policy,occupancy_rate,available_rooms,room_type,customer_reviews_count,available_for_booking,family_friendly
0,1,HOTEL I,Nairobi,118.41,"[Parking, Conference Room, Bar, Gym]",3.7,9.5,Moderate,74.0,35,Deluxe,822,False,Yes
1,2,HOTEL T,Kisii,128.29,"[Conference Room, Restaurant, Spa, Parking]",4.0,1.5,Strict,79.6,87,Suite,2112,True,No
2,3,HOTEL O,Nairobi,117.76,"[Restaurant, Conference Room, Bar, Parking, Po...",3.2,11.6,Strict,65.0,53,Standard,4132,False,Yes
3,4,HOTEL H,Eldoret,432.39,"[Bar, Conference Room, Free WiFi, Gym, Parking...",3.3,12.2,Moderate,60.0,72,Deluxe,3247,True,Yes
4,5,HOTEL G,Eldoret,221.18,"[Free WiFi, Gym, Conference Room, Parking, Spa...",3.3,13.2,Flexible,54.3,54,Deluxe,1486,False,No


In [None]:
# (rows, columns) of the hotels table.
hotels_df.shape

(2500, 14)

In [None]:
# Count of missing values in each hotel column.
hotels_df.isna().sum()

Unnamed: 0,0
hotel_id,0
name,0
location,0
price_per_night,0
amenities,0
rating,0
distance_from_city_center,0
cancellation_policy,0
occupancy_rate,0
available_rooms,0


In [None]:
# Convert the amenities lists to their string representation.
hotels_df = hotels_df.astype({'amenities': str})

## ***Save the hotel synthesized Data to a csv***

In [None]:
# Persist the synthetic hotels table as UTF-8 CSV without the index column.
# The path is handed to pandas directly: a handle from open(..., 'w') without
# newline='' makes pandas' own line terminators get translated a second time on
# Windows, producing doubled line endings.
hotels_df.to_csv('hotels.csv', index=False, encoding='utf-8')

## **Interactions between the Users & Hotels Data**

In [None]:
!pip install pyspark findspark



In [None]:
import findspark
# Call findspark.init() for its side effect only; the cell previously printed
# its return value, which showed up as a meaningless "None".
findspark.init()

None


## ***Pyspark implemented for Interactions Dataset***

In [None]:
import torch

# Select the torch device once: CUDA when PyTorch can see a GPU, otherwise CPU.
# The status markers are written as proper Unicode emoji (the previous literals
# were mis-encoded byte sequences).
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("✅ Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("❌ No GPU found, using CPU")


‚ùå No GPU found, using CPU


## ***🚀 Approach Overview***
**1. Content-Based Filtering (CBF):**

Uses hotel features (e.g., location, price, amenities) to recommend similar hotels.
Uses TF-IDF & Cosine Similarity.

**2. Collaborative Filtering (CF):**

Uses user interactions (ratings) to recommend hotels.
Uses ALS (Alternating Least Squares), a matrix-factorization approach.

**3. Hybrid Approach:**

Combines CBF + CF Scores using a weighted approach.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Preview the hotels table before the content features are built.
hotels_df.head()

Unnamed: 0,hotel_id,name,location,price_per_night,amenities,rating,distance_from_city_center,cancellation_policy,occupancy_rate,available_rooms,room_type,customer_reviews_count,available_for_booking,family_friendly
0,1,HOTEL I,Nairobi,118.41,"['Parking', 'Conference Room', 'Bar', 'Gym']",3.7,9.5,Moderate,74.0,35,Deluxe,822,False,Yes
1,2,HOTEL T,Kisii,128.29,"['Conference Room', 'Restaurant', 'Spa', 'Park...",4.0,1.5,Strict,79.6,87,Suite,2112,True,No
2,3,HOTEL O,Nairobi,117.76,"['Restaurant', 'Conference Room', 'Bar', 'Park...",3.2,11.6,Strict,65.0,53,Standard,4132,False,Yes
3,4,HOTEL H,Eldoret,432.39,"['Bar', 'Conference Room', 'Free WiFi', 'Gym',...",3.3,12.2,Moderate,60.0,72,Deluxe,3247,True,Yes
4,5,HOTEL G,Eldoret,221.18,"['Free WiFi', 'Gym', 'Conference Room', 'Parki...",3.3,13.2,Flexible,54.3,54,Deluxe,1486,False,No


## ***1. Content Based Filtering Using TF-IDF & COSINE SIMILARITY***

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# **1. Process Text Features (TF-IDF)**
# Join the text fields with single spaces so each value stays its own token;
# gluing them together fuses neighbours into tokens such as "inairobi".
# The space inside the hotel name becomes "_" so "HOTEL I" survives as the token
# "hotel_i" (TfidfVectorizer's default pattern drops one-character tokens).
# Each field appears exactly once, so no field is implicitly up-weighted.
text_columns = ["location", "amenities", "room_type", "cancellation_policy"]
hotels_df["combined_features"] = (
    hotels_df["name"]
    .str.replace(" ", "_", regex=False)
    .str.cat(hotels_df[text_columns].astype(str), sep=" ")
)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(hotels_df["combined_features"])

# **2. Process Boolean Features**
hotels_df["available_for_booking"] = hotels_df["available_for_booking"].astype(int)

# **3. Encode Categorical Features**
# 1 and 0 map to themselves so re-running the cell keeps the encoded values
# instead of turning the whole column into NaN.
hotels_df["family_friendly"] = hotels_df["family_friendly"].map({"Yes": 1, "No": 0, 1: 1, 0: 0})

# **4. Normalize Numerical Features**
# Min-max scale every numeric/flag column into [0, 1].
scaler = MinMaxScaler()
numeric_features = ["price_per_night", "rating", "distance_from_city_center",
                    "occupancy_rate", "available_rooms", "available_for_booking", "family_friendly"]
scaled_numerical = scaler.fit_transform(hotels_df[numeric_features])

# **5. Combine All Features**
# Dense matrix: TF-IDF columns first, then the scaled numeric columns.
final_features = np.hstack((tfidf_matrix.toarray(), scaled_numerical))


# **6. Compute Cosine Similarity**
# Square matrix: entry [i, j] is the similarity between rows i and j of hotels_df.
cosine_sim = cosine_similarity(final_features, final_features)


# **7. Recommendation Function**
def recommend_hotels(hotel_id, top_n=3):
    """
    Recommend the hotels most similar to a given hotel by cosine similarity.

    Reads the module-level ``hotels_df`` and ``cosine_sim``; row ``i`` of
    ``cosine_sim`` is taken to describe the ``i``-th row of ``hotels_df``.

    Parameters:
    hotel_id (int): The hotel ID for which recommendations are needed.
    top_n (int): Number of recommendations to return. Values <= 0 give [].

    Returns:
    list: Names of the top-N most similar hotels, best match first. The query
          hotel itself is never included. Several hotels can share a name, so
          the list may contain repeated names.

    Raises:
    IndexError: If ``hotel_id`` is not present in ``hotels_df``.
    """
    # Positional (not label-based) row of the query hotel, so the lookup stays
    # aligned with cosine_sim even if hotels_df does not have a 0..n-1 index.
    matches = (hotels_df["hotel_id"] == hotel_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"hotel_id {hotel_id!r} not found in hotels_df")
    query_pos = int(matches[0])

    scores = cosine_sim[query_pos]
    # Drop the query hotel explicitly instead of assuming it sorts first: another
    # hotel can tie with it at the top score and would otherwise push it into
    # the results.
    candidates = (pos for pos in range(len(scores)) if pos != query_pos)
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    # max(..., 0) keeps a negative top_n from slicing from the end of the list.
    return [hotels_df.iloc[pos]["name"] for pos in ranked[:max(top_n, 0)]]

# Example: names of the hotels most similar to hotel_id 1 (default top_n=3)
print(recommend_hotels(1))

['HOTEL F', 'HOTEL O', 'HOTEL I']


In [None]:
# Recommendations for hotel ids 1 through 5, one list per line.
print(*(recommend_hotels(hid) for hid in range(1, 6)), sep="\n")

['HOTEL F', 'HOTEL O', 'HOTEL I']
['HOTEL Q', 'HOTEL R', 'HOTEL T']
['HOTEL O', 'HOTEL O', 'HOTEL O']
['HOTEL B', 'HOTEL H', 'HOTEL L']
['HOTEL G', 'HOTEL G', 'HOTEL G']


In [None]:
# Recommendations for hotel_id 5 on its own.
print(recommend_hotels(5))

['HOTEL G', 'HOTEL G', 'HOTEL G']


In [None]:
# Second row of the hotels table (not used further in this cell).
hotel_id_2 = hotels_df.iloc[1, :]
# Rows whose name is one of the hotels of interest.
filtered_hotels = hotels_df.loc[
    hotels_df['name'].isin(['HOTEL U', 'HOTEL M', 'HOTEL J', 'HOTEL U'])
]
df = pd.DataFrame(filtered_hotels)
print(df.head(5))

    hotel_id     name  location  price_per_night  \
15        16  HOTEL U     Kisii            46.47   
26        27  HOTEL U   Eldoret           218.15   
35        36  HOTEL M    Kisumu           262.74   
38        39  HOTEL J  Naivasha           403.52   
40        41  HOTEL J     Narok           207.32   

                                            amenities  rating  \
15                 ['Spa', 'Conference Room', 'Pool']     4.1   
26  ['Bar', 'Gym', 'Spa', 'Restaurant', 'Pool', 'P...     3.9   
35  ['Bar', 'Parking', 'Restaurant', 'Pool', 'Conf...     4.7   
38  ['Pool', 'Gym', 'Parking', 'Free WiFi', 'Spa',...     4.8   
40                             ['Pool', 'Bar', 'Gym']     4.8   

    distance_from_city_center cancellation_policy  occupancy_rate  \
15                        1.9            Moderate            50.7   
26                       10.6              Strict            70.4   
35                        6.5            Flexible            56.0   
38                  

In [None]:
# Hotels whose rating is 4.0 or higher; show id and name for the first ten.
relevant_hotels = hotels_df.loc[hotels_df['rating'].ge(4)]
relevant_hotels.loc[:, ['hotel_id', 'name']].head(10)

Unnamed: 0,hotel_id,name
1,2,HOTEL T
12,13,HOTEL E
13,14,HOTEL D
15,16,HOTEL U
17,18,HOTEL N
18,19,HOTEL P
19,20,HOTEL O
21,22,HOTEL N
29,30,HOTEL R
30,31,HOTEL Q


In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Hybrid Recommenders").getOrCreate()

# Step 3: Convert the cosine-similarity matrix to a Spark DataFrame with one
# (hotel_id_1, hotel_id_2, cosine_score) row per ordered pair of hotels.
# The ids are pulled out once as native Python ints and each matrix row is
# converted once with .tolist() (native floats), so no pandas row lookup
# happens inside the n*n loop.
hotel_ids = hotels_df["hotel_id"].astype(int).tolist()
cosine_sim_list = [
    (id_1, id_2, score)
    for id_1, sim_row in zip(hotel_ids, cosine_sim)
    for id_2, score in zip(hotel_ids, sim_row.tolist())
]

schema = StructType([
    StructField("hotel_id_1", IntegerType(), True),
    StructField("hotel_id_2", IntegerType(), True),
    StructField("cosine_score", FloatType(), True)
])

cosine_similarity_df = spark.createDataFrame(cosine_sim_list, schema=schema)

## ***2. Collaborative Filtering Using ALS (Alternating Least Squares)***

In [None]:
# Preview the pandas users table before moving to Spark.
users.head()

Unnamed: 0,user_id,age,location,frequent_traveler_score,loyalty_membership,total_bookings,preferred_hotel_type,past_review_sentiment,amenities
0,1,43,Eldoret,0.17,1,5,Budget,-0.17,"['conference', 'Pool', 'Free WiFi', 'Gym']"
1,2,56,Eldoret,0.59,0,8,Luxury,0.74,"['Free WiFi', 'conference', 'Spa', 'Pool']"
2,3,63,Eldoret,0.43,1,5,Luxury,0.49,"['Gym', 'Pool', 'Spa', 'conference']"
3,4,39,Eldoret,0.15,1,2,Boutique,-0.49,"['conference', 'Spa', 'Free WiFi', 'Gym']"
4,5,60,Naivasha,0.9,1,3,Budget,0.95,"['Pool', 'conference', 'Free WiFi', 'Gym']"


In [None]:
# Preview the pandas hotels table, now carrying the encoded flags and `combined_features`.
hotels_df.head()

Unnamed: 0,hotel_id,name,location,price_per_night,amenities,rating,distance_from_city_center,cancellation_policy,occupancy_rate,available_rooms,room_type,customer_reviews_count,available_for_booking,family_friendly,combined_features
0,1,HOTEL I,Nairobi,118.41,"['Parking', 'Conference Room', 'Bar', 'Gym']",3.7,9.5,Moderate,74.0,35,Deluxe,822,0,1,"HOTEL INairobi['Parking', 'Conference Room', '..."
1,2,HOTEL T,Kisii,128.29,"['Conference Room', 'Restaurant', 'Spa', 'Park...",4.0,1.5,Strict,79.6,87,Suite,2112,1,0,"HOTEL TKisii['Conference Room', 'Restaurant', ..."
2,3,HOTEL O,Nairobi,117.76,"['Restaurant', 'Conference Room', 'Bar', 'Park...",3.2,11.6,Strict,65.0,53,Standard,4132,0,1,"HOTEL ONairobi['Restaurant', 'Conference Room'..."
3,4,HOTEL H,Eldoret,432.39,"['Bar', 'Conference Room', 'Free WiFi', 'Gym',...",3.3,12.2,Moderate,60.0,72,Deluxe,3247,1,1,"HOTEL HEldoret['Bar', 'Conference Room', 'Free..."
4,5,HOTEL G,Eldoret,221.18,"['Free WiFi', 'Gym', 'Conference Room', 'Parki...",3.3,13.2,Flexible,54.3,54,Deluxe,1486,0,0,"HOTEL GEldoret['Free WiFi', 'Gym', 'Conference..."


## ***Handling Big Data Using Pyspark***

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Load both CSV exports into Spark using the session created earlier in the
# notebook. The header row supplies the column names and types are inferred.
users_df = spark.read.options(header=True, inferSchema=True).csv("users.csv")
hotels_df1 = spark.read.options(header=True, inferSchema=True).csv("hotels.csv")

In [None]:
# Take a roughly 30% random sample of the hotels (fixed seed for reproducibility)
# before joining, which keeps the joined table smaller.
hotels_sampled_df = hotels_df1.sample(fraction=0.3, seed=42)

# Inner join on `location`: every user is paired with every sampled hotel in the
# same location. Both inputs have an `amenities` column, so the result carries
# two columns with that name.
merged_df = users_df.join(hotels_sampled_df, on="location", how="inner")

# Print the first rows of the joined table.
merged_df.show()

+--------+-------+---+-----------------------+------------------+--------------+--------------------+---------------------+--------------------+--------+-------+---------------+--------------------+------+-------------------------+-------------------+--------------+---------------+---------+----------------------+---------------------+---------------+
|location|user_id|age|frequent_traveler_score|loyalty_membership|total_bookings|preferred_hotel_type|past_review_sentiment|           amenities|hotel_id|   name|price_per_night|           amenities|rating|distance_from_city_center|cancellation_policy|occupancy_rate|available_rooms|room_type|customer_reviews_count|available_for_booking|family_friendly|
+--------+-------+---+-----------------------+------------------+--------------+--------------------+---------------------+--------------------+--------+-------+---------------+--------------------+------+-------------------------+-------------------+--------------+---------------+---------+

In [None]:
# Keep only the three columns ALS needs. `rating` comes from the hotels table
# (the users table has no rating column), so it is the hotel's own rating rather
# than a score given by the individual user.
interaction_df = merged_df.select(["user_id", "hotel_id", "rating"])
interaction_df.show()

+-------+--------+------+
|user_id|hotel_id|rating|
+-------+--------+------+
|      1|    2482|   3.4|
|      1|    2465|   3.7|
|      1|    2431|   4.4|
|      1|    2399|   3.7|
|      1|    2398|   4.5|
|      1|    2361|   2.7|
|      1|    2292|   4.3|
|      1|    2167|   2.9|
|      1|    2147|   3.8|
|      1|    2142|   3.6|
|      1|    2132|   4.2|
|      1|    2129|   4.4|
|      1|    2076|   4.4|
|      1|    2063|   3.3|
|      1|    2029|   4.4|
|      1|    2015|   4.8|
|      1|    1994|   4.8|
|      1|    1985|   4.7|
|      1|    1888|   4.2|
|      1|    1886|   2.6|
+-------+--------+------+
only showing top 20 rows



In [None]:
# Report the size of the interactions table: row count via count(), column
# count from the list of column names.
num_rows, num_cols = interaction_df.count(), len(interaction_df.columns)
print(f"Number of rows: {num_rows}, Number of columns: {num_cols}")

Number of rows: 9638384, Number of columns: 3


## ***Using Spark ALS - Alternating Least Squares***

In [None]:
from pyspark.ml.recommendation import ALS
# Explicit-feedback ALS estimator over the (user_id, hotel_id, rating) triples.
als = ALS(
    maxIter=10,                # Number of iterations
    regParam=0.1,              # Regularization parameter
    userCol="user_id",         # Column for users
    itemCol="hotel_id",        # Column for hotels
    ratingCol="rating",        # Explicit ratings
    coldStartStrategy="drop",  # Handle new users/items
    seed=42,                   # Fixed seed so repeated fits are reproducible (same seed as the hotel sample)
)

In [None]:
# Train ALS on the interactions table.
model = als.fit(interaction_df)

# Top 10 hotels per user; each row holds a user_id and a list of
# (hotel_id, predicted rating) entries.
user_recommendations = model.recommendForAllUsers(numItems=10)

# Show five users with the full, untruncated recommendation lists.
user_recommendations.show(n=5, truncate=False)

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                      |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1      |[{538, 4.669151}, {290, 4.669151}, {2015, 4.573862}, {1994, 4.573862}, {713, 4.573862}, {197, 4.573862}, {1985, 4.478573}, {1600, 4.478573}, {1335, 4.478573}, {1175, 4.478573}]     |
|3      |[{538, 4.669151}, {290, 4.669151}, {2015, 4.573862}, {1994, 4.573862}, {713, 4.573862}, {197, 4.573862}, {1985, 4.478573}, {1600, 4.478573}, {1335, 4.478573}, {1175, 4.478573}]     |
|5      |[{1870, 4.663618}, {1015, 4.663

## ***Hybrid Recommendation system ALS+Cosine similarity***

In [None]:
from pyspark.sql.functions import col, explode

# Flatten the per-user recommendation lists: explode to one row per
# (user, recommended hotel), then pull hotel_id and the predicted rating
# (renamed als_score) out of the nested struct.
als_recommendations = (
    user_recommendations
    .select("user_id", explode("recommendations").alias("recommendation"))
    .select(
        col("user_id"),
        col("recommendation.hotel_id").alias("hotel_id"),
        col("recommendation.rating").alias("als_score"),
    )
)

# Show structure
als_recommendations.show(5)

+-------+--------+---------+
|user_id|hotel_id|als_score|
+-------+--------+---------+
|      1|     538| 4.669151|
|      1|     290| 4.669151|
|      1|    2015| 4.573862|
|      1|    1994| 4.573862|
|      1|     713| 4.573862|
+-------+--------+---------+
only showing top 5 rows



In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

from pyspark.sql.functions import col, lit

def hybrid_recommendations(user_id, alpha=0.7, beta=0.3, top_n=3):
    """
    Generate hybrid recommendations by combining ALS (collaborative filtering) & Cosine Similarity (content-based).

    The candidates are the hotels ALS recommended for the user (read from the
    module-level ``als_recommendations``). Each candidate's content score is its
    mean cosine similarity to the *other* candidates (read from the module-level
    ``cosine_similarity_df``), so the result has exactly one row per candidate
    and a hotel's similarity to itself (always 1.0) does not inflate its score.

    Parameters:
    user_id (int): ID of the user to recommend hotels for.
    alpha (float): Weight for ALS scores.
    beta (float): Weight for cosine similarity scores.
    top_n (int): Number of recommendations to return.

    Returns:
    Spark DataFrame: Columns hotel_id, als_score, cosine_score, hybrid_score;
    at most ``top_n`` rows, highest hybrid_score first.

    Note:
    als_score and cosine_score are combined as-is, without rescaling either one.
    """
    # Imported locally so the function does not depend on extra notebook-level imports.
    from pyspark.sql import functions as F

    # Step 1: ALS candidates for this user (one row per recommended hotel).
    als_recs = als_recommendations.filter(col("user_id") == user_id).select("hotel_id", "als_score")

    # Step 2: content score per candidate. Restrict the similarity pairs to those
    # whose first hotel is a candidate, drop self-pairs, and average per second
    # hotel so each hotel ends up with a single cosine_score.
    content_recs = (
        cosine_similarity_df.alias("cosine")
        .join(
            als_recs.alias("als"),
            col("cosine.hotel_id_1") == col("als.hotel_id"),
            "inner",
        )
        .filter(col("cosine.hotel_id_1") != col("cosine.hotel_id_2"))
        .select(
            col("cosine.hotel_id_2").alias("hotel_id"),
            col("cosine.cosine_score").alias("cosine_score"),
        )
        .groupBy("hotel_id")
        .agg(F.avg("cosine_score").alias("cosine_score"))
    )

    # Step 3: attach the content score to every ALS candidate; a candidate with no
    # content score (e.g. it is the only candidate) gets 0.
    hybrid_df = als_recs.join(content_recs, on="hotel_id", how="left").fillna(0, subset=["cosine_score"])

    # Step 4: weighted sum of the two scores.
    hybrid_df = hybrid_df.withColumn(
        "hybrid_score",
        (col("als_score") * lit(alpha)) + (col("cosine_score") * lit(beta)),
    )

    # Step 5: top-N by hybrid score; hotel_id breaks ties so the order is deterministic.
    hybrid_df = hybrid_df.orderBy(col("hybrid_score").desc(), col("hotel_id").asc()).limit(top_n)

    return hybrid_df

# Example: hybrid recommendations for user 1 with the default weights
# (alpha=0.7, beta=0.3) and top_n=3
hybrid_recommendations(user_id=1).show()


+--------+---------+------------+------------------+
|hotel_id|als_score|cosine_score|      hybrid_score|
+--------+---------+------------+------------------+
|     290| 4.669151|         1.0|3.5684055805206296|
|     538| 4.669151|         1.0|3.5684055805206296|
|     290| 4.669151|  0.79106534|3.5057251811027528|
+--------+---------+------------+------------------+

