In [6]:
import pandas as pd

# Build one summary row per user: the item ids listed for that user plus the
# distinct room types and amenities found on the matching rows of the input CSV.
input_file = "Group4_Part1_preprocessed.csv"
output_file = "user_roomtypes_amenities.csv"

# Maps each user id to the item ids that are summarised for that user.
user_items = {
    1: [21, 70, 160, 440],
    2: [444, 579, 770],
    3: [751, 771, 779],
    4: [45, 84, 155],
    5: [81, 124, 702]
}

df = pd.read_csv(input_file)

results = []
for user_id, items in user_items.items():
    roomtypes = set()
    amenities = set()

    # One lookup per user (instead of one boolean mask per item); the results
    # are collected into sets, so row order does not matter.
    matches = df[df["itemid"].isin(items)]
    for _, row in matches.iterrows():
        roomtype = row["roomtype_tokens_str"]
        # Skip missing cells: str(NaN) would otherwise add the literal "nan".
        if pd.notna(roomtype):
            roomtype = str(roomtype).strip()
            if roomtype:
                roomtypes.add(roomtype)

        vector = row["amenity_vector"]
        if pd.notna(vector):
            for raw in str(vector).strip().strip("{}").split(","):
                # Drop surrounding blanks and quotes, then keep the token only
                # if something is left (e.g. "''" must not yield an empty entry).
                name = raw.strip().strip(" '\"")
                if name:
                    amenities.add(name)

    results.append({
        "userID": user_id,
        "Itemids": ", ".join(map(str, items)),
        "roomtypes": ", ".join(sorted(roomtypes)),
        "amenities": ", ".join(sorted(amenities))
    })

pd.DataFrame(results).to_csv(output_file, index=False)
print(f"Output saved to {output_file}")

Output saved to user_roomtypes_amenities.csv


In [17]:
# Let's implement a robust version that matches the actual column names and formats.

import pandas as pd

import re

from collections import defaultdict
 
# Load the per-user summary first, then the preprocessed hotel table.
user_df, hotel_df = [
    pd.read_csv(csv_path)
    for csv_path in ("user_roomtypes_amenities.csv", "Group4_Part1_preprocessed.csv")
]
 
def parse_itemid_list(x):
    """Turn a comma/whitespace separated cell into a list of integer item ids.

    Missing values give an empty list. Pieces that are not made up purely of
    digits are dropped; the remaining ones keep their original order.
    """
    if pd.isna(x):
        return []
    item_ids = []
    for piece in re.split(r"[,\s]+", str(x).strip()):
        if piece.isdigit():
            item_ids.append(int(piece))
    return item_ids
 
def parse_roomtype_tokens(s):
    """Split a room-type cell into a set of lower-cased, trimmed tokens.

    Tokens are separated by runs of ',', '|', ';' or '/'. Missing values and
    blank tokens contribute nothing.
    """
    if pd.isna(s):
        return set()
    tokens = set()
    for chunk in re.split(r"[,\|;/]+", str(s).lower()):
        cleaned = chunk.strip()
        if cleaned:
            tokens.add(cleaned)
    return tokens
 
def parse_amenities_vector(s):
    """Parse an amenity cell such as "{'wifi', 'pool'}" into a set of names.

    A single pair of enclosing curly braces is removed when present, the rest
    is split on commas, and every piece is trimmed, unquoted and lower-cased.
    Missing values and empty pieces contribute nothing.
    """
    if pd.isna(s):
        return set()
    text = str(s).strip()
    wrapped_in_braces = text.startswith("{") and text.endswith("}")
    if wrapped_in_braces:
        text = text[1:-1]

    amenities = set()
    for field in text.split(","):
        name = field.strip().strip("'").strip('"').lower()
        if name:
            amenities.add(name)
    return amenities
 
# Normalize hotel features: parse the raw room-type and amenity columns into
# sets and combine them into one feature set per row.

hotel_df = hotel_df.copy()
hotel_df["roomtype_set"] = hotel_df["roomtype_tokens_str"].apply(parse_roomtype_tokens)
hotel_df["amenity_set"] = hotel_df["amenity_vector"].apply(parse_amenities_vector)
# Union the two sets row by row with a plain zip; this avoids depending on how
# DataFrame.apply(axis=1) infers the result shape for set-valued results.
hotel_df["feature_set"] = [
    roomtype_set | amenity_set
    for roomtype_set, amenity_set in zip(hotel_df["roomtype_set"], hotel_df["amenity_set"])
]
hotel_df.to_csv("newClean_hotel.csv")


# For fast lookup by itemid

item_to_features = dict(zip(hotel_df["itemid"].astype(int), hotel_df["feature_set"]))
item_to_hotelid = dict(zip(hotel_df["itemid"].astype(int), hotel_df["hotelid"].astype(int)))

# --- Prepare users ---

users = user_df["userID"].tolist()[:5]  # ensure 5 columns

# The cell that writes user_roomtypes_amenities.csv names the item column
# "Itemids", whereas this cell used to read "itemid" only and would fail with
# a KeyError on a fresh top-to-bottom run. Accept either spelling.
itemid_col = next((name for name in ("itemid", "Itemids") if name in user_df.columns), None)
if itemid_col is None:
    raise KeyError("user_df needs an 'itemid' or 'Itemids' column")

user_to_items = {
    uid: parse_itemid_list(raw_items)
    for uid, raw_items in zip(user_df["userID"].tolist(), user_df[itemid_col].tolist())
}

# user id -> {item id -> feature set} for the user's items found in hotel_df.
# Kept as a defaultdict so that looking up a user without any known item
# yields an empty mapping.
user_to_item_feature_sets = defaultdict(dict)
# user id -> set of hotel ids owning at least one of the user's known items.
user_to_visited_hotelids = {}

for uid, items in user_to_items.items():
    visited_hotelids = set()
    for it in items:
        # Item ids that do not occur in hotel_df are ignored.
        if it in item_to_features:
            user_to_item_feature_sets[uid][it] = item_to_features[it]
            visited_hotelids.add(item_to_hotelid[it])

    user_to_visited_hotelids[uid] = visited_hotelids
 


def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    When both sets are empty the result is defined as 0.0.
    """
    if a or b:
        return len(a & b) / len(a | b)
    return 0.0
 


# One row per hotel item, one column per user. Each cell ends up holding either
# the string "Visited" or the best Jaccard similarity (rounded to 3 decimals)
# between that item and any item the user is already linked to.
rows = hotel_df["itemid"].astype(int).tolist()
cols = users
matrix = pd.DataFrame(index=rows, columns=cols)

for uid in users:
    visited_items = set(user_to_items.get(uid, []))
    visited_hotels = user_to_visited_hotelids.get(uid, set())

    # Walk every hotel room item.
    for raw_item, raw_hotel, hotel_set in zip(
        hotel_df["itemid"], hotel_df["hotelid"], hotel_df["feature_set"]
    ):
        item_id = int(raw_item)
        hotel_id = int(raw_hotel)

        # Any room of a hotel the user has been to counts as visited.
        already_seen = item_id in visited_items or hotel_id in visited_hotels
        if already_seen:
            cell = "Visited"
        else:
            # Compare against every feature set of the user's items; keep the maximum.
            sims = [jaccard(hotel_set, fset) for fset in user_to_item_feature_sets[uid].values()]
            cell = round(max(sims), 3) if sims else 0.0

        matrix.loc[item_id, uid] = cell

# Persist the matrix and report where it went.
matrix.to_csv("user_hotel_similarity.csv")
print("Saved similarity matrix to user_hotel_similarity.csv")

 

Saved similarity matrix to user_hotel_similarity.csv


In [3]:
import pandas as pd
 
# Load the per-user summary first, then the preprocessed hotel table.
user_df, hotel_df = [
    pd.read_csv(csv_path)
    for csv_path in ("user_roomtypes_amenities.csv", "Group4_Part1_preprocessed.csv")
]
 

def parse_itemid_list(x):
    """Turn a comma-separated cell into a list of integer item ids.

    Missing values give an empty list. Each comma-separated piece is trimmed
    and kept only when it consists purely of digits.
    """
    if pd.isna(x):
        return []
    item_ids = []
    for piece in str(x).split(","):
        trimmed = piece.strip()
        if trimmed.isdigit():
            item_ids.append(int(trimmed))
    return item_ids
 
def parse_roomtype_tokens(s):
    """Split a room-type cell into a set of lower-cased, trimmed tokens.

    ';' and '|' are treated like ','. Missing values and blank tokens
    contribute nothing.
    """
    if pd.isna(s):
        return set()
    unified = str(s).replace(";", ",").replace("|", ",")
    tokens = set()
    for piece in unified.split(","):
        trimmed = piece.strip()
        if trimmed:
            tokens.add(trimmed.lower())
    return tokens
 
def parse_amenities_vector(s):
    """Parse an amenity cell such as "{'wifi', 'pool'}" into a set of names.

    Enclosing curly braces are dropped when both are present; the remainder is
    split on commas and each piece is trimmed, unquoted and lower-cased.
    Missing values and empty pieces contribute nothing.
    """
    if pd.isna(s):
        return set()
    body = str(s).strip()
    if body.startswith("{") and body.endswith("}"):
        # Remove the curly braces.
        body = body[1:-1]
    cleaned = (piece.strip().strip("'").strip('"').lower() for piece in body.split(","))
    return {name for name in cleaned if name}


# Parse the raw room-type and amenity columns into sets and combine them into
# one feature set per hotel row.
hotel_df = hotel_df.copy()
hotel_df["roomtype_set"] = hotel_df["roomtype_tokens_str"].apply(parse_roomtype_tokens)
hotel_df["amenity_set"] = hotel_df["amenity_vector"].apply(parse_amenities_vector)
# Union the two sets row by row with a plain zip; this avoids depending on how
# DataFrame.apply(axis=1) infers the result shape for set-valued results.
hotel_df["feature_set"] = [
    roomtype_set | amenity_set
    for roomtype_set, amenity_set in zip(hotel_df["roomtype_set"], hotel_df["amenity_set"])
]

# Lookups keyed by integer item id.
item_to_features = dict(zip(hotel_df["itemid"].astype(int), hotel_df["feature_set"]))
item_to_hotelid = dict(zip(hotel_df["itemid"].astype(int), hotel_df["hotelid"].astype(int)))

# Only the first five user ids become matrix columns.
users = user_df["userID"].tolist()[:5]

# The cell that writes user_roomtypes_amenities.csv names the item column
# "Itemids", whereas this cell used to read "itemid" only and would fail with
# a KeyError on a fresh top-to-bottom run. Accept either spelling.
itemid_col = next((name for name in ("itemid", "Itemids") if name in user_df.columns), None)
if itemid_col is None:
    raise KeyError("user_df needs an 'itemid' or 'Itemids' column")

# parse_itemid_list (defined earlier in this cell) already does the
# NaN check / comma split / digit filter that used to be inlined here.
user_to_items = {
    uid: parse_itemid_list(raw_items)
    for uid, raw_items in zip(user_df["userID"].tolist(), user_df[itemid_col].tolist())
}

# user id -> {item id -> feature set} for the user's items found in hotel_df.
user_to_item_feature_sets = {}
# user id -> set of hotel ids owning at least one of the user's known items.
user_to_visited_hotelids = {}
for uid, items in user_to_items.items():
    # Item ids that do not occur in hotel_df are ignored.
    known_items = [it for it in items if it in item_to_features]
    user_to_item_feature_sets[uid] = {it: item_to_features[it] for it in known_items}
    user_to_visited_hotelids[uid] = {item_to_hotelid[it] for it in known_items}


# --- Jaccard similarity ---
def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    When both sets are empty the result is defined as 0.0.
    """
    both_empty = not a and not b
    return 0.0 if both_empty else len(a & b) / len(a | b)
 
# --- Build matrix ---

# One row per hotel item, one column per user. Each cell ends up holding either
# the string "Visited" or the best Jaccard similarity (rounded to 3 decimals)
# between that item and any item the user is already linked to.
rows = hotel_df["itemid"].astype(int).tolist()
cols = users
matrix = pd.DataFrame(index=rows, columns=cols)

for uid in users:
    visited_items = set(user_to_items.get(uid, []))
    visited_hotels = user_to_visited_hotelids.get(uid, set())
    for raw_item, raw_hotel, hotel_set in zip(
        hotel_df["itemid"], hotel_df["hotelid"], hotel_df["feature_set"]
    ):
        item_id = int(raw_item)
        hotel_id = int(raw_hotel)
        unseen = item_id not in visited_items and hotel_id not in visited_hotels
        if unseen:
            # Best match against any feature set of the user's items.
            sims = [jaccard(hotel_set, fset) for fset in user_to_item_feature_sets[uid].values()]
            matrix.loc[item_id, uid] = round(max(sims), 3) if sims else 0.0
        else:
            # The item itself, or another room of the same hotel, is already known.
            matrix.loc[item_id, uid] = "Visited"

# Save output

matrix.to_csv("user_hotel_similarity1.csv")
print("Saved", matrix.shape)

Saved (780, 5)
