In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv('./data/catalog.csv', encoding='ISO-8859-1')

# Specify the attributes you want to combine
attributes_to_combine = ['highlights', 'hotelstate', 'region']

# Check that all specified attributes exist in the DataFrame
missing_attrs = [attr for attr in attributes_to_combine if attr not in df.columns]
if missing_attrs:
    raise ValueError(f"Missing attributes in the DataFrame: {missing_attrs}")

# Combine specified attributes
def combine_attributes(row):
    combined_text = []
    for attr in attributes_to_combine:
        value = row[attr]
        combined_text.append(value if pd.notna(value) else '')
    return ' '.join(combined_text)

df['combined'] = df.apply(combine_attributes, axis=1)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])

# Use dot product similarity (not cosine normalized)
similarity_matrix = tfidf_matrix @ tfidf_matrix.T  # Matrix multiplication gives dot product similarity

# Ensure hotel names are unique
df['hotel_name_unique'] = df['hotel_name'] + ' - ' + df.index.astype(str)
hotel_names = df['hotel_name_unique'].values

# Create similarity matrix DataFrame
similarity_df = pd.DataFrame(similarity_matrix.toarray(), index=hotel_names, columns=hotel_names)



In [5]:
# Choose target hotel (match with unique names)
target_hotel_name = "Ginger Bangalore, IRR"
target_row = df[df['hotel_name'] == target_hotel_name]

if target_row.empty:
    print(f"Hotel '{target_hotel_name}' not found in dataset.")
else:
    target_unique_name = target_row['hotel_name_unique'].values[0]
    
    # Extract and filter similar hotels
    similarities = similarity_df[target_unique_name].copy()
    filtered_similarities = similarities[similarities >= 0.7].sort_values(ascending=False)

    # Convert to DataFrame
result_df = filtered_similarities.reset_index()
result_df.columns = ['hotel_name_unique', 'similarity_score']
result_df['hotel_name'] = result_df['hotel_name_unique'].apply(lambda x: x.split(' - ')[0])

# Drop duplicates by hotel_name and similarity_score
result_df = result_df[['hotel_name', 'similarity_score']].drop_duplicates()

# Display results
print(f"Hotels similar to '{target_hotel_name}' with similarity >= 0.7:")
print(result_df)


Hotels similar to 'Ginger Bangalore, IRR' with similarity >= 0.7:
                     hotel_name  similarity_score
0         Ginger Bangalore, IRR          1.000000
2                 Ginger Mysore          0.822660
3              Ginger Mangalore          0.822660
6  Ginger Bangalore, Whitefield          0.754183


In [6]:
# Define similarity range
lower_threshold = 0.6
upper_threshold = 1.0

# Step 1: Choose target hotel (match with unique names)
target_hotel_name = "Ginger Bangalore, IRR"
target_row = df[df['hotel_name'] == target_hotel_name]

if target_row.empty:
    print(f"Hotel '{target_hotel_name}' not found in dataset.")
else:
    target_unique_name = target_row['hotel_name_unique'].values[0]

    # Step 2: Extract and filter similar hotels within range (0.6 < sim < 1.0)
    similarities = similarity_df[target_unique_name].copy()
    filtered_similarities = similarities[(similarities > lower_threshold) & (similarities < upper_threshold)]
    filtered_similarities = filtered_similarities.sort_values(ascending=False)

    # Step 3: Convert to DataFrame
    result_df = filtered_similarities.reset_index()
    result_df.columns = ['hotel_name_unique', 'similarity_score']
    result_df['hotel_name'] = result_df['hotel_name_unique'].apply(lambda x: x.split(' - ')[0])

    # Step 4: Drop duplicates (optional)
    result_df = result_df[['hotel_name', 'similarity_score']].drop_duplicates()

    # Step 5: Display
    print(f"Hotels similar to '{target_hotel_name}' with similarity in range ({lower_threshold}, {upper_threshold}):")
    print(result_df)


Hotels similar to 'Ginger Bangalore, IRR' with similarity in range (0.6, 1.0):
                     hotel_name  similarity_score
0              Ginger Mangalore          0.822660
1                 Ginger Mysore          0.822660
4  Ginger Bangalore, Whitefield          0.754183
6            Ginger Pune, Wakad          0.648935
7                  Ginger Surat          0.645549


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv('./data/catalog.csv', encoding='ISO-8859-1')

# Specify the attributes you want to combine
attributes_to_combine = ['highlights', 'latitude', 'region']

# Check that all specified attributes exist in the DataFrame
missing_attrs = [attr for attr in attributes_to_combine if attr not in df.columns]
if missing_attrs:
    raise ValueError(f"Missing attributes in the DataFrame: {missing_attrs}")

# Combine specified attributes safely by converting all values to string
def combine_attributes(row):
    combined_text = []
    for attr in attributes_to_combine:
        value = row[attr]
        combined_text.append(str(value) if pd.notna(value) else '')
    return ' '.join(combined_text)


df['combined'] = df.apply(combine_attributes, axis=1)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])

# Use dot product similarity (not cosine normalized)
similarity_matrix = tfidf_matrix @ tfidf_matrix.T  # Matrix multiplication gives dot product similarity

# Ensure hotel names are unique
df['hotel_name_unique'] = df['hotel_name'] + ' - ' + df.index.astype(str)
hotel_names = df['hotel_name_unique'].values

# Create similarity matrix DataFrame
similarity_df = pd.DataFrame(similarity_matrix.toarray(), index=hotel_names, columns=hotel_names)



In [8]:
# Define similarity range
lower_threshold = 0.6
upper_threshold = 1.0

# Step 1: Choose target hotel (match with unique names)
target_hotel_name = "Ginger Bangalore, IRR"
target_row = df[df['hotel_name'] == target_hotel_name]

if target_row.empty:
    print(f"Hotel '{target_hotel_name}' not found in dataset.")
else:
    target_unique_name = target_row['hotel_name_unique'].values[0]

    # Step 2: Extract and filter similar hotels within range (0.6 < sim < 1.0)
    similarities = similarity_df[target_unique_name].copy()
    filtered_similarities = similarities[(similarities > lower_threshold) & (similarities < upper_threshold)]
    filtered_similarities = filtered_similarities.sort_values(ascending=False)

    # Step 3: Convert to DataFrame
    result_df = filtered_similarities.reset_index()
    result_df.columns = ['hotel_name_unique', 'similarity_score']
    result_df['hotel_name'] = result_df['hotel_name_unique'].apply(lambda x: x.split(' - ')[0])

    # Step 4: Drop duplicates (optional)
    result_df = result_df[['hotel_name', 'similarity_score']].drop_duplicates()

    # Step 5: Display
    print(f"Hotels similar to '{target_hotel_name}' with similarity in range ({lower_threshold}, {upper_threshold}):")
    print(result_df)


Hotels similar to 'Ginger Bangalore, IRR' with similarity in range (0.6, 1.0):
Empty DataFrame
Columns: [hotel_name, similarity_score]
Index: []


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv('./data/catalog.csv', encoding='ISO-8859-1')

# Specify the attributes you want to combine
attributes_to_combine = ['highlights', 'hotelcity', 'region']

# Check that all specified attributes exist in the DataFrame
missing_attrs = [attr for attr in attributes_to_combine if attr not in df.columns]
if missing_attrs:
    raise ValueError(f"Missing attributes in the DataFrame: {missing_attrs}")

# Combine specified attributes safely by converting all values to string
def combine_attributes(row):
    combined_text = []
    for attr in attributes_to_combine:
        value = row[attr]
        combined_text.append(str(value) if pd.notna(value) else '')
    return ' '.join(combined_text)


df['combined'] = df.apply(combine_attributes, axis=1)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])

# Use dot product similarity (not cosine normalized)
similarity_matrix = tfidf_matrix @ tfidf_matrix.T  # Matrix multiplication gives dot product similarity

# Ensure hotel names are unique
df['hotel_name_unique'] = df['hotel_name'] + ' - ' + df.index.astype(str)
hotel_names = df['hotel_name_unique'].values

# Create similarity matrix DataFrame
similarity_df = pd.DataFrame(similarity_matrix.toarray(), index=hotel_names, columns=hotel_names)



In [10]:
# Define similarity range
lower_threshold = 0.6
upper_threshold = 1.0

# Step 1: Choose target hotel (match with unique names)
target_hotel_name = "Ginger Bangalore, IRR"
target_row = df[df['hotel_name'] == target_hotel_name]

if target_row.empty:
    print(f"Hotel '{target_hotel_name}' not found in dataset.")
else:
    target_unique_name = target_row['hotel_name_unique'].values[0]

    # Step 2: Extract and filter similar hotels within range (0.6 < sim < 1.0)
    similarities = similarity_df[target_unique_name].copy()
    filtered_similarities = similarities[(similarities > lower_threshold) & (similarities < upper_threshold)]
    filtered_similarities = filtered_similarities.sort_values(ascending=False)

    # Step 3: Convert to DataFrame
    result_df = filtered_similarities.reset_index()
    result_df.columns = ['hotel_name_unique', 'similarity_score']
    result_df['hotel_name'] = result_df['hotel_name_unique'].apply(lambda x: x.split(' - ')[0])

    # Step 4: Drop duplicates (optional)
    result_df = result_df[['hotel_name', 'similarity_score']].drop_duplicates()

    # Step 5: Display
    print(f"Hotels similar to '{target_hotel_name}' with similarity in range ({lower_threshold}, {upper_threshold}):")
    print(result_df)


Hotels similar to 'Ginger Bangalore, IRR' with similarity in range (0.6, 1.0):
                     hotel_name  similarity_score
0  Ginger Bangalore, Whitefield          0.777433


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv('./data/catalog.csv', encoding='ISO-8859-1')

# Specify the attributes you want to combine
attributes_to_combine = ['highlights', 'hotelstate', 'pin_code']

# Check that all specified attributes exist in the DataFrame
missing_attrs = [attr for attr in attributes_to_combine if attr not in df.columns]
if missing_attrs:
    raise ValueError(f"Missing attributes in the DataFrame: {missing_attrs}")

# Combine specified attributes safely by converting all values to string
def combine_attributes(row):
    combined_text = []
    for attr in attributes_to_combine:
        value = row[attr]
        combined_text.append(str(value) if pd.notna(value) else '')
    return ' '.join(combined_text)


df['combined'] = df.apply(combine_attributes, axis=1)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])

# Use dot product similarity (not cosine normalized)
similarity_matrix = tfidf_matrix @ tfidf_matrix.T  # Matrix multiplication gives dot product similarity

# Ensure hotel names are unique
df['hotel_name_unique'] = df['hotel_name'] + ' - ' + df.index.astype(str)
hotel_names = df['hotel_name_unique'].values

# Create similarity matrix DataFrame
similarity_df = pd.DataFrame(similarity_matrix.toarray(), index=hotel_names, columns=hotel_names)



In [12]:
# Define similarity range
lower_threshold = 0.6
upper_threshold = 1.0

# Step 1: Choose target hotel (match with unique names)
target_hotel_name = "Ginger Bangalore, IRR"
target_row = df[df['hotel_name'] == target_hotel_name]

if target_row.empty:
    print(f"Hotel '{target_hotel_name}' not found in dataset.")
else:
    target_unique_name = target_row['hotel_name_unique'].values[0]

    # Step 2: Extract and filter similar hotels within range (0.6 < sim < 1.0)
    similarities = similarity_df[target_unique_name].copy()
    filtered_similarities = similarities[(similarities > lower_threshold) & (similarities < upper_threshold)]
    filtered_similarities = filtered_similarities.sort_values(ascending=False)

    # Step 3: Convert to DataFrame
    result_df = filtered_similarities.reset_index()
    result_df.columns = ['hotel_name_unique', 'similarity_score']
    result_df['hotel_name'] = result_df['hotel_name_unique'].apply(lambda x: x.split(' - ')[0])

    # Step 4: Drop duplicates (optional)
    result_df = result_df[['hotel_name', 'similarity_score']].drop_duplicates()

    # Step 5: Display
    print(f"Hotels similar to '{target_hotel_name}' with similarity in range ({lower_threshold}, {upper_threshold}):")
    print(result_df)


Hotels similar to 'Ginger Bangalore, IRR' with similarity in range (0.6, 1.0):
Empty DataFrame
Columns: [hotel_name, similarity_score]
Index: []


In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv('./data/catalog.csv', encoding='ISO-8859-1')

# Specify the attributes you want to combine
attributes_to_combine = ['highlights', 'region', 'pin_code']

# Check that all specified attributes exist in the DataFrame
missing_attrs = [attr for attr in attributes_to_combine if attr not in df.columns]
if missing_attrs:
    raise ValueError(f"Missing attributes in the DataFrame: {missing_attrs}")

# Combine specified attributes safely by converting all values to string
def combine_attributes(row):
    combined_text = []
    for attr in attributes_to_combine:
        value = row[attr]
        combined_text.append(str(value) if pd.notna(value) else '')
    return ' '.join(combined_text)


df['combined'] = df.apply(combine_attributes, axis=1)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])

# Use dot product similarity (not cosine normalized)
similarity_matrix = tfidf_matrix @ tfidf_matrix.T  # Matrix multiplication gives dot product similarity

# Ensure hotel names are unique
df['hotel_name_unique'] = df['hotel_name'] + ' - ' + df.index.astype(str)
hotel_names = df['hotel_name_unique'].values

# Create similarity matrix DataFrame
similarity_df = pd.DataFrame(similarity_matrix.toarray(), index=hotel_names, columns=hotel_names)



In [14]:
# Define similarity range
lower_threshold = 0.6
upper_threshold = 1.0

# Step 1: Choose target hotel (match with unique names)
target_hotel_name = "Ginger Bangalore, IRR"
target_row = df[df['hotel_name'] == target_hotel_name]

if target_row.empty:
    print(f"Hotel '{target_hotel_name}' not found in dataset.")
else:
    target_unique_name = target_row['hotel_name_unique'].values[0]

    # Step 2: Extract and filter similar hotels within range (0.6 < sim < 1.0)
    similarities = similarity_df[target_unique_name].copy()
    filtered_similarities = similarities[(similarities > lower_threshold) & (similarities < upper_threshold)]
    filtered_similarities = filtered_similarities.sort_values(ascending=False)

    # Step 3: Convert to DataFrame
    result_df = filtered_similarities.reset_index()
    result_df.columns = ['hotel_name_unique', 'similarity_score']
    result_df['hotel_name'] = result_df['hotel_name_unique'].apply(lambda x: x.split(' - ')[0])

    # Step 4: Drop duplicates (optional)
    result_df = result_df[['hotel_name', 'similarity_score']].drop_duplicates()

    # Step 5: Display
    print(f"Hotels similar to '{target_hotel_name}' with similarity in range ({lower_threshold}, {upper_threshold}):")
    print(result_df)


Hotels similar to 'Ginger Bangalore, IRR' with similarity in range (0.6, 1.0):
              hotel_name  similarity_score
0  Ginger Bangalore, IRR               1.0


In [15]:
#dymanically giving hotel name not user intereaction

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv('./data/catalog.csv', encoding='ISO-8859-1')

# Specify the attributes you want to combine
attributes_to_combine = ['highlights', 'hotelstate', 'region']

# Check that all specified attributes exist in the DataFrame
missing_attrs = [attr for attr in attributes_to_combine if attr not in df.columns]
if missing_attrs:
    raise ValueError(f"Missing attributes in the DataFrame: {missing_attrs}")

# Combine specified attributes safely by converting all values to string
def combine_attributes(row):
    combined_text = []
    for attr in attributes_to_combine:
        value = row[attr]
        combined_text.append(str(value) if pd.notna(value) else '')
    return ' '.join(combined_text)

df['combined'] = df.apply(combine_attributes, axis=1)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])

# Use dot product similarity (not cosine normalized)
similarity_matrix = tfidf_matrix @ tfidf_matrix.T

# Ensure hotel names are unique
df['hotel_name_unique'] = df['hotel_name'] + ' - ' + df.index.astype(str)
hotel_names = df['hotel_name_unique'].values

# Create similarity matrix DataFrame
similarity_df = pd.DataFrame(similarity_matrix.toarray(), index=hotel_names, columns=hotel_names)

# Define similarity range
lower_threshold = 0.6
upper_threshold = 1.0

# -------------------------
# Select target hotel dynamically (e.g., first hotel in the dataset)
target_row = df.iloc[2]
target_hotel_name = target_row['hotel_name']
target_unique_name = target_row['hotel_name_unique']
# -------------------------

# Extract and filter similar hotels within range
similarities = similarity_df[target_unique_name].copy()
filtered_similarities = similarities[(similarities > lower_threshold) & (similarities < upper_threshold)]
filtered_similarities = filtered_similarities.sort_values(ascending=False)

# Convert to DataFrame
result_df = filtered_similarities.reset_index()
result_df.columns = ['hotel_name_unique', 'similarity_score']
result_df['hotel_name'] = result_df['hotel_name_unique'].apply(lambda x: x.split(' - ')[0])

# Drop duplicates (optional)
result_df = result_df[['hotel_name', 'similarity_score']].drop_duplicates()

# Display
print(f"Hotels similar to '{target_hotel_name}' with similarity in range ({lower_threshold}, {upper_threshold}):")
print(result_df)


Hotels similar to 'Ginger Bangalore, IRR' with similarity in range (0.6, 1.0):
                     hotel_name  similarity_score
0              Ginger Mangalore          0.822660
1                 Ginger Mysore          0.822660
4  Ginger Bangalore, Whitefield          0.754183
6            Ginger Pune, Wakad          0.648935
7                  Ginger Surat          0.645549
