In [None]:
!pip3 install torch datasets faiss-cpu faiss-gpu transformers rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
import os
import re
import time
import torch
import faiss
import numpy as np
import pandas as pd
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from datasets import load_dataset
from google.colab import userdata
from transformers import pipeline, PegasusTokenizer, AutoModel, AutoTokenizer

In [None]:
# Fetch the hotel review dataset from the Hugging Face Hub
dataset = load_dataset(path="traversaal-ai-hackathon/hotel_datasets")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# BigBird tokenizer and encoder; the encoder is moved onto the selected device
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
model = AutoModel.from_pretrained("google/bigbird-roberta-base")
model = model.to(device)

# Pegasus summarization pipeline plus its tokenizer; the pipeline receives device
# index 0 when CUDA is available and -1 otherwise
summarization_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
pipe = pipeline(
    task="summarization",
    model="google/pegasus-large",
    device=-1 if not torch.cuda.is_available() else 0,
)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Materialise the 'train' split as a pandas DataFrame
train_split = dataset['train']
df = train_split.to_pandas()

In [None]:
# Discard the columns listed below and fold both Turkish spellings of the country into 'Turkey'
df = (
    df
    .drop(columns=['hotel_url', 'hotel_image', 'price_range', 'tripdate', 'rate'])
    .assign(country=lambda frame: frame['country'].replace(['Türkiye', 'Turkiye'], 'Turkey'))
)

In [None]:
# Preview the first five rows (five is the default for head())
df.head()

Unnamed: 0,hotel_name,hotel_description,review_title,review_text,rating_value,review_count,street_address,locality,country
0,Romance Istanbul Hotel,Romance Istanbul Hotel has 39 rooms.Every room...,"An exceptional boutique hotel, great value for...",,5.0,4023,Hudavendigar Cd. No:5 Sirkeci,Istanbul,Turkey
1,Romance Istanbul Hotel,Romance Istanbul Hotel has 39 rooms.Every room...,You can’t get better than this.,,5.0,4023,Hudavendigar Cd. No:5 Sirkeci,Istanbul,Turkey
2,Romance Istanbul Hotel,Romance Istanbul Hotel has 39 rooms.Every room...,Exceeds all expectations,,5.0,4023,Hudavendigar Cd. No:5 Sirkeci,Istanbul,Turkey
3,Romance Istanbul Hotel,Romance Istanbul Hotel has 39 rooms.Every room...,"Great Location, Fantastic Accommodations",,5.0,4023,Hudavendigar Cd. No:5 Sirkeci,Istanbul,Turkey
4,Romance Istanbul Hotel,Romance Istanbul Hotel has 39 rooms.Every room...,Perfection. It is all in the details.,,5.0,4023,Hudavendigar Cd. No:5 Sirkeci,Istanbul,Turkey


In [None]:
# Build one text document per hotel and save it for the embedding step.

# 1) Merge each review's title and body into one string; a missing part becomes a blank.
df['all_text'] = (
    "Title: " + df['review_title'].fillna(' ') +
    " Review: " + df['review_text'].fillna(' ')
)

# Rebind instead of dropping in place so the frame is never left half-modified.
df = df.drop(columns=['review_title', 'review_text'])

# 2) Normalise every text column: keep ASCII letters, digits and whitespace, then lowercase.
#    The pattern is a raw string so that `\s` reaches the regex engine instead of being
#    treated as an (invalid) Python string escape.
_NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')


def _clean_text(value):
    """Strip characters other than ASCII letters, digits and whitespace, then lowercase."""
    return _NON_ALNUM_PATTERN.sub('', str(value)).lower()


for col in df.columns:
    if df[col].dtype == object:
        # na_action='ignore' leaves missing values missing. Passing them through str()
        # would turn them into the literal words "none"/"nan", which would then be
        # embedded as if they were real descriptions and would defeat the fillna calls below.
        df[col] = df[col].map(_clean_text, na_action='ignore')

# 3) Collapse to one row per hotel: hotel-level fields take their first non-missing value
#    and all review texts are concatenated. Rows without a hotel name are left out by groupby.
df = df.groupby('hotel_name').agg({
    'hotel_description': 'first',
    'rating_value': 'first',
    'review_count': 'first',
    'street_address': 'first',
    'locality': 'first',
    'country': 'first',
    'all_text': ' '.join
}).reset_index()

# 4) Prefix the joined reviews with labelled hotel metadata; missing fields become a blank.
df['all_text'] = (
    "Description: " + df['hotel_description'].fillna(' ') + " " +
    "Rating: " + df['rating_value'].fillna(' ').astype(str) + " " +
    "Reviews: " + df['review_count'].fillna(' ').astype(str) + " " +
    "Address: " + df['street_address'].fillna(' ') + " " +
    "Locality: " + df['locality'].fillna(' ') + " " +
    "Country: " + df['country'].fillna(' ') + " " +
    df['all_text']
)

df.to_pickle('/content/drive/MyDrive/week-2/day-5/df_ready_for_summarization_and_embedding.pkl')

In [None]:
# Embed every row's 'all_text' with the encoder and save the frame with the new column.
df = pd.read_pickle('/content/drive/MyDrive/week-2/day-5/df_ready_for_summarization_and_embedding.pkl')


def _mean_pooled_embedding(text):
    """Return the first model output for ``text``, averaged over dim 1, as a flat NumPy vector."""
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        token_states = model(**encoded)[0]
    return token_states.mean(dim=1).cpu().numpy().flatten()


# Rows are looked up by label 0..len(df)-1, one vector per row
embeddings = [
    _mean_pooled_embedding(df.loc[row_label, 'all_text'])
    for row_label in tqdm(range(len(df)), desc="Processing embeddings")
]

df['embeddings'] = embeddings

df.to_pickle('/content/drive/MyDrive/week-2/day-5/df_with_embeddings.pkl')


Processing embeddings:  12%|█▏        | 18/150 [00:57<06:52,  3.12s/it]Attention type 'block_sparse' is not possible if sequence_length: 511 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
Processing embeddings: 100%|██████████| 150/150 [19:14<00:00,  7.70s/it]


In [None]:
# Reload the frame that carries the embeddings and preview its first five rows
df = pd.read_pickle('/content/drive/MyDrive/week-2/day-5/df_with_embeddings.pkl')
df.head()

Unnamed: 0,hotel_name,hotel_description,rating_value,review_count,street_address,locality,country,all_text,embeddings
0,ace hotel new york,reception ace hotel nyc,4.5,2525,20 w 29th street broadway,new york city,united states,Description: reception ace hotel nyc Rating: ...,"[0.08239335, 0.17917216, 0.02131201, -0.341600..."
1,ajwa sultanahmet,more than a mere fivestar hotel ajwa sultanahm...,5.0,625,piyerloti caddesi no30 sultanahmet,istanbul,turkey,Description: more than a mere fivestar hotel a...,"[0.07009437, 0.15243739, 0.026863214, -0.26727..."
2,ameritania at times square,discover the ameritania hotel where fun and mo...,4.5,7569,230 w 54th street,new york city,united states,Description: discover the ameritania hotel whe...,"[0.053479265, 0.16862988, 0.0659188, -0.289952..."
3,argonaut hotel,located in the heart of san franciscos fisherm...,4.5,7814,495 jefferson st,san francisco,united states,Description: located in the heart of san franc...,"[0.08278599, 0.15106724, 0.04488371, -0.303002..."
4,arlo soho,arlo soho hotel features 325 thoughtfully desi...,4.5,2900,231 hudson st,new york city,united states,Description: arlo soho hotel features 325 thou...,"[0.055794816, 0.17793478, 0.05399605, -0.24915..."


In [None]:
# Build an IVF-Flat FAISS index over the stored embeddings and write it to disk.
df = pd.read_pickle('/content/drive/MyDrive/week-2/day-5/df_with_embeddings.pkl')

# Stack the per-row vectors into one 2-D float32 matrix (one row per DataFrame row).
embeddings = np.vstack(df['embeddings'].values).astype('float32')

# Take the vector width from the data instead of hard-coding it, so the cell keeps
# working if the encoder (and therefore the embedding size) changes.
dimension = embeddings.shape[1]

# An IVF index cannot be trained with fewer vectors than clusters, so the cluster
# count is capped at the number of vectors.
nlist = min(10, len(embeddings))

quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantizer, dimension, nlist)

# Scale every vector to unit length, in place, before training and adding.
# On unit vectors, ranking by L2 distance matches ranking by cosine similarity.
faiss.normalize_L2(embeddings)

index.train(embeddings)
index.add(embeddings)

faiss_index_path = '/content/drive/MyDrive/week-2/day-5/faiss_ivfflat_index.index'
faiss.write_index(index, faiss_index_path)

print(f"FAISS IVFFlat index saved to {faiss_index_path}")

FAISS IVFFlat index saved to /content/drive/MyDrive/week-2/day-5/faiss_ivfflat_index.index


In [None]:
# Load everything the search step needs: the embedded frame, the FAISS index,
# a BM25 model over the documents, and the encoder for query embeddings.
df = pd.read_pickle('/content/drive/MyDrive/week-2/day-5/df_with_embeddings.pkl')

# Must be the file written by the index-building cell; the previous name
# ('faiss_index.index') is never created anywhere in this notebook.
faiss_index_path = '/content/drive/MyDrive/week-2/day-5/faiss_ivfflat_index.index'
index = faiss.read_index(faiss_index_path)

# BM25 works on token lists. tokenize() only splits the text and nothing is fed to
# the model here, so the tokenizer's "sequence length is longer than the specified
# maximum" warning for long documents does not affect this step.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
tokenized_corpus = [tokenizer.tokenize(text) for text in df['all_text']]
bm25 = BM25Okapi(tokenized_corpus)

model = AutoModel.from_pretrained("google/bigbird-roberta-base").to(device)

Token indices sequence length is longer than the specified maximum sequence length for this model (5074 > 4096). Running this sequence through the model will result in indexing errors


In [None]:
# Default cut-offs: BM25 shortlist size and number of results kept after re-ranking
bm25_top_k=20
faiss_top_k=3

def search(query, bm25_k=None, faiss_k=None):
    """Two-stage hotel search: BM25 shortlist, then embedding-similarity re-ranking.

    Args:
        query: Free-text search string.
        bm25_k: Number of BM25 candidates to shortlist. Defaults to the
            module-level ``bm25_top_k``.
        faiss_k: Number of results to return after re-ranking. Defaults to the
            module-level ``faiss_top_k``.

    Returns:
        Rows of ``df`` for the best matches, most similar first, without the
        ``all_text`` and ``embeddings`` columns. Fewer than ``faiss_k`` rows are
        returned when the shortlist is smaller than ``faiss_k``.
    """
    bm25_k = bm25_top_k if bm25_k is None else bm25_k
    faiss_k = faiss_top_k if faiss_k is None else faiss_k

    # Nothing to rank: return an empty frame with the same visible columns.
    if bm25_k <= 0 or faiss_k <= 0:
        return df.iloc[[]].drop(columns=['all_text', 'embeddings'])

    # Stage 1: lexical shortlist (row positions ordered by descending BM25 score).
    tokenized_query = tokenizer.tokenize(query)
    bm25_scores = bm25.get_scores(tokenized_query)
    candidate_positions = np.argsort(bm25_scores)[::-1][:bm25_k]

    # vstack builds a fresh array, so the in-place normalisation below does not
    # touch the vectors stored in df.
    candidate_vectors = np.vstack(df['embeddings'].iloc[candidate_positions].values).astype('float32')
    faiss.normalize_L2(candidate_vectors)

    # Stage 2: embed the query the same way the documents were embedded (mean over dim 1).
    query_tokens = tokenizer(query, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        pooled = model(**query_tokens).last_hidden_state.mean(dim=1).cpu().numpy()

    # Normalise an explicit contiguous (1, dim) float32 array. normalize_L2 works in
    # place, so normalising a temporary reshape() result only works if it is a view.
    query_vector = np.ascontiguousarray(pooled.reshape(1, -1), dtype='float32')
    faiss.normalize_L2(query_vector)

    # Inner product on unit vectors equals cosine similarity; the index width is taken
    # from the data instead of being hard-coded.
    rerank_index = faiss.IndexFlatIP(candidate_vectors.shape[1])
    rerank_index.add(candidate_vectors)

    # Never ask for more neighbours than there are candidates. FAISS pads missing
    # results with -1, which would otherwise be read as "last candidate" when used
    # as an array index.
    n_results = min(faiss_k, len(candidate_positions))
    _, neighbour_ranks = rerank_index.search(query_vector, n_results)
    neighbour_ranks = neighbour_ranks[0]
    neighbour_ranks = neighbour_ranks[neighbour_ranks >= 0]

    # Map positions within the shortlist back to row positions in df.
    final_positions = candidate_positions[neighbour_ranks]

    return df.iloc[final_positions].drop(columns=['all_text', 'embeddings'])

In [None]:
%%time
query = "Hotel near london bigben"
results = search(query)
results

CPU times: user 203 ms, sys: 22 µs, total: 203 ms
Wall time: 55 ms


Unnamed: 0,hotel_name,hotel_description,rating_value,review_count,street_address,locality,country
9,canopy by hilton london city,canopy by hilton london city is an impressive ...,5.0,911,1115 minories,london,united kingdom
137,travelodge london central city road,none,4.0,3056,123 city road,london,united kingdom
85,park grand london hyde park,none,4.5,4166,78 82 westbourne terrace paddington,london,united kingdom
141,travelodge london docklands central,none,4.0,1575,1 oregano drive,london,united kingdom
132,the royal horseguards,this majestic fivestar hotel presides over the...,4.5,7184,2 whitehall court,london,united kingdom
