In [3]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

import pickle

In [4]:
# dir to the root
os.chdir('..')

In [5]:
# read data
df = pd.read_csv('dataset\listings_with_comments.csv')

In [6]:
df.comments.

0       ['Daniel is really cool. The place was nice an...
1       ['Nice apartment in a beautifull área of Amste...
2       ['Einfach große Klasse! Zentrale Lage, Einkauf...
3       ["This was by far the best Airbnb experience I...
4       ['Amazing house, perfect location and a dream ...
                              ...                        
6993                                                  NaN
6994                                                  NaN
6995                                                  NaN
6996                                                  NaN
6997                                                  NaN
Name: comments, Length: 6998, dtype: object

In [7]:
# load the model that is responsible for the text embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# create a device object
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
# move the model to the device
model.to(device)




# parallelize the model across multiple GPUs(if available)
# model = nn.DataParallel(model)

Device: cuda


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [8]:
# check for nan values
print(df.comments.isna().sum())
# fill nan values with empty string
df.comments.fillna('No review yet', inplace=True)


682


In [9]:
# a =  "['amazing stuff', 'unbelivelable host']"

# # convert the string to a list
# eval(a)
# # convert all comments to a list
# # df.comments = df.comments.apply(lambda x: eval(x))

## This method we have tried require a lot of memory and time to process, so we discarded it

In [10]:
import ast

# corvert the srting representation of a list to a list in the comments column
# e.g "['amazing stuff', 'unbelivelable host']" -> ['amazing stuff', 'unbelivelable host']
def try_literal_eval(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return x

# df.comments.fillna('No review yet', inplace=True)
df.comments = df.comments.apply(try_literal_eval)

In [11]:
type(df.comments.values[6908])

str

In [None]:
# def get_row_embeddings(df, column):
#     # Initialize an empty list to store the embeddings for each listing
#     all_listing_embeddings = []

#     for reviews in df[column]:
#         if isinstance(reviews, list):  # If there are multiple reviews
#             # Get the embeddings for each review
#             embeddings = model.encode(reviews, show_progress_bar=True)
#         else:  # If there is only one review (or 'No review yet')
#             # Get the embeddings for the single review
#             embeddings = model.encode([reviews], show_progress_bar=True)

#         # Get the average of the embeddings
#         row_embeddings = np.mean(embeddings, axis=0)

#         # Add the row_embeddings to the list of all_listing_embeddings
#         all_listing_embeddings.append(row_embeddings)

#     return all_listing_embeddings

# # Call the function with the dataframe and the comments column
# all_listing_embeddings = get_row_embeddings(df, 'comments')

# # Convert the list of embeddings to a numpy array
# all_listing_embeddings = np.array(all_listing_embeddings)

### batching and parallel processing with ThreadPoolExecutor and  ProcessPoolExecutor


In [12]:

import numpy as np
import concurrent.futures


def encode_reviews(reviews, model, batch_size=32):
    all_review_embeddings = []
    for i in range(0, len(reviews), batch_size):
        batch_reviews = reviews[i:i + batch_size]
        embeddings = model.encode(batch_reviews, show_progress_bar=True)
        all_review_embeddings.extend(embeddings)
    return all_review_embeddings

def get_row_embeddings(df, column, model, batch_size=32, n_workers=4):
    all_reviews = []
    review_indices = []
    current_index = 0
    for reviews in df[column]:
        if isinstance(reviews, list):
            all_reviews.extend(reviews)
            review_indices.append((current_index, current_index + len(reviews)))
            current_index += len(reviews)
        else:
            all_reviews.append(reviews)
            review_indices.append((current_index, current_index + 1))
            current_index += 1

    # Split the data into chunks for parallel processing
    chunks = np.array_split(all_reviews, n_workers)

    # Encode reviews using multiple workers
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = [executor.submit(encode_reviews, chunk, model, batch_size) for chunk in chunks]
        all_review_embeddings = []
        for future in concurrent.futures.as_completed(futures):
            all_review_embeddings.extend(future.result())

    # Calculate the mean embeddings for each row
    mean_embeddings = np.array([np.mean(all_review_embeddings[start:end], axis=0) for start, end in review_indices])

    # Get the individual embeddings for each row
    all_listing_individual_embeddings = [all_review_embeddings[start:end] for start, end in review_indices]

    return all_listing_individual_embeddings, mean_embeddings

# Call the function with the dataframe, the comments column, and the model
all_listing_individual_embeddings, all_listing_mean_embeddings = get_row_embeddings(df, 'comments', model)

In [16]:

import numpy as np
import concurrent.futures
import torch

def encode_listing_reviews(reviews, model, batch_size=32, device='cuda'):
    if not isinstance(reviews, list):
        reviews = [reviews]

    # Move the model to the GPU
    model.to(device)

    all_review_embeddings = []
    for i in range(0, len(reviews), batch_size):
        batch_reviews = reviews[i:i + batch_size]
        embeddings = model.encode(batch_reviews, show_progress_bar=True, device=device)
        all_review_embeddings.extend(embeddings)

    mean_embeddings = np.mean(all_review_embeddings, axis=0)

    return all_review_embeddings, mean_embeddings

def get_row_embeddings(df, column, model, batch_size=32, n_workers=16):
    all_reviews = []
    review_indices = []
    current_index = 0
    for reviews in df[column]:
        if isinstance(reviews, list):
            all_reviews.extend(reviews)
            review_indices.append((current_index, current_index + len(reviews)))
            current_index += len(reviews)
        else:
            all_reviews.append(reviews)
            review_indices.append((current_index, current_index + 1))
            current_index += 1

    # Split the data into chunks for parallel processing
    chunks = np.array_split(all_reviews, n_workers)

    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = [executor.submit(encode_listing_reviews, reviews, model, batch_size, device='cuda') for reviews in all_reviews]
        results = [future.result() for future in concurrent.futures.as_completed(futures)]

    # Get the individual embeddings and mean embeddings for each row
    all_listing_individual_embeddings = [result[0] for result in results]
    all_listing_mean_embeddings = np.array([result[1] for result in results])

    return all_listing_individual_embeddings, all_listing_mean_embeddings


In [17]:
all_listing_individual_embeddings, all_listing_mean_embeddings = get_row_embeddings(df, 'comments', model)

MemoryError: Unable to allocate 321. GiB for an array with shape (336757,) and data type <U256122

## We enforced to encode all the reviews as string listing representation

In [18]:
# each row have multiple comments, so we need to get the embeddings for each comment and then average them
# to get the embedding for the row

def get_row_embeddings(df, column):
    # convert comments to strings
    comments = df[column]
    # get the embeddings for each comment
    embeddings = model.encode(comments.values, show_progress_bar=True)
    # get the average of the embeddings
    row_embeddings = np.mean(embeddings, axis=0)
    return embeddings, row_embeddings



In [26]:
print(df.comments.values[0])
df.comments.values[6980]

['Daniel is really cool. The place was nice and clean. Very quiet neighborhood. He had maps and a lonely planet guide book in the room for you to use. I didnt have any trouble finding the place from Central Station. I would defintely come back! Thanks!', 'Daniel is the most amazing host! His place is extremely clean, and he provides everything you could possibly want (comfy bed, guidebooks & maps, mini-fridge, towels, even toiletries). He is extremely friendly and helpful, and will go out of his way to help you if needed, or just let you come and go as you please. Highly recommended!', 'We had such a great time in Amsterdam. Daniel is an excellent host! Very friendly and everytime helpful. The room was cozy, comfortable, and VERY clean. Maps, towels, mini-fridge, coffee (very tasty) and tea supplies were provided. Bathroom with all the bathroom requires. Good location - next to the tram stop and it took just around 10-15 minutes to get to the city center.\r<br/>Totally recommendable! '

'No review yet'

In [28]:
model.encode(df.comments.values[0]).shape

(384,)

In [29]:
model.encode(df.comments.values[6980]).shape

(384,)

In [30]:
type(df.comments.values[0])

str

In [19]:

# get the embeddings for the comments
comments_embeddings, mean_emb = get_row_embeddings(df, 'comments')

Batches: 100%|██████████| 219/219 [00:53<00:00,  4.11it/s]


In [20]:
comments_embeddings.shape

(6998, 384)

In [27]:
import pickle
with open('comments_embeddings.pkl', 'wb') as f:
    pickle.dump(comments_embeddings, f)

with open('mean_emb.pkl', 'wb') as f:
    pickle.dump(mean_emb, f)