In [239]:
import os
from os import path
import sys
sys.path.append(os.path.abspath(path.join(os.getcwd(), os.pardir)))

from src import project_config as pc

import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel

In [206]:
nyc_listings_file_url = 'http://data.insideairbnb.com/united-states/ny/new-york-city/2023-06-05/data/listings.csv.gz'

In [207]:
data_df = pd.read_csv(nyc_listings_file_url)

In [210]:
listing_df_cols = [
    'id',
    'listing_url',
    'room_type',
    'neighbourhood_group_cleansed',
    'neighbourhood_cleansed',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'accommodates',
    'price',
    'review_scores_rating',
    'number_of_reviews',
    'latitude',
    'longitude',
    'property_type',
    'description',
    'amenities',
    'neighborhood_overview',
    'host_about'
]

In [211]:
data_df = data_df[listing_df_cols]

In [240]:
def construct_listing_summary(row):
    summary = []
    type_and_loc = f'{row.room_type} in {row.neighbourhood_cleansed}, {row.neighbourhood_group_cleansed}'
    summary.append(type_and_loc)
    
    # bed and bath info
    bed_bath = ''
    if not pd.isna(row.bedrooms):
        bed_bath = f'{int(row.bedrooms)} bedroom' + 's' if int(row.bedrooms) > 1 else ''
    if not pd.isna(row.beds):
        bed_bath += ' with ' if bed_bath != '' else ''
        bed_bath += f'{int(row.beds)} bed' + 's' if int(row.beds) > 1 else ''
    if not pd.isna(row.bathrooms_text):
        bed_bath += ' and ' if bed_bath != '' else ''
        bed_bath += row.bathrooms_text
    summary.append(bed_bath)
    
    max_occupancy = ''
    if row.accommodates > 0:
        max_occupancy = f'Accommodates up to {row.accommodates} people' if row.accommodates > 1 else 'Accommodates 1 person'
    summary.append(max_occupancy)
    
    cost = f'Costs {row.price}'
    summary.append(cost)
    
    summary = [s for s in summary if s]
    summary = '. '.join(summary)
    
    return summary

In [241]:
data_df['listing_summary'] = data_df.apply(construct_listing_summary, axis=1)

In [242]:
sentences = data_df['listing_summary'][:10].tolist()

In [16]:
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en', cache_dir=pc.HUGGING_FACE_CACHE_DIR)
model = AutoModel.from_pretrained('BAAI/bge-large-en', cache_dir=pc.HUGGING_FACE_CACHE_DIR).to('cuda:0')

In [243]:
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to('cuda:0')

In [245]:
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    # Perform pooling. In this case, cls pooling.
    sentence_embeddings = model_output[0][:, 0]
# normalize embeddings
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

In [246]:
sentence_embeddings

tensor([[-0.0155,  0.0272, -0.0254,  ..., -0.0198, -0.0258, -0.0017],
        [-0.0102,  0.0116, -0.0052,  ..., -0.0225, -0.0115, -0.0201],
        [-0.0061,  0.0171, -0.0174,  ..., -0.0038, -0.0358, -0.0078],
        ...,
        [-0.0231,  0.0261, -0.0121,  ..., -0.0194, -0.0168, -0.0048],
        [-0.0103,  0.0108, -0.0225,  ..., -0.0139, -0.0405, -0.0059],
        [-0.0078,  0.0143, -0.0260,  ...,  0.0008, -0.0183, -0.0256]],
       device='cuda:0')

In [247]:
scores = sentence_embeddings @ sentence_embeddings.T

In [248]:
scores

tensor([[1.0000, 0.8856, 0.9138, 0.8936, 0.9397, 0.9368, 0.9288, 0.9476, 0.9504,
         0.8865],
        [0.8856, 1.0000, 0.9279, 0.9025, 0.8951, 0.8978, 0.8961, 0.9009, 0.9315,
         0.9500],
        [0.9138, 0.9279, 1.0000, 0.8785, 0.9403, 0.9041, 0.9014, 0.9365, 0.9437,
         0.9348],
        [0.8936, 0.9025, 0.8785, 1.0000, 0.9200, 0.9213, 0.9017, 0.9162, 0.8717,
         0.8961],
        [0.9397, 0.8951, 0.9403, 0.9200, 1.0000, 0.9496, 0.9432, 0.9646, 0.9098,
         0.8822],
        [0.9368, 0.8978, 0.9041, 0.9213, 0.9496, 1.0000, 0.9395, 0.9391, 0.9109,
         0.8810],
        [0.9288, 0.8961, 0.9014, 0.9017, 0.9432, 0.9395, 1.0000, 0.9398, 0.9022,
         0.8800],
        [0.9476, 0.9009, 0.9365, 0.9162, 0.9646, 0.9391, 0.9398, 1.0000, 0.9149,
         0.8911],
        [0.9504, 0.9315, 0.9437, 0.8717, 0.9098, 0.9109, 0.9022, 0.9149, 1.0000,
         0.9261],
        [0.8865, 0.9500, 0.9348, 0.8961, 0.8822, 0.8810, 0.8800, 0.8911, 0.9261,
         1.0000]], device='c

In [111]:
scores

tensor([[1.0000, 0.9386, 0.9247, 0.9153, 0.9690],
        [0.9386, 1.0000, 0.9496, 0.9497, 0.9274],
        [0.9247, 0.9496, 1.0000, 0.9280, 0.8994],
        [0.9153, 0.9497, 0.9280, 1.0000, 0.9204],
        [0.9690, 0.9274, 0.8994, 0.9204, 1.0000]], device='cuda:0')

In [249]:
sentences

['Entire home/apt in Midtown, Manhattan. 1 bath. Accommodates 1 person. Costs $240.00',
 'Private room in Bedford-Stuyvesant, Brooklyn. Accommodates up to 2 people. Costs $60.00',
 'Private room in Lower East Side, Manhattan. 1 bath. Accommodates 1 person. Costs $120.00',
 'Entire home/apt in Sunset Park, Brooklyn. 2 bedrooms with 2 beds and 1.5 baths. Accommodates up to 4 people. Costs $276.00',
 'Entire home/apt in Lower East Side, Manhattan. 2 beds and 1 bath. Accommodates up to 3 people. Costs $315.00',
 'Entire home/apt in Chinatown, Manhattan. 2 bedrooms with 2 beds and 1 bath. Accommodates up to 5 people. Costs $325.00',
 'Entire home/apt in Tribeca, Manhattan. 3 bedrooms with 3 beds and 1 bath. Accommodates up to 6 people. Costs $500.00',
 'Entire home/apt in East Village, Manhattan. 1 bath. Accommodates up to 2 people. Costs $160.00',
 'Private room in Midtown, Manhattan. 1 bath. Accommodates up to 2 people. Costs $68.00',
 'Private room in Williamsburg, Brooklyn. 1 shared bat

In [31]:
for s in a:
    print(len(s))
    print(s)

1000
Beautiful, spacious skylit studio in the heart of Midtown, Manhattan. <br /><br />1 BED / FULL BATH / FULL KITCHEN / CENTRALLY LOCATED / HIGH SPEED WIFI<br /><br /><b>The space</b><br />- Spacious, immaculate and nicely furnished & designed studio.<br />- Tuck yourself into the ultra comfortable bed.<br />- Stunning architectural details, soaring high vaulted ceilings, exposed brick, floor seating area with natural zafu cushions, modern style mixed with eclectic art & antique treasures, large full bath, newly renovated kitchen, air conditioning/heat, high speed WiFi Internet, ergonomic office desk and chair, Roku.<br />- Centrally located in the heart of Midtown Manhattan just a few blocks from all subway connections in the very desirable Midtown location a few minutes walk to Times Square, the Theater District, Bryant Park and Herald Square.<br />- This is a walk-up building (no elevator).<br />- Laundry pickup and delivery service available.<br />- Parking garage on the block an