In [348]:
import os
from os import path
import sys
sys.path.append(os.path.abspath(path.join(os.getcwd(), os.pardir)))

from src import project_config as pc

import re
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel

In [206]:
nyc_listings_file_url = 'http://data.insideairbnb.com/united-states/ny/new-york-city/2023-06-05/data/listings.csv.gz'

In [207]:
data_df = pd.read_csv(nyc_listings_file_url)

In [210]:
listing_df_cols = [
    'id',
    'listing_url',
    'room_type',
    'neighbourhood_group_cleansed',
    'neighbourhood_cleansed',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'accommodates',
    'price',
    'review_scores_rating',
    'number_of_reviews',
    'latitude',
    'longitude',
    'property_type',
    'description',
    'amenities',
    'neighborhood_overview',
    'host_about'
]

In [211]:
data_df = data_df[listing_df_cols]

In [253]:
def construct_listing_summary(row):
    summary = []
    type_and_loc = f'{row.room_type} in {row.neighbourhood_group_cleansed}'
    summary.append(type_and_loc)
    
    # bed and bath info
    bed_bath = ''
    if not pd.isna(row.bedrooms):
        bed_bath = f'{int(row.bedrooms)} bedroom' + 's' if int(row.bedrooms) > 1 else ''
    if not pd.isna(row.beds):
        bed_bath += ' with ' if bed_bath != '' else ''
        bed_bath += f'{int(row.beds)} bed' + 's' if int(row.beds) > 1 else ''
    if not pd.isna(row.bathrooms_text):
        bed_bath += ' and ' if bed_bath != '' else ''
        bed_bath += row.bathrooms_text
    summary.append(bed_bath)
    
    max_occupancy = ''
    if row.accommodates > 0:
        max_occupancy = f'Accommodates up to {row.accommodates} people' if row.accommodates > 1 else 'Accommodates 1 person'
    summary.append(max_occupancy)
    
    cost = f'Costs {row.price}'
    summary.append(cost)
    
    summary = [s for s in summary if s]
    summary = '. '.join(summary)
    return summary

In [373]:
data_df.longitude

0       -73.985590
1       -73.955120
2       -73.989760
3       -73.994540
4       -73.993668
           ...    
43561   -73.924019
43562   -73.976717
43563   -73.920519
43564   -73.951268
43565   -73.999289
Name: longitude, Length: 43566, dtype: float64

In [254]:
data_df['listing_summary'] = data_df.apply(construct_listing_summary, axis=1)

In [None]:
listing_df_cols = [
    'id',
    'listing_url',
    'room_type',
    'neighbourhood_group_cleansed',
    'neighbourhood_cleansed',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'accommodates',
    'price',
    'review_scores_rating',
    'number_of_reviews',
    'latitude',
    'longitude',
    'property_type',
    'description',
    'amenities',
    'neighborhood_overview',
    'host_about'
]

In [392]:
data_df.amenities[0]

['Dedicated workspace',
 ' Baking sheet',
 ' Extra pillows and blankets',
 ' Cleaning available during stay',
 ' Essentials',
 ' Hair dryer',
 ' Free street parking',
 ' TV',
 ' Air conditioning',
 ' Long term stays allowed',
 ' Refrigerator',
 ' Keypad',
 ' Heating',
 ' Ethernet connection',
 ' Paid parking off premises',
 ' Stove',
 ' Bed linens',
 ' Bathtub',
 ' Iron',
 ' Dishes and silverware',
 ' Self check-in',
 ' Smoke alarm',
 ' Fire extinguisher',
 ' Wifi',
 ' Cooking basics',
 ' Hangers',
 ' Kitchen',
 ' Oven',
 ' Carbon monoxide alarm',
 ' Luggage dropoff allowed',
 ' Coffee maker',
 ' Hot water']

In [390]:
data_df.amenities.str.replace('\[|\]', '', regex=True).str.replace('"', '', regex=False).str.split(',')

0        [Dedicated workspace,  Baking sheet,  Extra pi...
1            [Kitchen,  Heating,  Air conditioning,  Wifi]
2        [Air conditioning,  Refrigerator,  Elevator,  ...
3        [Private patio or balcony,  Air conditioning, ...
4        [Dedicated workspace,  Conditioner,  Baking sh...
                               ...                        
43561    [Dedicated workspace,  Cleaning products,  Hai...
43562    [Dedicated workspace,  TV with standard cable,...
43563    [Dedicated workspace,  Cleaning products,  Hai...
43564    [First aid kit,  Dedicated workspace,  Air con...
43565    [Air conditioning,  Kitchen,  Piano,  Pool tab...
Name: amenities, Length: 43566, dtype: object

In [331]:
def construct_description_summary(row):
    summary = []
    
    def clean_html(txt):
        return re.sub('<.*?>', '', txt)
    
    property_type = f'This is a{"n" if row.property_type[0].lower() in ["a", "e", "i", "o", "u"] else ""} '
    property_type += row.property_type.lower()
    summary.append(property_type)
    
    if not pd.isna(row.description):
        summary.append(clean_html(row.description))
    
    if not pd.isna(row.amenities):
        amenities = 'Amenities include: ' + re.sub('"', '', row.amenities[1:-1]).lower()
        summary.append(amenities)
    
    if not pd.isna(row.neighborhood_overview):
        neigh_overview = 'A little about the neighborhood. ' + clean_html(row.neighborhood_overview)
        summary.append(neigh_overview)
    
    if not pd.isna(row.host_about):
        host_info = 'Host information: ' + clean_html(row.host_about)
        summary.append(host_info)
    
    summary = [s for s in summary if s]
    summary = '. '.join(summary)
    
    # remove extra spaces
    summary = summary.strip()
    summary = re.sub('\s+', ' ', summary)
        
    return summary

In [341]:
construct_description_summary(data_df.iloc[9])

"This is a private room in rental unit. A home away from home. Its a very cute and charming.full size bed (sheets and towels included). VERY CLEAN You are welcome to use the kitchen to prepare light meals, coffee, tea. That said at the moment I don't have a dinning table. Just one chair in the kitchen but you have a desk in your room.*Not ideal to work from home. It is a small apartment.Amazing stores, restaurants and cafes. 2 blocks away from Domino park and to the Williamsburg bridge.The spaceBest location in Williamsburg - Brooklyn, first stop on the L, J,Z.M 10 to 15 min walk.Close to Bedford av. The apartment is overall a welcoming space. Small and charming.No ideal if you are planing to work from home.You will get a private room, my room is at the oposite end. Be ready to go up 4 floors. A good way to keep your legs in shape ;)*No smoking and no pets.*1 month stay Max.During your stayFor. Amenities include: first aid kit, air conditioning, lockbox, lock on bedroom door, refrigera

In [333]:
data_df['listing_desc_summary'] = data_df.apply(construct_description_summary, axis=1)

In [336]:
desc_sum = data_df['listing_desc_summary'][:10].tolist()

In [16]:
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en', cache_dir=pc.HUGGING_FACE_CACHE_DIR)
model = AutoModel.from_pretrained('BAAI/bge-large-en', cache_dir=pc.HUGGING_FACE_CACHE_DIR).to('cuda:0')

In [337]:
encoded_input = tokenizer(desc_sum, padding=True, truncation=True, return_tensors='pt').to('cuda:0')

In [342]:
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    # Perform pooling. In this case, cls pooling.
    sentence_embeddings = model_output[0][:, 0]
# normalize embeddings
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

In [343]:
sentence_embeddings

tensor([[-0.0143,  0.0252, -0.0238,  ..., -0.0229, -0.0350,  0.0010],
        [-0.0103,  0.0041, -0.0284,  ..., -0.0239, -0.0185, -0.0005],
        [ 0.0092,  0.0124, -0.0172,  ..., -0.0192, -0.0307, -0.0084],
        ...,
        [-0.0148,  0.0192, -0.0074,  ..., -0.0196, -0.0092, -0.0138],
        [ 0.0047,  0.0160, -0.0228,  ..., -0.0194, -0.0316, -0.0067],
        [-0.0035, -0.0108, -0.0283,  ..., -0.0174, -0.0123, -0.0269]],
       device='cuda:0')

In [344]:
scores = sentence_embeddings @ sentence_embeddings.T

In [345]:
scores

tensor([[1.0000, 0.8784, 0.8867, 0.9048, 0.8997, 0.9015, 0.9033, 0.8862, 0.8779,
         0.8925],
        [0.8784, 1.0000, 0.8880, 0.9060, 0.8945, 0.9106, 0.8963, 0.8675, 0.8489,
         0.9017],
        [0.8867, 0.8880, 1.0000, 0.8905, 0.8909, 0.9096, 0.8959, 0.8790, 0.8856,
         0.8894],
        [0.9048, 0.9060, 0.8905, 1.0000, 0.9337, 0.9448, 0.9342, 0.8874, 0.8587,
         0.9398],
        [0.8997, 0.8945, 0.8909, 0.9337, 1.0000, 0.9499, 0.9383, 0.8854, 0.8658,
         0.9200],
        [0.9015, 0.9106, 0.9096, 0.9448, 0.9499, 1.0000, 0.9437, 0.9087, 0.8771,
         0.9345],
        [0.9033, 0.8963, 0.8959, 0.9342, 0.9383, 0.9437, 1.0000, 0.8974, 0.8556,
         0.9253],
        [0.8862, 0.8675, 0.8790, 0.8874, 0.8854, 0.9087, 0.8974, 1.0000, 0.8529,
         0.8762],
        [0.8779, 0.8489, 0.8856, 0.8587, 0.8658, 0.8771, 0.8556, 0.8529, 1.0000,
         0.8630],
        [0.8925, 0.9017, 0.8894, 0.9398, 0.9200, 0.9345, 0.9253, 0.8762, 0.8630,
         1.0000]], device='c

In [None]:
'id',
'listing_url',
'room_type',
'neighbourhood_group_cleansed',
'neighbourhood_cleansed',
'bathrooms_text',
'bedrooms',
'beds',
'accommodates',
'price',
'review_scores_rating',
'number_of_reviews',
'latitude',
'longitude',
'property_type',
'description',
'amenities',
'neighborhood_overview',
'host_about'

In [None]:
# Exlude same neighborhood
# bedrooms not the same
# accomodates 
# price 

In [362]:
np.log10(60) - np.log10(110)

-0.2632414347745813

In [356]:
900 + 10**1.2

915.8489319246112

In [358]:
np.log10(300)

2.4771212547196626