Initial Exploraiton
===================

Quick and dirty exploration of the data and potential modeling approaches.

The notebook should not be used as a reference.

In [34]:
import os, sys
from os import path
sys.path.append(os.path.abspath(path.join(os.getcwd(), os.pardir, 'src')))

import project_config as pc

import re
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel

In [3]:
nyc_listings_file_url = 'http://data.insideairbnb.com/united-states/ny/new-york-city/2023-06-05/data/listings.csv.gz'

In [4]:
data_df = pd.read_csv(nyc_listings_file_url)

In [5]:
listing_df_cols = [
    'id',
    'listing_url',
    'room_type',
    'neighbourhood_group_cleansed',
    'neighbourhood_cleansed',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'accommodates',
    'price',
    'review_scores_rating',
    'number_of_reviews',
    'latitude',
    'longitude',
    'property_type',
    'description',
    'amenities',
    'neighborhood_overview',
    'host_about'
]

In [6]:
data_df = data_df[listing_df_cols]

In [10]:
data_df.neighborhood_overview.isna().sum()

18365

In [12]:
def construct_listing_summary(row):
    summary = []
    type_and_loc = f'{row.room_type} in {row.neighbourhood_group_cleansed}'
    summary.append(type_and_loc)
    
    # bed and bath info
    bed_bath = ''
    if not pd.isna(row.bedrooms):
        bed_bath = f'{int(row.bedrooms)} bedroom' + 's' if int(row.bedrooms) > 1 else ''
    if not pd.isna(row.beds):
        bed_bath += ' with ' if bed_bath != '' else ''
        bed_bath += f'{int(row.beds)} bed' + 's' if int(row.beds) > 1 else ''
    if not pd.isna(row.bathrooms_text):
        bed_bath += ' and ' if bed_bath != '' else ''
        bed_bath += row.bathrooms_text
    summary.append(bed_bath)
    
    max_occupancy = ''
    if row.accommodates > 0:
        max_occupancy = f'Accommodates up to {row.accommodates} people' if row.accommodates > 1 else 'Accommodates 1 person'
    summary.append(max_occupancy)
    
    cost = f'Costs {row.price}'
    summary.append(cost)
    
    summary = [s for s in summary if s]
    summary = '. '.join(summary)
    return summary

In [13]:
data_df['listing_summary'] = data_df.apply(construct_listing_summary, axis=1)

In [14]:
listing_df_cols = [
    'id',
    'listing_url',
    'room_type',
    'neighbourhood_group_cleansed',
    'neighbourhood_cleansed',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'accommodates',
    'price',
    'review_scores_rating',
    'number_of_reviews',
    'latitude',
    'longitude',
    'property_type',
    'description',
    'amenities',
    'neighborhood_overview',
    'host_about'
]

In [15]:
data_df.amenities[0]

'["Dedicated workspace", "Baking sheet", "Extra pillows and blankets", "Cleaning available during stay", "Essentials", "Hair dryer", "Free street parking", "TV", "Air conditioning", "Long term stays allowed", "Refrigerator", "Keypad", "Heating", "Ethernet connection", "Paid parking off premises", "Stove", "Bed linens", "Bathtub", "Iron", "Dishes and silverware", "Self check-in", "Smoke alarm", "Fire extinguisher", "Wifi", "Cooking basics", "Hangers", "Kitchen", "Oven", "Carbon monoxide alarm", "Luggage dropoff allowed", "Coffee maker", "Hot water"]'

In [33]:
type(data_df.iloc[0: 15])

pandas.core.frame.DataFrame

In [16]:
data_df.description

0        Beautiful, spacious skylit studio in the heart...
1        One room available for rent in a 2 bedroom apt...
2        Room for rent in my Manhattan apartment. The a...
3        We welcome you to stay in our lovely 2 br dupl...
4        Greetings! <br /><br />Come relax here after y...
                               ...                        
43561    Cozy rooms in a 4BR1BA Sunnyside townhouse.<br...
43562    Looking for great accommodations for your New ...
43563    This 4BR place is situated in a prime Queens l...
43564    Take it easy at this unique and tranquil getaway.
43565    Welcome to my oversized studio in the heart of...
Name: description, Length: 43566, dtype: object

In [17]:
def construct_description_summary(row):
    summary = []
    
    def clean_html(txt):
        return re.sub('<.*?>', '', txt)
    
    property_type = f'This is a{"n" if row.property_type[0].lower() in ["a", "e", "i", "o", "u"] else ""} '
    property_type += row.property_type.lower()
    summary.append(property_type)
    
    if not pd.isna(row.description):
        summary.append(clean_html(row.description))
    
    if not pd.isna(row.amenities):
        amenities = 'Amenities include: ' + re.sub('"', '', row.amenities[1:-1]).lower()
        summary.append(amenities)
    
    if not pd.isna(row.neighborhood_overview):
        neigh_overview = 'A little about the neighborhood. ' + clean_html(row.neighborhood_overview)
        summary.append(neigh_overview)
    
    if not pd.isna(row.host_about):
        host_info = 'Host information: ' + clean_html(row.host_about)
        summary.append(host_info)
    
    summary = [s for s in summary if s]
    summary = '. '.join(summary)
    
    # remove extra spaces
    summary = summary.strip()
    summary = re.sub('\s+', ' ', summary)
        
    return summary

In [18]:
construct_description_summary(data_df.iloc[9])

"This is a private room in rental unit. A home away from home. Its a very cute and charming.full size bed (sheets and towels included). VERY CLEAN You are welcome to use the kitchen to prepare light meals, coffee, tea. That said at the moment I don't have a dinning table. Just one chair in the kitchen but you have a desk in your room.*Not ideal to work from home. It is a small apartment.Amazing stores, restaurants and cafes. 2 blocks away from Domino park and to the Williamsburg bridge.The spaceBest location in Williamsburg - Brooklyn, first stop on the L, J,Z.M 10 to 15 min walk.Close to Bedford av. The apartment is overall a welcoming space. Small and charming.No ideal if you are planing to work from home.You will get a private room, my room is at the oposite end. Be ready to go up 4 floors. A good way to keep your legs in shape ;)*No smoking and no pets.*1 month stay Max.During your stayFor. Amenities include: first aid kit, air conditioning, lockbox, lock on bedroom door, refrigera

In [19]:
data_df['listing_desc_summary'] = data_df.apply(construct_description_summary, axis=1)

In [20]:
desc_sum = data_df['listing_summary'][220:230].tolist()

In [21]:
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en', cache_dir=pc.HUGGING_FACE_CACHE_DIR)
model = AutoModel.from_pretrained('BAAI/bge-large-en', cache_dir=pc.HUGGING_FACE_CACHE_DIR).to('cuda:0')

In [22]:
desc_sum.append('')

In [23]:
encoded_input = tokenizer(desc_sum, padding=True, truncation=True, return_tensors='pt').to('cuda:0')

In [24]:
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    # Perform pooling. In this case, cls pooling.
    sentence_embeddings = model_output[0][:, 0]
# normalize embeddings
# sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

In [25]:
def get_cos_similarity(embeds):
    embeds_norm = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    return embeds_norm @ embeds_norm.T

In [26]:
sim = get_cos_similarity(sentence_embeddings)
sim

tensor([[1.0000, 0.9788, 0.9539, 0.9410, 0.9062, 0.8983, 0.9491, 0.9514, 0.9509,
         0.9879, 0.7971],
        [0.9788, 1.0000, 0.9415, 0.9426, 0.9112, 0.9117, 0.9373, 0.9493, 0.9486,
         0.9780, 0.7962],
        [0.9539, 0.9415, 1.0000, 0.9765, 0.9367, 0.9289, 0.9843, 0.9847, 0.9540,
         0.9510, 0.8021],
        [0.9410, 0.9426, 0.9765, 1.0000, 0.9335, 0.9347, 0.9645, 0.9735, 0.9464,
         0.9421, 0.8029],
        [0.9062, 0.9112, 0.9367, 0.9335, 1.0000, 0.9866, 0.9300, 0.9361, 0.9069,
         0.9008, 0.7929],
        [0.8983, 0.9117, 0.9289, 0.9347, 0.9866, 1.0000, 0.9240, 0.9306, 0.9019,
         0.8994, 0.7824],
        [0.9491, 0.9373, 0.9843, 0.9645, 0.9300, 0.9240, 1.0000, 0.9857, 0.9472,
         0.9517, 0.7996],
        [0.9514, 0.9493, 0.9847, 0.9735, 0.9361, 0.9306, 0.9857, 1.0000, 0.9510,
         0.9550, 0.8032],
        [0.9509, 0.9486, 0.9540, 0.9464, 0.9069, 0.9019, 0.9472, 0.9510, 1.0000,
         0.9504, 0.7970],
        [0.9879, 0.9780, 0.9510, 0.94

In [140]:
def get_cos_similarity_mean_subtracted(embeds):
    avg_emb = torch.mean(embeds, axis=0)
    embeds_norm = torch.nn.functional.normalize(embeds - avg_emb, p=2, dim=1)
    return embeds_norm @ embeds_norm.T

In [141]:
sim_sub = get_cos_similarity_mean_subtracted(sentence_embeddings)
sim_sub

tensor([[ 1.0000,  0.5538, -0.2448, -0.3797, -0.4956, -0.5499, -0.2503, -0.3408,
          0.0587,  0.7413],
        [ 0.5538,  1.0000, -0.5274, -0.2901, -0.3534, -0.2833, -0.4793, -0.3418,
          0.0542,  0.5389],
        [-0.2448, -0.5274,  1.0000,  0.3013, -0.1958, -0.2817,  0.5043,  0.4387,
         -0.0745, -0.3194],
        [-0.3797, -0.2901,  0.3013,  1.0000, -0.1123, -0.0335,  0.0464,  0.1973,
         -0.1011, -0.3514],
        [-0.4956, -0.3534, -0.1958, -0.1123,  1.0000,  0.8451, -0.2188, -0.2198,
         -0.3315, -0.5805],
        [-0.5499, -0.2833, -0.2817, -0.0335,  0.8451,  1.0000, -0.2615, -0.2599,
         -0.3421, -0.5314],
        [-0.2503, -0.4793,  0.5043,  0.0464, -0.2188, -0.2615,  1.0000,  0.5422,
         -0.1321, -0.1835],
        [-0.3408, -0.3418,  0.4387,  0.1973, -0.2198, -0.2599,  0.5422,  1.0000,
         -0.1659, -0.2328],
        [ 0.0587,  0.0542, -0.0745, -0.1011, -0.3315, -0.3421, -0.1321, -0.1659,
          1.0000,  0.0515],
        [ 0.7413,  

In [142]:
desc_sum

['Entire home/apt in Brooklyn. 2 bedrooms with 2 beds and 1 bath. Accommodates up to 2 people. Costs $185.00',
 'Entire home/apt in Brooklyn. 1 bath. Accommodates up to 2 people. Costs $97.00',
 'Entire home/apt in Manhattan. 2 bedrooms with 2 beds and 1.5 baths. Accommodates up to 4 people. Costs $550.00',
 'Entire home/apt in Manhattan. 4 beds and 1 bath. Accommodates up to 5 people. Costs $160.00',
 'Private room in Manhattan. 1 private bath. Accommodates up to 2 people. Costs $189.00',
 'Private room in Manhattan. 1 private bath. Accommodates 1 person. Costs $175.00',
 'Entire home/apt in Manhattan. 2 bedrooms with 3 beds and 1 bath. Accommodates up to 6 people. Costs $542.00',
 'Entire home/apt in Manhattan. 2 bedrooms with  and 1 bath. Accommodates up to 3 people. Costs $110.00',
 'Entire home/apt in Bronx. 2 beds and 1 bath. Accommodates up to 5 people. Costs $114.00',
 'Entire home/apt in Brooklyn. 2 beds and 1 bath. Accommodates up to 3 people. Costs $225.00']

In [143]:
def is_order_same(sim_a, sim_b):
    print(torch.argsort(sim_a, axis=1))
    print(torch.argsort(sim_b, axis=1))

In [144]:
is_order_same(sim, sim_sub)

tensor([[5, 4, 3, 6, 8, 7, 2, 1, 9, 0],
        [4, 5, 6, 2, 3, 8, 7, 9, 0, 1],
        [5, 4, 1, 9, 0, 8, 3, 6, 7, 2],
        [4, 5, 0, 9, 1, 8, 6, 7, 2, 3],
        [9, 0, 8, 1, 6, 3, 7, 2, 5, 4],
        [0, 9, 8, 1, 6, 2, 7, 3, 4, 5],
        [5, 4, 1, 8, 0, 9, 3, 2, 7, 6],
        [5, 4, 1, 8, 0, 9, 3, 2, 6, 7],
        [5, 4, 3, 6, 1, 9, 0, 7, 2, 8],
        [5, 4, 3, 8, 2, 6, 7, 1, 0, 9]], device='cuda:0')
tensor([[5, 4, 3, 7, 6, 2, 8, 1, 9, 0],
        [2, 6, 4, 7, 3, 5, 8, 9, 0, 1],
        [1, 9, 5, 0, 4, 8, 3, 7, 6, 2],
        [0, 9, 1, 4, 8, 5, 6, 7, 2, 3],
        [9, 0, 1, 8, 7, 6, 2, 3, 5, 4],
        [0, 9, 8, 1, 2, 6, 7, 3, 4, 5],
        [1, 5, 0, 4, 9, 8, 3, 2, 7, 6],
        [1, 0, 5, 9, 4, 8, 3, 2, 6, 7],
        [5, 4, 7, 6, 3, 2, 9, 1, 0, 8],
        [4, 5, 3, 2, 7, 6, 8, 1, 0, 9]], device='cuda:0')


In [42]:
scores = norm_sentence_embeddings @ norm_sentence_embeddings.T

In [43]:
scores

tensor([[1.0000, 0.8784, 0.8867, 0.9048, 0.8997, 0.9015, 0.9033, 0.8862, 0.8779,
         0.8925],
        [0.8784, 1.0000, 0.8880, 0.9060, 0.8945, 0.9106, 0.8963, 0.8675, 0.8489,
         0.9017],
        [0.8867, 0.8880, 1.0000, 0.8905, 0.8909, 0.9096, 0.8959, 0.8790, 0.8856,
         0.8894],
        [0.9048, 0.9060, 0.8905, 1.0000, 0.9337, 0.9448, 0.9342, 0.8874, 0.8587,
         0.9398],
        [0.8997, 0.8945, 0.8909, 0.9337, 1.0000, 0.9499, 0.9383, 0.8854, 0.8658,
         0.9200],
        [0.9015, 0.9106, 0.9096, 0.9448, 0.9499, 1.0000, 0.9437, 0.9087, 0.8771,
         0.9345],
        [0.9033, 0.8963, 0.8959, 0.9342, 0.9383, 0.9437, 1.0000, 0.8974, 0.8556,
         0.9253],
        [0.8862, 0.8675, 0.8790, 0.8874, 0.8854, 0.9087, 0.8974, 1.0000, 0.8529,
         0.8762],
        [0.8779, 0.8489, 0.8856, 0.8587, 0.8658, 0.8771, 0.8556, 0.8529, 1.0000,
         0.8630],
        [0.8925, 0.9017, 0.8894, 0.9398, 0.9200, 0.9345, 0.9253, 0.8762, 0.8630,
         1.0000]], device='c

In [44]:
avg_emb = torch.mean(sentence_embeddings, axis=0)

In [45]:
avg_emb

tensor([-0.0963,  0.2045, -0.4324,  ..., -0.3783, -0.3772, -0.2139],
       device='cuda:0')

In [46]:
sentence_embeddings_sub = torch.nn.functional.normalize(sentence_embeddings_sub - avg_emb, p=2, dim=1)

In [47]:
scores_sub = sentence_embeddings_sub @ sentence_embeddings_sub.T

In [48]:
scores_sub

tensor([[1.0000, 0.9997, 0.9997, 0.9997, 0.9997, 0.9997, 0.9997, 0.9998, 0.9997,
         0.9997],
        [0.9997, 1.0000, 0.9997, 0.9997, 0.9997, 0.9998, 0.9997, 0.9997, 0.9997,
         0.9997],
        [0.9997, 0.9997, 1.0000, 0.9997, 0.9997, 0.9998, 0.9997, 0.9997, 0.9998,
         0.9997],
        [0.9997, 0.9997, 0.9997, 1.0000, 0.9998, 0.9998, 0.9998, 0.9997, 0.9997,
         0.9998],
        [0.9997, 0.9997, 0.9997, 0.9998, 1.0000, 0.9999, 0.9998, 0.9997, 0.9997,
         0.9997],
        [0.9997, 0.9998, 0.9998, 0.9998, 0.9999, 1.0000, 0.9998, 0.9998, 0.9997,
         0.9998],
        [0.9997, 0.9997, 0.9997, 0.9998, 0.9998, 0.9998, 1.0000, 0.9997, 0.9996,
         0.9997],
        [0.9998, 0.9997, 0.9997, 0.9997, 0.9997, 0.9998, 0.9997, 1.0000, 0.9997,
         0.9997],
        [0.9997, 0.9997, 0.9998, 0.9997, 0.9997, 0.9997, 0.9996, 0.9997, 1.0000,
         0.9997],
        [0.9997, 0.9997, 0.9997, 0.9998, 0.9997, 0.9998, 0.9997, 0.9997, 0.9997,
         1.0000]], device='c

In [None]:
'id',
'listing_url',
'room_type',
'neighbourhood_group_cleansed',
'neighbourhood_cleansed',
'bathrooms_text',
'bedrooms',
'beds',
'accommodates',
'price',
'review_scores_rating',
'number_of_reviews',
'latitude',
'longitude',
'property_type',
'description',
'amenities',
'neighborhood_overview',
'host_about'

In [None]:
# Exlude same neighborhood
# bedrooms not the same
# accomodates 
# price 

In [362]:
np.log10(60) - np.log10(110)

-0.2632414347745813

In [356]:
900 + 10**1.2

915.8489319246112

In [358]:
np.log10(300)

2.4771212547196626

In [65]:
def cos_sim(a, b):
    return (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [66]:
cos_sim(a, b)

0.9899494936611667

In [68]:
cos_sim(np.array([6, 6]), np.array([6, 8]))

0.9899494936611667

In [60]:
a = np.array([3, 3])
b = np.array([3, 4])

In [63]:
np.linalg.norm(a) . 

4.242640687119285

In [30]:
np.ones((5, 1))

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])