In [1]:
import os, sys
from os import path
sys.path.append(os.path.abspath(path.join(os.getcwd(), os.pardir, 'src')))

import project_config as pc

from data import utils as data_utils
from models.listing_embedding import ListingEmbedder

import re
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the listings table. Later cells read its `price`, `bedrooms` and
# `neighbourhood_cleansed` columns.
df = data_utils.load_nyc_listings()

In [3]:
# NOTE(review): the device is hard-coded to the first CUDA GPU; presumably
# this fails on a machine without one — confirm how ListingEmbedder handles it.
listing_embedder = ListingEmbedder(device='cuda:0')

In [4]:
# Embed the listings in `df`. The recorded run of this cell took about 7 minutes.
# NOTE(review): 400 looks like a batch size (the 109 progress steps recorded
# below x 400 cover the 43,566 rows) — confirm against
# ListingEmbedder.from_dataframe.
embeddings = listing_embedder.from_dataframe(df, 400)

100%|█████████████████████████████████████████| 109/109 [06:58<00:00,  3.84s/it]


In [5]:
embeddings.shape

(43566, 1024)

In [6]:
# Subset of listings whose `bedrooms` value equals 0.
# NOTE(review): `a` is not read by any other cell visible here — looks like
# leftover exploration; confirm before relying on it.
a = df[df.bedrooms.eq(0)]

In [10]:
df.iloc[512]

id                                                                         433914
listing_url                                   https://www.airbnb.com/rooms/433914
room_type                                                         Entire home/apt
neighbourhood_group_cleansed                                             Brooklyn
neighbourhood_cleansed                                              East New York
bathrooms_text                                                             1 bath
bedrooms                                                                        1
beds                                                                            4
accommodates                                                                    4
price                                                                       105.0
latitude                                                                40.673219
longitude                                                              -73.881086
property_type   

In [11]:
# Keep a 500-row sample under its own name instead of overwriting `df`.
# `embeddings` and `scores` are built from the full table, and
# `apply_heuristic_filters(scores, df)` needs `df` to have exactly one row per
# row of `scores`; truncating `df` itself breaks that on a top-to-bottom run.
df_head = df.iloc[:500]

In [12]:
embeddings

array([[-0.01053734,  0.02992929, -0.02854822, ..., -0.02058447,
        -0.02488858, -0.00536096],
       [-0.01013899,  0.0089483 , -0.02855307, ..., -0.02224569,
        -0.01729184, -0.01087915],
       [ 0.00449256,  0.01541421, -0.0204808 , ..., -0.01774506,
        -0.02749335, -0.01489215],
       ...,
       [ 0.01111768,  0.01973216, -0.02297438, ..., -0.01283255,
        -0.03752194, -0.00829629],
       [-0.00870882,  0.01095518, -0.02508694, ..., -0.02465589,
        -0.02248235, -0.0080701 ],
       [-0.00099459,  0.01030623, -0.0142141 , ..., -0.02049175,
        -0.01991566, -0.01334427]], dtype=float32)

In [6]:
# Dot product of every embedding row with every other embedding row.
# With the (43566, 1024) float32 embeddings shown above, the result is a
# 43566 x 43566 float32 matrix, i.e. roughly 7.6 GB of memory.
# NOTE(review): these are cosine similarities only if each embedding row is
# unit-norm — confirm against ListingEmbedder.
scores = embeddings @ embeddings.T

In [7]:
np.mean(scores)

0.95169115

In [55]:
df.iloc[2]

id                                                                          14991
listing_url                                    https://www.airbnb.com/rooms/14991
room_type                                                            Private room
neighbourhood_group_cleansed                                            Manhattan
neighbourhood_cleansed                                            Lower East Side
bathrooms_text                                                             1 bath
bedrooms                                                                        0
beds                                                                            1
accommodates                                                                    1
price                                                                       120.0
latitude                                                                 40.72207
longitude                                                               -73.98976
property_type   

In [56]:
df.iloc[365]

id                                                                         353317
listing_url                                   https://www.airbnb.com/rooms/353317
room_type                                                            Private room
neighbourhood_group_cleansed                                            Manhattan
neighbourhood_cleansed                                                  Chinatown
bathrooms_text                                                     1 private bath
bedrooms                                                                        0
beds                                                                            1
accommodates                                                                    2
price                                                                       140.0
latitude                                                                 40.71481
longitude                                                               -73.99059
property_type   

In [14]:
def filter_by_price(df, max_log10_diff=0.3):
    """Build a pairwise mask of listings whose prices are of similar magnitude.

    Two listings are considered similar when the absolute difference of the
    base-10 logarithms of their prices is at most ``max_log10_diff``, i.e.
    when the prices are within a factor of ``10 ** max_log10_diff`` of each
    other (about 2x for the default 0.3).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a numeric ``price`` column. It is not modified.
    max_log10_diff : float, default 0.3
        Largest allowed absolute difference between the log10 prices.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``(len(df), len(df))``; entry ``[i, j]`` is
        True when listings ``i`` and ``j`` pass the price test.
    """
    prices = df.price.to_numpy()
    # A price of 0 is treated as 1 so that log10 is defined. np.where builds a
    # new array: `to_numpy()` may hand back a view of the column, and assigning
    # into it would silently rewrite the caller's DataFrame (or raise when the
    # view is read-only).
    safe_prices = np.where(prices == 0, 1, prices)
    log_prices = np.log10(safe_prices)
    # Broadcasting a column against a row yields every pairwise difference.
    return np.abs(log_prices[:, None] - log_prices) <= max_log10_diff

In [58]:
# filter_by_price(df)

array([[ True, False, False, ..., False,  True, False],
       [False,  True, False, ..., False, False,  True],
       [False, False,  True, ...,  True, False,  True],
       ...,
       [False, False,  True, ...,  True, False,  True],
       [ True, False, False, ..., False,  True, False],
       [False,  True,  True, ...,  True, False,  True]])

In [15]:
def filter_by_neighborhood(df):
    """Build a pairwise mask that is True where two listings differ in neighbourhood.

    Reads the ``neighbourhood_cleansed`` column of ``df`` and compares every
    listing with every other one. Entry ``[i, j]`` of the returned
    ``(len(df), len(df))`` boolean array is True when listings ``i`` and ``j``
    have different values, so pairs from the same neighbourhood (including a
    listing paired with itself) come out False.
    """
    labels = df.neighbourhood_cleansed.to_numpy()
    # Column vector vs. the flat array broadcasts to the full pairwise grid.
    as_column = labels.reshape(-1, 1)
    return as_column != labels

In [60]:
# filter_by_neighborhood(df)

array([[False,  True,  True, ...,  True,  True,  True],
       [ True, False,  True, ...,  True,  True,  True],
       [ True,  True, False, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ..., False,  True,  True],
       [ True,  True,  True, ...,  True, False,  True],
       [ True,  True,  True, ...,  True,  True, False]])

In [16]:
def apply_heuristic_filters(scores, df, min_score=0.9):
    """Zero out pairwise scores that fail the price, neighbourhood or score cut-offs.

    A pair keeps its score only if it passes ``filter_by_price``, passes
    ``filter_by_neighborhood`` and the score is at least ``min_score``;
    every other entry is set to 0.

    Parameters
    ----------
    scores : numpy.ndarray
        Square ``(len(df), len(df))`` matrix of pairwise scores.
        It is modified IN PLACE (and also returned), which avoids copying
        what can be a very large matrix.
    df : pandas.DataFrame
        Listings table with one row per row/column of ``scores``, in the
        same order.
    min_score : float, default 0.9
        Scores below this value are set to 0.

    Returns
    -------
    numpy.ndarray
        The same ``scores`` object, after filtering.

    Raises
    ------
    ValueError
        If ``scores`` does not have one row and one column per row of ``df``.
    """
    n_listings = len(df)
    if scores.shape != (n_listings, n_listings):
        raise ValueError(
            f"scores has shape {scores.shape} but df has {n_listings} rows; "
            f"expected ({n_listings}, {n_listings})"
        )
    # Combine the boolean masks first so the float matrix is multiplied once.
    keep = filter_by_price(df) & filter_by_neighborhood(df)
    scores *= keep
    scores[scores < min_score] = 0
    return scores

In [17]:
# apply_heuristic_filters modifies `scores` in place and returns it, so the
# unfiltered matrix no longer exists after this cell; re-run the
# `embeddings @ embeddings.T` cell to get it back.
scores = apply_heuristic_filters(scores, df)

In [13]:
scores

array([[0.       , 0.       , 0.       , ..., 0.9432435, 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.9575317,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.9432435, 0.       , 0.       , ..., 0.       , 0.       ,
        0.9580718],
       [0.       , 0.9575317, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.9580718, 0.       ,
        0.       ]], dtype=float32)

In [19]:
# For each listing, the column indices of its 10 highest-scoring listings.
# argsort sorts ascending, so within a row the indices go from lowest to
# highest score (the best match is in the LAST column).
# Rows with fewer than 10 non-zero scores are filled out with indices of
# zero-score (filtered-out) listings.
ranking = np.argsort(scores, axis=1)[:, -10:]

In [20]:
ranking.shape

(43566, 10)