In [2]:
# Mount Google Drive and register the project folders on the import path
import sys
from google.colab import drive

MOUNT_PATH = '/content/drive/'
PROJECT_PATH = 'MyDrive/UCY/NLP/AirBnB_project'
# MOUNT_PATH already ends with a slash, so plain concatenation yields a valid path
EFFECTIVE_PROJECT_PATH = MOUNT_PATH + PROJECT_PATH
DATASET_PATH = EFFECTIVE_PROJECT_PATH + '/Dataset'

drive.mount(MOUNT_PATH)

# Both the project root and its Dataset folder become importable locations
sys.path.extend([EFFECTIVE_PROJECT_PATH, DATASET_PATH])

Mounted at /content/drive/


In [3]:
# Install transformers and sentence-transformers (the latter is imported by the embedding cell further down)
!pip install transformers
!pip install sentence-transformers
# NOTE(review): presumably meant to hide the installer log — confirm it has the intended effect in Colab
%clear

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, http

In [4]:
# Load the TensorBoard IPython extension
%load_ext tensorboard

In [5]:
# Build the Dataloader over the listings and comments folders of the dataset
# (presumably `dataloader` is the project module made importable by the sys.path additions above)
from dataloader import Dataloader
data = Dataloader(
    listing_path = f'{DATASET_PATH}/listings',
    comments_path = f'{DATASET_PATH}/comments',
)
# Approach 1: only listing data
# Approach 2: only reviews (advanced analysis)
# Approach 3: combined

# Develop a clustering or present a vocabulary to express the different dimensions (cleanliness, communication, check-in, value)
# Use buckets of 0-1, 1-2, 2-3, 3-4, 4-5
# One model for each dimension

# Test more cities

In [42]:
# Work on a random subset of at most 1000 listings.
# The seed is fixed so that every run (and every downstream cell) sees the same subset,
# and the sample size is capped so that a dataset with fewer than 1000 rows does not raise.
listings = data.getListings()
listings = listings.sample(min(1000, len(listings)), random_state=0)
# Preview the raw amenities column of a few listings (seeded as well, so the preview is stable)
listings.sample(min(3, len(listings)), random_state=0)['amenities']

2043    ["Books and reading material", "Smoke alarm", ...
2198    ["Smoke alarm", "Fire extinguisher", "Waterfro...
591     ["Books and reading material", "Smoke alarm", ...
Name: amenities, dtype: object

In [7]:
# Keep every column whose name contains "scores" (the review-score columns) and display them
rating_cols = list(filter(lambda column_name: "scores" in column_name, listings.columns))
listings[rating_cols]

Unnamed: 0,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
4927,,,,,,,
2689,4.83,4.74,4.57,4.87,4.83,5.00,4.65
2345,4.89,4.94,4.81,4.97,4.97,4.94,4.64
5200,5.00,5.00,5.00,5.00,5.00,5.00,5.00
6017,4.67,5.00,4.33,5.00,5.00,5.00,5.00
...,...,...,...,...,...,...,...
2188,4.67,4.81,4.77,4.79,4.92,4.42,4.58
1087,4.78,4.67,4.77,4.78,4.82,4.88,4.66
3041,5.00,5.00,5.00,5.00,5.00,5.00,5.00
148,4.73,4.86,4.92,4.96,4.99,4.86,4.65


# Sentence Transformers Approach

In [51]:
%%time
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import json

# Sentence-embedding model; picked because it is fast
model = SentenceTransformer('all-MiniLM-L12-v2')
%clear

scores = listings[rating_cols]  # NOTE(review): not used again inside this cell
encodings = {}  # listing index -> embedding of its space-joined amenities
strings = {}  # NOTE(review): never filled in this cell, so it stays empty
for idx, listing in tqdm(listings.iterrows()):
  # the amenities column is parsed as JSON: one list of amenity strings per listing
  amenities = json.loads(listing['amenities'])
  # all amenities of the listing are embedded together as a single sentence
  a_string = " ".join(amenities)
  encoding = model.encode(a_string)
  encodings[idx] = encoding

# Extract the amenities from the data frame and explode them into a list for each element

%clear

[H[2J

1000it [02:18,  7.24it/s]

[H[2J




CPU times: user 2min 9s, sys: 878 ms, total: 2min 10s
Wall time: 2min 19s


In [52]:
# Visualization
import plotly.graph_objs as go
def visualize_embedding_3D(components, ratings, title):
  """Show a 3D scatter plot of reduced embeddings, coloured by rating.

  Args:
    components: 2-D array-like indexed as ``components[:, k]``; columns 0, 1
      and 2 are used as the x, y and z coordinates.
    ratings: one colour value per point, mapped onto the Viridis colour scale.
    title: text used both as the figure title and as the colour-bar title.

  Returns:
    None. The figure is displayed with ``fig.show()``.
  """
  # appearance of the points, including the colour bar that explains the rating scale
  marker_style = dict(
      color=ratings,
      size=5,
      colorscale='Viridis',
      opacity=0.8,
      colorbar=dict(title=title),
  )
  points = go.Scatter3d(
      x=components[:,0],
      y=components[:,1],
      z=components[:,2],
      mode='markers',
      marker=marker_style,
  )

  # layout: no outer margins, generic axis names on the 3D scene
  no_margin = dict(l=0, r=0, b=0, t=0)
  axis_titles = dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z')
  figure_layout = go.Layout(
      margin=no_margin,
      scene=axis_titles,
      title=dict(text=title),
  )

  # build the figure and render it
  go.Figure(data=[points], layout=figure_layout).show()

def visualize_embedding_2D(components, ratings, title):
  """Show a 2D scatter plot of reduced embeddings, coloured by rating.

  Args:
    components: 2-D array-like indexed as ``components[:, k]``; columns 0 and 1
      are used as the x and y coordinates.
    ratings: one colour value per point, mapped onto the Viridis colour scale.
    title: text used both as the figure title and as the colour-bar title.

  Returns:
    None. The figure is displayed with ``fig.show()``.
  """
  trace = go.Scatter(
      x=components[:,0],
      y=components[:,1],
      mode='markers',
      marker=dict(
          color=ratings,
          size=5,
          colorscale='Viridis',
          opacity=0.8,
          colorbar=dict(title=title)
      )
  )

  # create the layout for the plot
  # The axis titles live on layout.xaxis / layout.yaxis: `scene` only configures
  # 3D traces, so setting the titles there left this 2D plot without axis titles.
  layout = go.Layout(
      margin=dict(
          l=0,
          r=0,
          b=0,
          t=0
      ),
      xaxis=dict(title=dict(text='X')),
      yaxis=dict(title=dict(text='Y')),
      title=dict(text=title)
  )

  # create the figure and plot it
  fig = go.Figure(data=[trace], layout=layout)
  fig.show()

In [53]:
# Embedding visualization, to see if we can detect something
import numpy as np
from sklearn.manifold import TSNE


tsne = TSNE(n_components=2, perplexity=30.0, random_state=0)
# np.vstack expects a real sequence (list/tuple); a dict view is rejected by recent NumPy
# releases, so the embeddings are materialised into a list first.
vectors = np.vstack(list(encodings.values()))
reduced_vectors = tsne.fit_transform(vectors)
print(reduced_vectors.shape)

# One plot per review-score column. The rows of reduced_vectors follow the insertion order
# of `encodings`, which was filled while iterating over `listings`, so listings[score]
# lines up with the points.
for score in rating_cols:
  visualize_embedding_2D(reduced_vectors, listings[score], score)


(1000, 2)


# Clustering

We want to try a clustering approach, to see whether the membership of a listing's amenities in clusters can be used as the effective feature given to the model.

In [54]:
# Pre-Processing Functions
import nltk
import itertools
import string
import re
from collections import Counter
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from nltk.stem import PorterStemmer
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import json

# Fetch the NLTK resources needed by the helpers of this cell:
# the stop-word lists, the 'punkt' tokenizer data and the WordNet corpora ('wordnet', 'omw-1.4')
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from tqdm import tqdm

# The amenities of a listing are a JSON list of small descriptions. Each description is
# un-contracted and tokenized on its own, giving one list of token lists per listing.
def tokenize_amenities(listings):
  """Tokenize the amenities of every listing.

  Args:
    listings: data frame whose 'amenities' column holds a JSON-encoded list of strings.

  Returns:
    A list with one entry per listing; each entry is a list of amenities and each
    amenity is the list of its word tokens.
  """
  def tokenize_listing(raw_amenities):
    # one token list per amenity description
    return [word_tokenize(uncontract(description)) for description in json.loads(raw_amenities)]

  return [tokenize_listing(row['amenities']) for _, row in tqdm(listings.iterrows())]


# Expand contracted English forms and drop double-quote characters
def uncontract(text):
  """Return `text` with common contractions expanded and every '"' replaced by a space.

  The rewrite rules are applied one after the other, in the order listed below.
  """
  rewrite_rules = (
    # regular families: "isn't" -> "is not", "we'll" -> "we will", ...
    (r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not"),
    (r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will"),
    (r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are"),
    (r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have"),
    # irregular forms that need their own replacement
    (r"(\b)([Cc]a)n't", r"\1\2n not"),
    (r"(\b)([Ii])'m", r"\1\2 am"),
    (r"(\b)([Ll]et)'s", r"\1\2 us"),
    (r"(\b)([Tt]here)'s", r"\1\2 is"),
    (r"(\b)([Ww])on't", r"\1\2ill not"),
    (r"(\b)([Ss])han't", r"\1\2hall not"),
    (r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all"),
  )
  for pattern, replacement in rewrite_rules:
    text = re.sub(pattern, replacement, text)

  # Removes " characters
  return text.replace('"', " ")

# Given the original (raw) tokens, return a reporting function that compares another token set against them
def gen_dimensionality_info(orignial_tokens, name):
  """Create a printer of token statistics relative to a baseline.

  Args:
    orignial_tokens: baseline as a list of amenities, each a list of tokens.
    name: label inserted in the printed headers.

  Returns:
    A function taking a listings -> amenities -> tokens structure. It prints the
    total and unique token counts next to the baseline ones, the two reduction
    ratios and the ten most common tokens. It returns None.
  """
  baseline = [token for amenity in orignial_tokens for token in amenity]
  baseline_unique = set(baseline)

  def dimensionality_info(token_list):
    # flatten listings -> amenities -> tokens into one flat token list
    current = [token for doc in token_list for amenity in doc for token in amenity]
    current_unique = set(current)

    print(f'Number of {name} Tokens:               ', len(current), '/', len(baseline))
    print(f'Unique {name} Tokens:                  ', len(current_unique), '/', len(baseline_unique))
    print()
    print('Reduction pecentage:            ', 1-(len(current) / len(baseline)))
    print('Reduction pecentage Unique:     ', 1-(len(current_unique) / len(baseline_unique)))
    print()
    print('Most Common of current filter:')
    for token, frequency in Counter(current).most_common(10):
      print('{0:25} {1}'.format(token, frequency))
    print('################################################################')
    print()

  return dimensionality_info

# Drop the tokens that consist of punctuation characters only
def remove_puctuation_only_token(tokens):
  """Return the tokens that contain at least one non-punctuation character."""
  punctuation = string.punctuation
  kept = []
  for token in tokens:
    # an empty token has no non-punctuation character either, so it is dropped too
    if any(char not in punctuation for char in token):
      kept.append(token)
  return kept

# Drop the purely numeric tokens
def remove_numbers(tokens):
  """Return the tokens for which ``str.isnumeric()`` is False."""
  return [token for token in tokens if not token.isnumeric()]

# Remove the English stop words from the tokens
def remove_stop_words(tokens):
  """Return the tokens that are not NLTK English stop words.

  The comparison is an exact, case-sensitive match against the stop-word list.
  A set lookup is used rather than one large regular expression: the words were
  interpolated into the pattern unescaped, and `$` also accepts a trailing newline.

  Args:
    tokens: list of token strings.

  Returns:
    A new list with the stop words removed, in the original order.
  """
  stop_words = frozenset(stopwords.words('english'))
  return [token for token in tokens if token not in stop_words]

# The 'wordnet' and 'omw-1.4' corpora used here are already downloaded at the top of
# this cell, so the duplicated nltk.download calls were removed.
def lemmatize(tokens):
  """Return the WordNet lemma of every token (default part of speech)."""
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(t) for t in tokens]

# Convert every token to lower case
def lowercase(tokens):
  """Return a new list holding the lower-cased version of each token."""
  lowered = []
  for token in tokens:
    lowered.append(token.lower())
  return lowered

# Drop the tokens that are exactly the typographic apostrophe ’
def remove_special_tokens(tokens):
  """Return the tokens that differ from the single character '’'."""
  return list(filter(lambda token: token != '’', tokens))

# Apply a list of processing functions one after the other
def apply_process_list(tokens, processing):
  """Run every function of `processing`, in order, over each element of `tokens`.

  Each step is applied to all elements before the next step starts.
  With an empty `processing` list the input object is returned unchanged.
  """
  processed = tokens
  for step in processing:
    processed = list(map(step, processed))
  return processed

# Ordered pipeline applied to the token list of every amenity
PRE_PROCESSING = [
  lowercase,
  remove_puctuation_only_token,
  remove_numbers,
  remove_stop_words,
  remove_special_tokens,
  lemmatize,
]
def pre_process(docs, remove_less_than=3):
  """Tokenize the amenities of `docs` and run the PRE_PROCESSING pipeline on them.

  Args:
    docs: listings data frame handed to tokenize_amenities.
    remove_less_than: accepted but not used by this function.

  Returns:
    One entry per listing, each a list of processed amenity token lists.
  """
  return [apply_process_list(listing_amenities, PRE_PROCESSING) for listing_amenities in tokenize_amenities(docs)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [57]:
# Tokenize the unprocessed amenities and flatten listings -> amenities once;
# this is the baseline the reporting function compares processed tokens against
tokenize_amenities_raw = list(itertools.chain.from_iterable(tokenize_amenities(listings)))
print_dim_info = gen_dimensionality_info(tokenize_amenities_raw, name="Amenities")

1000it [00:03, 283.51it/s]


In [56]:
# Inspect the raw tokenization of the first listing of the sample
test = list(tokenize_amenities(listings))
test[0]

1000it [00:03, 278.66it/s]


[['Smoke', 'alarm'],
 ['Lake', 'view'],
 ['Rice', 'maker'],
 ['Dedicated', 'workspace'],
 ['TV'],
 ['Oven'],
 ['Blender'],
 ['Bed', 'linens'],
 ['Wifi'],
 ['Microwave'],
 ['Washer'],
 ['Mini', 'fridge'],
 ['Essentials'],
 ['Ethernet', 'connection'],
 ['Central', 'heating'],
 ['Coffee', 'maker', ':', 'espresso', 'machine'],
 ['Cleaning', 'products'],
 ['Air', 'conditioning'],
 ['Carbon', 'monoxide', 'alarm'],
 ['Extra', 'pillows', 'and', 'blankets'],
 ['Cooking', 'basics'],
 ['Body', 'soap'],
 ['Shampoo'],
 ['Garden', 'view'],
 ['Hair', 'dryer'],
 ['AEG', 'refrigerator'],
 ['Wine', 'glasses'],
 ['Hangers'],
 ['Kitchen'],
 ['Dining', 'table'],
 ['Pets', 'allowed'],
 ['Long', 'term', 'stays', 'allowed'],
 ['Toaster'],
 ['Iron'],
 ['Coffee'],
 ['Freezer']]

In [58]:
# Run the whole pre-processing pipeline and inspect the first listing
amenities_tokens = pre_process(listings)
amenities_tokens[0]

1000it [00:04, 249.27it/s]


[['smoke', 'alarm'],
 ['lake', 'view'],
 ['rice', 'maker'],
 ['dedicated', 'workspace'],
 ['tv'],
 ['oven'],
 ['blender'],
 ['bed', 'linen'],
 ['wifi'],
 ['microwave'],
 ['washer'],
 ['mini', 'fridge'],
 ['essential'],
 ['ethernet', 'connection'],
 ['central', 'heating'],
 ['coffee', 'maker', 'espresso', 'machine'],
 ['cleaning', 'product'],
 ['air', 'conditioning'],
 ['carbon', 'monoxide', 'alarm'],
 ['extra', 'pillow', 'blanket'],
 ['cooking', 'basic'],
 ['body', 'soap'],
 ['shampoo'],
 ['garden', 'view'],
 ['hair', 'dryer'],
 ['aeg', 'refrigerator'],
 ['wine', 'glass'],
 ['hanger'],
 ['kitchen'],
 ['dining', 'table'],
 ['pet', 'allowed'],
 ['long', 'term', 'stay', 'allowed'],
 ['toaster'],
 ['iron'],
 ['coffee'],
 ['freezer']]

In [59]:
# The outcome is as expected, since the amenities are made of more or less a few specific words
print_dim_info(amenities_tokens)

Number of Amenities Tokens:                57726 / 63597
Unique Amenities Tokens:                   439 / 656

Reduction pecentage:             0.09231567526770135
Reduction pecentage Unique:      0.3307926829268293

Most Common of current filter:
alarm                     1466
dryer                     1151
allowed                   1131
hot                       1064
wifi                      1045
water                     1037
coffee                    968
private                   904
smoke                     898
heating                   896
################################################################



In [None]:
# # Remove tokens that appear less than x
# def remove_token_by_count(docs, lesser_eq_than):
#   tokens_linear = itertools.chain.from_iterable(list(itertools.chain.from_iterable(docs)))
#   print(list(tokens_linear))
#   token_freq = dict(Counter(tokens_linear))
#   tokens_appeared_leq = set([t for t, v in token_freq.items() if v <= lesser_eq_than])
#   filtered = []
#   for doc in docs:
#     filtered = [list(filter(lambda t: t not in tokens_appeared_leq, a)) for a in doc]
#   return filtered

# amenities_tokens_no_rares = remove_token_by_count(amenities_tokens, 1)

We will now try to see whether clusters of similar amenities are present. To do so, we flatten the structure from (listing -> list of amenity embeddings) to a single list of amenity embeddings and work on that.

In [98]:
vectorize_model = None
def unqies_amenities(amenities):
  """Return the distinct amenities, keeping the first occurrence of each.

  Two amenities are equal when they have the same token sequence. The tuple of
  tokens is the key, because joining tokens with an empty separator made different
  amenities collide (['a', 'bc'] and ['ab', 'c'] both became 'abc'). A set keeps
  each membership test constant-time.

  Args:
    amenities: iterable of amenities, each a list of token strings.

  Returns:
    A list with the first occurrence of every distinct amenity, in input order.
  """
  seen = set()
  res_list = []

  for item in amenities:
    key = tuple(item)
    if key not in seen:
      seen.add(key)
      res_list.append(item)

  return res_list

In [101]:
import gensim.downloader
# Apply word vectors to extract a vectorization of the amenities
def vectorize(docs, model_name='glove-wiki-gigaword', size=300, w2v_model=None):
  """Embed every document as the mean of the vectors of its known tokens.

  Args:
    docs: iterable of documents, each an iterable of tokens.
    model_name: base name of the gensim model to download when no model is given.
    size: dimensionality suffix of the downloaded model ('{model_name}-{size}').
    w2v_model: already loaded model supporting `token in model` and `model[token]`;
      when None the model is loaded with gensim.downloader.

  Returns:
    (documents_vector, w2v_model): one vector per document plus the model used,
    so that the caller can pass it back in and skip loading it again. A document
    without any known token gets an all-zero vector.
  """
  # identity check: a model object is never "equal" to None by accident this way
  if w2v_model is None:
    w2v_model = gensim.downloader.load(f'{model_name}-{size}')

  # The zero fallback must have the width of the model's vectors. A supplied model
  # may not match `size`, so its own vector_size wins when it exposes one.
  dimension = getattr(w2v_model, 'vector_size', size)

  documents_vector = []
  for d in docs:
    vectorized_amenity_tokens = [w2v_model[t] for t in d if t in w2v_model]
    if vectorized_amenity_tokens:
      documents_vector.append(np.mean(vectorized_amenity_tokens, axis=0))
    else:
      documents_vector.append(np.zeros(shape=(dimension,)))

  return documents_vector, w2v_model

# Flatten listings -> amenities into a single list of amenities (token lists)
amenities_flat = list(itertools.chain.from_iterable(amenities_tokens))
print("With copies: ", len(amenities_flat))
# keep a single copy of every amenity
amenities_flat = unqies_amenities(amenities_flat)
print("Uniques: ", len(amenities_flat))
# the returned model is stored in vectorize_model and handed back in here, so a model
# that is already loaded is not loaded again when this line is re-run
vectorized_amenities, vectorize_model = vectorize(amenities_flat, w2v_model=vectorize_model)

With copies:  29420
Uniques:  556


In [None]:
amenities_flat%%time
# Cluestering 
from scipy.cluster.hierarchy import ward, fcluster, linkage
Z = linkage(vectorized_amenities,'average', metric='cosine')

In [112]:
# Cut the hierarchy at a distance threshold of 0.50 to obtain one flat cluster label per amenity vector
clusters = fcluster(Z, 0.50, criterion='distance')
print('Number of Clusters:', len(set(clusters)), '/', len(vectorized_amenities))

Number of Clusters: 90 / 556


In [113]:
%%time
###############################
# Define a cluster dictionary #
###############################

labels = listings['review_scores_rating'].values
amenity_name = [' '.join(a) for a in amenities_flat]
similar_amenities = [vectorize_model.most_similar(positive=[vectorized_amenity], topn=1)[0] for vectorized_amenity in vectorized_amenities]

cluster_dict = {}
for i, c in enumerate(clusters):
  
  ##############################
  # Initialize a cluster array #
  ##############################

  if c not in cluster_dict: cluster_dict[c] = []

  #########################################
  # Populate the cluster with the texts,  #
  # their processed form, and their label #
  # i.e. gun-related or mideast-related.  #
  #########################################
  similar_amenity, confidence = similar_amenities[i]
  cluster_dict[c] += [{'text': strings[i], 'encoded': similar_amenity, 'confidence': confidence, 'encoding':vectorized_amenities[i]}]

CPU times: user 51.7 s, sys: 5.09 s, total: 56.7 s
Wall time: 35.3 s


In [114]:
from collections import Counter

# Report the ten clusters with the most entries, largest first
for c in sorted(cluster_dict, key=lambda cluster_id: len(cluster_dict[cluster_id]), reverse=True)[:10]:
  entries = cluster_dict[c]

  print(f'Cluster {c}')
  print()

  # how often each closest vocabulary word occurs inside the cluster
  for t in Counter(entry['encoded'] for entry in entries).most_common():
    print('- {0:35} {1}'.format(*t))

  print()
  print('- {0:35} {1}'.format('Total Entries:', len(entries)))
  print()

Cluster 54

- cable                               48
- tv                                  15
- netflix                             15
- hdtv                                8
- video                               7
- dvd                                 2

- Total Entries:                      95

Cluster 53

- sound                               16
- system                              15
- bluetooth                           11
- kardon                              2
- stereo                              1
- bose                                1

- Total Entries:                      46

Cluster 44

- stainless                           20
- stove                               9
- induction                           7
- gas                                 5
- electric                            1
- steel                               1

- Total Entries:                      43

Cluster 32

- soap                                16
- body                                5
- shower       