<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Import-dataset" data-toc-modified-id="Import-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import dataset</a></span></li><li><span><a href="#Sentence-embedding" data-toc-modified-id="Sentence-embedding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence embedding</a></span><ul class="toc-item"><li><span><a href="#Infersent" data-toc-modified-id="Infersent-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Infersent</a></span></li></ul></li><li><span><a href="#Dimensionality-reduction" data-toc-modified-id="Dimensionality-reduction-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Dimensionality reduction</a></span><ul class="toc-item"><li><span><a href="#TSNE" data-toc-modified-id="TSNE-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>TSNE</a></span></li></ul></li><li><span><a href="#Blocking" data-toc-modified-id="Blocking-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Blocking</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation</a></span></li><li><span><a href="#Results" data-toc-modified-id="Results-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Results</a></span></li></ul></div>

# Setup 

In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

# Put the directory two levels above the current working directory on the
# import path (presumably so the project packages imported by the following
# cells can be resolved). The membership check keeps re-runs of this cell from
# appending the same entry twice.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
from preprocessing_datasets.preprocessing_utilities import get_labels_by

# Run log for this notebook: later cells store the 'embedding', 'attr_list',
# 'reduction' and 'blocking' entries here, and the evaluation cells print them
# next to the metrics.
params = {}

# Import dataset

In [2]:
from preprocessing_datasets.preprocessing_restaurant import clean_restaurant
# clean_restaurant() returns two values, bound here to `table` and `pairs`;
# both are later handed to calc_index together with the computed blocks.
# NOTE(review): presumably `table` holds the restaurant records and `pairs`
# the ground-truth duplicate pairs — confirm against clean_restaurant.
table, pairs = clean_restaurant()

In [3]:
# Bare last expression: the notebook renders `table` as this cell's output.
table

Unnamed: 0,citruname,addr,city,phone,type
0,arnie morton's of chicago,"""435 s. la cienega blv.""","""los angeles""","""310/246-1501""","""american"""
1,arnie morton's of chicago,"""435 s. la cienega blvd.""","""los angeles""","""310-246-1501""","""steakhouses"""
2,art's delicatessen,"""12224 ventura blvd.""","""studio city""","""818/762-1221""","""american"""
3,art's deli,"""12224 ventura blvd.""","""studio city""","""818-762-1221""","""delis"""
4,hotel bel-air,"""701 stone canyon rd.""","""bel air""","""310/472-1211""","""californian"""
...,...,...,...,...,...
859,ti couz,"""3108 16th st.""","""san francisco""","""415-252-7373""","""french"""
860,trio cafe,"""1870 fillmore st.""","""san francisco""","""415-563-2248""","""american"""
861,tu lan,"""8 sixth st.""","""san francisco""","""415-626-0927""","""vietnamese"""
862,vicolo pizzeria,"""201 ivy st.""","""san francisco""","""415-863-2382""","""pizza"""


# Sentence embedding

## Infersent

Set up the InferSent model

In [38]:
from embedding_algorithms.inferSent import set_RNN_embedding

start = time.time()

# Encoder settings. `model_type` and `char_level` are reused by the embedding
# cell; all five values are also written to the run log below.
model_type, char_level = "bilstm", False
model_version, rnn_dim, verbose = 2, 2048, 1

# Initialise the encoder and report how long the setup took, in seconds.
set_RNN_embedding(model_type, char_level, model_version, rnn_dim, verbose)
elapsed = time.time() - start
print("TIME: {0}".format(elapsed))

# Record the settings only after the setup call has completed.
params['embedding'] = dict(
    name='Infersent',
    model_type=model_type,
    char_level=char_level,
    model_version=model_version,
    rnn_dim=rnn_dim,
    verbose=verbose,
)

Vocab size : 2196017
TIME: 252.86702632904053


Embedding

In [63]:
from embedding_algorithms.inferSent import RNN_embedding

# Attributes passed to the encoder ('citruname' is a column of `table`);
# stored in the run log so the evaluation cells can print them.
attr_list = ['citruname']
params['attr_list'] = attr_list

# Encode the selected attributes and convert the result to a NumPy array
# in a single step.
embeddings_tokens = np.array(
    RNN_embedding(table, attr_list, model_type, char_level)
)

attrs: ['citruname']


# Dimensionality reduction

## TSNE

In [64]:
from dimensionality_reduction_algorithms.tsne import tsne_dim_reduction

# t-SNE settings, declared once so the call and the run log cannot drift apart.
# NOTE(review): no random seed is passed here; if tsne_dim_reduction does not
# fix one internally, the projection (and every metric derived from it) may
# differ between runs — confirm.
tsne_settings = dict(
    num_components=2,
    early_exaggeration=12,
    verbose=1,
    perplexity=10,
    method="barnes_hut",
)

# Project the sentence embeddings and report the elapsed time in seconds.
start = time.time()
tsne_embeddings = tsne_dim_reduction(embeddings_tokens, **tsne_settings)
print("TSNE: {0}".format(time.time() - start))

# 'name' first, then the settings in declaration order.
params['reduction'] = dict(name='TSNE', **tsne_settings)

starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 10
early_exaggeration: 12
TSNE: 14.005852937698364


# Blocking

In [76]:
from cluster_algorithms.kMeans_cluster import kMean_cluster

num_clusters = 75

# Single source of truth for the k-means settings: the same values are passed
# to the clustering call and written to the run log, so the log also records
# the distance metric that was used.
kmeans_config = {'num_clusters': num_clusters, 'distance_algorithm': 'euclidean'}

# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
# Pass a copy so the run log is unaffected if the helper modifies its argument.
blocks = kMean_cluster(tsne_embeddings, dict(kmeans_config))
print("BLOCKS: {0}".format(time.perf_counter() - start))

# Note: `blocks` and params['blocking'] are shared with the other blocking
# cells; whichever blocking cell ran last determines what gets evaluated.
params['blocking'] = {'name': 'k_means', **kmeans_config}

clustering with NUM_CLUSTERS = 75, distance_algorithm = <function euclidean_distance at 0x7f7fea94d680>
BLOCKS: 97.77153301239014


In [83]:
from cluster_algorithms.birch_cluster import birch_cluster

num_clusters = 80

# Cluster the 2-D t-SNE output and report the elapsed time in seconds.
start = time.time()
blocks = birch_cluster(tsne_embeddings, dict(num_clusters=num_clusters))
elapsed = time.time() - start
print("BLOCKS: {0}".format(elapsed))

# Replaces `blocks` / params['blocking'] left by any blocking cell run earlier.
params['blocking'] = dict(name='birch_cluster', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 80, 
BLOCKS: 0.09139490127563477


In [93]:
from cluster_algorithms.DBScan_cluster import DBSCAN_cluster

# DBSCAN settings (eps = 7 is also one of the values tried by the eps sweep).
eps, min_samples = 7, 2

# Cluster the 2-D t-SNE output and report the elapsed time in seconds.
start = time.time()
blocks = DBSCAN_cluster(tsne_embeddings, dict(eps=eps, min_samples=min_samples))
elapsed = time.time() - start
print("BLOCKS: {0}".format(elapsed))

# Replaces `blocks` / params['blocking'] left by any blocking cell run earlier.
params['blocking'] = dict(name='DBScan', eps=eps, min_samples=min_samples)

DBScan_cluster
BLOCKS: 0.00858306884765625


In [65]:
from cluster_algorithms.hierarchy_cluster import hierarchy_cluster

num_clusters = 56

# Cluster the 2-D t-SNE output and report the elapsed time in seconds.
start = time.time()
blocks = hierarchy_cluster(tsne_embeddings, dict(num_clusters=num_clusters))
elapsed = time.time() - start
print("BLOCKS: {0}".format(elapsed))

# Replaces `blocks` / params['blocking'] left by any blocking cell run earlier.
params['blocking'] = dict(name='hierarchy', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 56, 
BLOCKS: 0.028515100479125977


# Evaluation

In [77]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently (all blocking cells write
# to the same name), then print the run log so the metrics can be matched to
# the configuration that produced them.
calc_index(blocks, table, pairs)
print()
for section in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[section])

(RR) Reduction ratio is: 0.9858294708381614
(PC) Pair completeness is: 0.9196428571428571
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9515866674147461
(PQ) Pair quality - Precision is: 0.019496498201779292
(FM) Fmeasure is: 0.038183503243744206

['citruname']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 10, 'method': 'barnes_hut'}
{'name': 'k_means', 'num_clusters': 75}


In [84]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently (all blocking cells write
# to the same name), then print the run log so the metrics can be matched to
# the configuration that produced them.
calc_index(blocks, table, pairs)
print()
for section in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[section])

(RR) Reduction ratio is: 0.9831203596412171
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9644256049020895
(PQ) Pair quality - Precision is: 0.016844112505959002
(FM) Fmeasure is: 0.03309914129586261

['citruname']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 10, 'method': 'barnes_hut'}
{'name': 'birch_cluster', 'num_clusters': 80}


In [94]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently (all blocking cells write
# to the same name), then print the run log so the metrics can be matched to
# the configuration that produced them.
calc_index(blocks, table, pairs)
print()
for section in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[section])

(RR) Reduction ratio is: 0.9859689498304793
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9657942264933123
(PQ) Pair quality - Precision is: 0.020263811890651882
(FM) Fmeasure is: 0.03967808347370391

['citruname']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 10, 'method': 'barnes_hut'}
{'name': 'DBScan', 'eps': 7, 'min_samples': 2}


In [66]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently (all blocking cells write
# to the same name), then print the run log so the metrics can be matched to
# the configuration that produced them.
calc_index(blocks, table, pairs)
print()
for section in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[section])

(RR) Reduction ratio is: 0.9804541650572937
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.963140950111651
(PQ) Pair quality - Precision is: 0.014546452586798408
(FM) Fmeasure is: 0.02865252061089336

['citruname']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 10, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 56}


In [82]:
# Sweep the Birch cluster count over 50..79 and print the metrics for each
# setting. Sweep-local names are used on purpose: the loop must not overwrite
# the notebook-level `blocks` / `num_clusters`, otherwise `blocks` would no
# longer correspond to the configuration recorded in params['blocking'].
for sweep_num_clusters in range(50, 80):
    sweep_blocks = birch_cluster(tsne_embeddings, {'num_clusters': sweep_num_clusters})
    calc_index(sweep_blocks, table, pairs)

clustering with NUM_CLUSTERS = 50, 
(RR) Reduction ratio is: 0.9741186000600832
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9600739713591675
(PQ) Pair quality - Precision is: 0.010985594362110064
(FM) Fmeasure is: 0.021719086159205
clustering with NUM_CLUSTERS = 51, 
(RR) Reduction ratio is: 0.9745719067851165
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9602940842063185
(PQ) Pair quality - Precision is: 0.011181434599156118
(FM) Fmeasure is: 0.022101751459549627
clustering with NUM_CLUSTERS = 52, 
(RR) Reduction ratio is: 0.9749098751126561
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9604581244241286
(PQ) Pair quality - Precision is: 0.011332050459696387
(FM) Fmeasure is: 0.0223959433762941
clustering with NUM_CLUSTERS = 53, 
(RR) Reduction ratio is: 0.9764548731814086
(PC) Pair completeness is: 0.9464285714285714
(RM) R

(RR) Reduction ratio is: 0.9830398909918029
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9643868845843502
(PQ) Pair quality - Precision is: 0.016764194211608414
(FM) Fmeasure is: 0.03294483294483294


In [92]:
# Sweep the DBSCAN eps over 5..9 (min_samples fixed at 2) and print the metrics
# for each setting. Sweep-local names are used on purpose: the loop must not
# overwrite the notebook-level `blocks` / `eps` / `min_samples`, otherwise
# `blocks` would no longer correspond to the configuration recorded in
# params['blocking'].
sweep_min_samples = 2  # loop-invariant, so set once outside the loop
for sweep_eps in range(5, 10):
    print(sweep_eps)  # label the metric block that follows with its eps value
    sweep_blocks = DBSCAN_cluster(
        tsne_embeddings, {'eps': sweep_eps, 'min_samples': sweep_min_samples}
    )
    calc_index(sweep_blocks, table, pairs)

5
DBScan_cluster
(RR) Reduction ratio is: 0.9772300545040985
(PC) Pair completeness is: 0.9196428571428571
(RM) Reference metric (Harmonic mean RR and PC) is: 0.947562310465726
(PQ) Pair quality - Precision is: 0.012133349039934032
(FM) Fmeasure is: 0.02395070340658063
6
DBScan_cluster
(RR) Reduction ratio is: 0.9861272048409939
(PC) Pair completeness is: 0.9196428571428571
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9517253505622084
(PQ) Pair quality - Precision is: 0.01991492652745553
(FM) Fmeasure is: 0.03898561695685087
7
DBScan_cluster
(RR) Reduction ratio is: 0.9859689498304793
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9657942264933123
(PQ) Pair quality - Precision is: 0.020263811890651882
(FM) Fmeasure is: 0.03967808347370391
8
DBScan_cluster
(RR) Reduction ratio is: 0.9776350800394833
(PC) Pair completeness is: 0.9464285714285714
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9617787555772942
(PQ) Pair q

# Results

In [78]:
# Restaurant dataset summary.
# Each list is [RR, PC, RM]: reduction ratio, pair completeness and their
# harmonic mean (the third column was previously labelled "alpha").
# Values are truncated to four decimals.

# NOTE(review): baseline figures; no cell visible in this notebook produces
# them — confirm their source.
list_embedding = [0.9432, 0.9792, 0.9608]

# attrs: ['citruname']
# The rows below are synced to the evaluation outputs displayed in this
# notebook, all computed on the same t-SNE projection (TSNE: 14.005 s).
# The hierarchy row and the TSNE timing previously carried figures that do not
# appear in any displayed output.
list_clustering = [0.9804, 0.9464, 0.9631] # hierarchy; TSNE: 14.005, BLOCKS: 0.0285, n_cluster: 56
list_kmeans = [0.9858, 0.9196, 0.9515] # TSNE: 14.005, BLOCKS: 97.771, n_cluster: 75
list_birch = [0.9831, 0.9464, 0.9644] # TSNE: 14.005, BLOCKS: 0.0913, n_cluster: 80
list_DBscan = [0.9859, 0.9464, 0.9657] # TSNE: 14.005, BLOCKS: 0.00858, eps: 7, min_samples: 2