<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Import-dataset" data-toc-modified-id="Import-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import dataset</a></span></li><li><span><a href="#Sentence-embedding" data-toc-modified-id="Sentence-embedding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence embedding</a></span><ul class="toc-item"><li><span><a href="#Infersent" data-toc-modified-id="Infersent-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Infersent</a></span></li></ul></li><li><span><a href="#Dimensionality-reduction" data-toc-modified-id="Dimensionality-reduction-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Dimensionality reduction</a></span><ul class="toc-item"><li><span><a href="#TSNE" data-toc-modified-id="TSNE-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>TSNE</a></span></li></ul></li><li><span><a href="#Blocking" data-toc-modified-id="Blocking-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Blocking</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#bilstm,-rnn_dim-2048" data-toc-modified-id="bilstm,-rnn_dim-2048-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>bilstm, rnn_dim 2048</a></span></li><li><span><a href="#Searching" data-toc-modified-id="Searching-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Searching</a></span></li></ul></li><li><span><a href="#Result" data-toc-modified-id="Result-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Result</a></span></li></ul></div>

# Setup 

In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

# Make the directory two levels above the current working directory importable
# (presumably where the project packages imported by later cells live).
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Settings chosen at each pipeline stage; later cells fill this in and the
# evaluation cells print it next to the metrics.
params = dict()

# Import dataset

In [2]:
from preprocessing_datasets.preprocessing_abt_buy import clean_abt_buy

# `table` is the cleaned product table displayed in the next cell; `pairs` is
# later handed to calc_index together with `table`
# (presumably the known matching pairs -- TODO confirm).
(table, pairs) = clean_abt_buy()

In [3]:
# Display the cleaned table; the notebook renders a truncated preview of it.
table

Unnamed: 0,name,description,price
0,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,unk
1,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,$399.00
2,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,$49.00
3,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,unk
4,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,$158.00
...,...,...,...
2168,Sony VAIO FW378J/B Notebook - VGNFW378J/B,Intel Centrino 2 Core 2 Duo P8600 2.4GHz - 16....,unk
2169,Sennheiser CX380 Sennheiser CX 380 Sport II Gr...,unk,unk
2170,IWORK 09 RETAIL-INT DVD - MB942Z/A,unk,unk
2171,IWORK 09 FAMILY PACK-INT DVD - MB943Z/A,unk,unk


# Sentence embedding

## Infersent

Set up the model

In [4]:
from embedding_algorithms.inferSent import set_RNN_embedding

# InferSent settings. They stay at notebook level because the embedding cell
# below reuses `model_type` and `char_level`.
model_type = "bilstm"
char_level = False
model_version = 2
rnn_dim = 2048
verbose = 1

# Time the model setup; the recorded run took several minutes here.
start = time.time()
set_RNN_embedding(model_type, char_level, model_version, rnn_dim, verbose)
print("TIME: {0}".format(time.time() - start))

# Record the settings so the evaluation cells can print them with the metrics.
params['embedding'] = dict(
    name='Infersent',
    model_type=model_type,
    char_level=char_level,
    model_version=model_version,
    rnn_dim=rnn_dim,
    verbose=verbose,
)

Vocab size : 2196017
TIME: 234.89537286758423


Embedding

In [5]:
from embedding_algorithms.inferSent import RNN_embedding

start = time.time()

# Only the `name` attribute is embedded; record the choice for the evaluation printout.
attr_list = ['name']
params['attr_list'] = attr_list

# Convert the embedding result to a NumPy array for the reduction step.
embeddings_tokens = np.array(
    RNN_embedding(table, attr_list, model_type, char_level)
)
print("TIME: {0}".format(time.time() - start))

attrs: ['name']
TIME: 27.600971937179565


# Dimensionality reduction

## TSNE

In [7]:
from dimensionality_reduction_algorithms.tsne import tsne_dim_reduction

# One mapping holds the t-SNE settings: it feeds both the call and the
# provenance record, so the two cannot drift apart.
tsne_settings = {
    'num_components': 2,
    'early_exaggeration': 12,
    'verbose': 1,
    'perplexity': 40,
    'method': "barnes_hut",
}

start = time.time()
tsne_embeddings = tsne_dim_reduction(embeddings_tokens, **tsne_settings)
print("TIME: {0}".format(time.time() - start))

# Same keys, in the same order, as the keyword arguments above, prefixed by the method name.
params['reduction'] = {'name': 'TSNE', **tsne_settings}

starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 12
TIME: 73.44341397285461


# Blocking

In [24]:
from cluster_algorithms.kMeans_cluster import kMean_cluster

# k-means blocking on the 2-D t-SNE embeddings.
# NOTE: this rebinds the shared `blocks` and params['blocking']; the evaluation
# cell scores whichever blocking cell ran last.
num_clusters = 13
distance_algorithm = 'euclidean'

start = time.time()
blocks = kMean_cluster(
    tsne_embeddings,
    {'num_clusters': num_clusters, 'distance_algorithm': distance_algorithm},
)
print("BLOCKS: {0}".format(time.time() - start))

# Record every option actually passed to the clusterer. The distance metric was
# previously missing from this record. It is built from the plain values rather
# than from the options dict, in case the callee modifies that dict.
params['blocking'] = {
    'name': 'k_means',
    'num_clusters': num_clusters,
    'distance_algorithm': distance_algorithm,
}

clustering with NUM_CLUSTERS = 13, distance_algorithm = <function euclidean_distance at 0x7ffd94a10320>
BLOCKS: 56.50607085227966


In [13]:
from cluster_algorithms.birch_cluster import birch_cluster

# Birch blocking on the 2-D t-SNE embeddings.
# NOTE: this rebinds the shared `blocks` and params['blocking']; the evaluation
# cell scores whichever blocking cell ran last.
num_clusters = 15

start = time.time()
blocks = birch_cluster(tsne_embeddings, dict(num_clusters=num_clusters))
print("BLOCKS: {0}".format(time.time() - start))

params['blocking'] = dict(name='birch_cluster', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 15, 
BLOCKS: 0.3483707904815674


In [22]:
from cluster_algorithms.DBScan_cluster import DBSCAN_cluster

# DBSCAN blocking on the 2-D t-SNE embeddings.
# NOTE: this rebinds the shared `blocks` and params['blocking']; the evaluation
# cell scores whichever blocking cell ran last.
eps, min_samples = 4, 2

start = time.time()
blocks = DBSCAN_cluster(
    tsne_embeddings,
    dict(eps=eps, min_samples=min_samples),
)
print("BLOCKS: {0}".format(time.time() - start))

params['blocking'] = dict(name='DBScan', eps=eps, min_samples=min_samples)

DBScan_cluster
BLOCKS: 0.02804875373840332


In [8]:
from cluster_algorithms.hierarchy_cluster import hierarchy_cluster

# Hierarchical blocking on the 2-D t-SNE embeddings.
# NOTE: this is the last blocking cell in document order, so after a
# top-to-bottom run `blocks` and params['blocking'] hold this result.
num_clusters = 13

start = time.time()
blocks = hierarchy_cluster(tsne_embeddings, dict(num_clusters=num_clusters))
print("BLOCKS: {0}".format(time.time() - start))

params['blocking'] = dict(name='hierarchy', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 13, 
BLOCKS: 0.1270890235900879


# Evaluation

## bilstm, rnn_dim 2048

In [9]:
from evaluation import calc_index

# Score whatever `blocks` currently holds. The recorded output below comes from
# the hierarchy run.
calc_index(blocks, table, pairs)
print()

# Echo the settings of each pipeline stage so the metrics above stay attributable.
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9078215060270065
(PC) Pair completeness is: 0.9134001823154057
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9106023000084252
(PQ) Pair quality - Precision is: 0.004606261205350986
(FM) Fmeasure is: 0.009166296934962286

['name']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 13}


In [14]:
from evaluation import calc_index

# Score whatever `blocks` currently holds. The recorded output below comes from
# the birch run, so the birch blocking cell must be the most recent one executed.
calc_index(blocks, table, pairs)
print()

# Echo the settings of each pipeline stage so the metrics above stay attributable.
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9217959572486374
(PC) Pair completeness is: 0.902461257976299
(RM) Reference metric (Harmonic mean RR and PC) is: 0.912026146568918
(PQ) Pair quality - Precision is: 0.0053643417573366855
(FM) Fmeasure is: 0.010665287720375548

['name']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'birch_cluster', 'num_clusters': 15}


In [23]:
from evaluation import calc_index

# Score whatever `blocks` currently holds. The recorded output below comes from
# the DBScan run, so the DBSCAN blocking cell must be the most recent one executed.
calc_index(blocks, table, pairs)
print()

# Echo the settings of each pipeline stage so the metrics above stay attributable.
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9442653391404132
(PC) Pair completeness is: 0.8477666362807658
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8934178198817464
(PQ) Pair quality - Precision is: 0.007070791548503349
(FM) Fmeasure is: 0.014024610930148389

['name']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'DBScan', 'eps': 4, 'min_samples': 2}


In [25]:
from evaluation import calc_index

# Score whatever `blocks` currently holds. The recorded output below comes from
# the k_means run, so the k-means blocking cell must be the most recent one executed.
calc_index(blocks, table, pairs)
print()

# Echo the settings of each pipeline stage so the metrics above stay attributable.
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.917779224180233
(PC) Pair completeness is: 0.8705560619872379
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8935441506478496
(PQ) Pair quality - Precision is: 0.004921893924166757
(FM) Fmeasure is: 0.009788446558156697

['name']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'k_means', 'num_clusters': 13}


## Searching

In [12]:
from cluster_algorithms.birch_cluster import birch_cluster
from evaluation import calc_index

# Sweep the Birch cluster count over 5..49 and print the blocking metrics for
# each value (birch_cluster prints the cluster count before each metric group).
# The loop uses its own names so it does not overwrite the notebook-level
# `blocks` / `num_clusters` that the blocking and evaluation cells share.
for candidate_clusters in range(5, 50):
    candidate_blocks = birch_cluster(
        tsne_embeddings, {'num_clusters': candidate_clusters}
    )
    calc_index(candidate_blocks, table, pairs)

clustering with NUM_CLUSTERS = 5, 
(RR) Reduction ratio is: 0.7749752317704559
(PC) Pair completeness is: 0.959890610756609
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8575780677793619
(PQ) Pair quality - Precision is: 0.0019829350828859333
(FM) Fmeasure is: 0.003957694389319863
clustering with NUM_CLUSTERS = 6, 
(RR) Reduction ratio is: 0.8297407747349651
(PC) Pair completeness is: 0.9389243391066545
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8809624868505596
(PQ) Pair quality - Precision is: 0.00256352183100169
(FM) Fmeasure is: 0.0051130835368638425
clustering with NUM_CLUSTERS = 7, 
(RR) Reduction ratio is: 0.8503859097800819
(PC) Pair completeness is: 0.9380127620783957
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8920525927660571
(PQ) Pair quality - Precision is: 0.002914427976242739
(FM) Fmeasure is: 0.005810801653452599
clustering with NUM_CLUSTERS = 8, 
(RR) Reduction ratio is: 0.8590795795375863
(PC) Pair completeness is: 0.9334548769371012
(RM) 

(RR) Reduction ratio is: 0.9667948936343319
(PC) Pair completeness is: 0.7711941659070192
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8579876582148928
(PQ) Pair quality - Precision is: 0.010796324655436448
(FM) Fmeasure is: 0.02129453666763155
clustering with NUM_CLUSTERS = 35, 
(RR) Reduction ratio is: 0.9676847701448973
(PC) Pair completeness is: 0.7702825888787602
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8577729910698664
(PQ) Pair quality - Precision is: 0.01108051403094676
(FM) Fmeasure is: 0.021846762413226986
clustering with NUM_CLUSTERS = 36, 
(RR) Reduction ratio is: 0.9686932968568714
(PC) Pair completeness is: 0.7693710118505014
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8576029531724861
(PQ) Pair quality - Precision is: 0.011423930698429886
(FM) Fmeasure is: 0.022513570828387373
clustering with NUM_CLUSTERS = 37, 
(RR) Reduction ratio is: 0.9696831785371955
(PC) Pair completeness is: 0.7693710118505014
(RM) Reference metric (Harmonic mean RR

In [21]:
from cluster_algorithms.DBScan_cluster import DBSCAN_cluster
from evaluation import calc_index

# Sweep the DBSCAN radius `eps` over 1..4 with min_samples fixed at 2, printing
# each eps value followed by its blocking metrics.
# The loop uses its own names so it does not overwrite the notebook-level
# `blocks` / `eps` / `min_samples` that the blocking and evaluation cells share.
search_min_samples = 2
for candidate_eps in range(1, 5):
    print(candidate_eps)
    candidate_blocks = DBSCAN_cluster(
        tsne_embeddings,
        {'eps': candidate_eps, 'min_samples': search_min_samples},
    )
    calc_index(candidate_blocks, table, pairs)

1
DBScan_cluster
(RR) Reduction ratio is: 0.9572236361371224
(PC) Pair completeness is: 0.3108477666362808
(RM) Reference metric (Harmonic mean RR and PC) is: 0.4692966481444341
(PQ) Pair quality - Precision is: 0.0033780102429988015
(FM) Fmeasure is: 0.006683391478185882
2
DBScan_cluster
(RR) Reduction ratio is: 0.9933221124142858
(PC) Pair completeness is: 0.495897903372835
(RM) Reference metric (Harmonic mean RR and PC) is: 0.66153603590906
(PQ) Pair quality - Precision is: 0.034519956850053934
(FM) Fmeasure is: 0.06454674893213098
3
DBScan_cluster
(RR) Reduction ratio is: 0.9769411808576545
(PC) Pair completeness is: 0.6818596171376481
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8031545925835616
(PQ) Pair quality - Precision is: 0.013745957071449573
(FM) Fmeasure is: 0.026948642660277772
4
DBScan_cluster
(RR) Reduction ratio is: 0.9442653391404132
(PC) Pair completeness is: 0.8477666362807658
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8934178198817464
(PQ) Pair

# Result

In [28]:
# Each list is [RR, PC, alpha] for one configuration. For the four blocking
# runs, "alpha" matches the (RM) harmonic-mean line printed by calc_index.
# Values are hand-copied from the evaluation outputs above, cut to 4 decimals.
# The run behind `list_embedding` is not shown in this section -- TODO confirm its source.
list_embedding = [0.9343, 0.9088, 0.9213]

# All four runs share the same t-SNE step (TSNE: 73.44 s).
# hierarchy -- BLOCKS: 0.1270 s, n_cluster: 13
list_hierarchy = [
    0.9078,
    0.9134,
    0.9106,
]
# k-means -- BLOCKS: 56.50 s, n_cluster: 13
list_kmeans = [
    0.9177,
    0.8705,
    0.8935,
]
# birch -- BLOCKS: 0.3483 s, n_cluster: 15
list_birch = [
    0.9217,
    0.9024,
    0.9120,
]
# DBScan -- BLOCKS: 0.0280 s, eps: 4, min_samples: 2
list_DBscan = [
    0.9442,
    0.8477,
    0.8934,
]