<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Import-dataset" data-toc-modified-id="Import-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import dataset</a></span></li><li><span><a href="#Sentence-embedding" data-toc-modified-id="Sentence-embedding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence embedding</a></span><ul class="toc-item"><li><span><a href="#Infersent" data-toc-modified-id="Infersent-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Infersent</a></span></li></ul></li><li><span><a href="#Dimensionality-reduction" data-toc-modified-id="Dimensionality-reduction-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Dimensionality reduction</a></span><ul class="toc-item"><li><span><a href="#TSNE" data-toc-modified-id="TSNE-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>TSNE</a></span></li></ul></li><li><span><a href="#Blocking" data-toc-modified-id="Blocking-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Blocking</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#bilstm,-rnn_dim-2048" data-toc-modified-id="bilstm,-rnn_dim-2048-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>bilstm, rnn_dim 2048</a></span></li><li><span><a href="#Searching" data-toc-modified-id="Searching-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Searching</a></span></li></ul></li><li><span><a href="#Best" data-toc-modified-id="Best-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Best</a></span></li></ul></div>

# Setup 

In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

# Put the parent of the current working directory on sys.path (once) so the
# project packages imported by the cells below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Pipeline settings recorded by later cells and printed during evaluation.
params = dict()

# Import dataset

In [2]:
from preprocessing_datasets.preprocessing_abt_buy import clean_abt_buy

# clean_abt_buy() yields two objects: `table` (displayed in the next cell) and
# `pairs` (passed to calc_index together with the blocks during evaluation).
dataset = clean_abt_buy()
table, pairs = dataset

In [3]:
# Show the table returned by clean_abt_buy() as this cell's output.
table

Unnamed: 0,name,description,price
0,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,unk
1,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,$399.00
2,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,$49.00
3,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,unk
4,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,$158.00
...,...,...,...
2168,Sony VAIO FW378J/B Notebook - VGNFW378J/B,Intel Centrino 2 Core 2 Duo P8600 2.4GHz - 16....,unk
2169,Sennheiser CX380 Sennheiser CX 380 Sport II Gr...,unk,unk
2170,IWORK 09 RETAIL-INT DVD - MB942Z/A,unk,unk
2171,IWORK 09 FAMILY PACK-INT DVD - MB943Z/A,unk,unk


# Sentence embedding

## Infersent

Set up the model

In [4]:
from embedding_algorithms.inferSent import set_RNN_embedding

start = time.time()

# InferSent settings; model_type and char_level are reused by the embedding cell.
model_type, char_level = "bilstm", False
model_version, rnn_dim, verbose = 2, 2048, 1

# Model setup is the slow step of this notebook, so report how long it took.
set_RNN_embedding(model_type, char_level, model_version, rnn_dim, verbose)
elapsed = time.time() - start
print("TIME: {0}".format(elapsed))

# Record the embedding settings so the evaluation cells can print them.
embedding_config = {
    'name': 'Infersent',
    'model_type': model_type,
    'char_level': char_level,
    'model_version': model_version,
    'rnn_dim': rnn_dim,
    'verbose': verbose,
}
params['embedding'] = embedding_config

Vocab size : 2196017
TIME: 186.89661526679993


Embedding

In [14]:
from embedding_algorithms.inferSent import RNN_embedding

start = time.time()

# Attributes of `table` to embed; the same list object is recorded in params.
params['attr_list'] = attr_list = ['name']

# Embed the selected attributes and keep the result as a NumPy array, the
# form handed to the dimensionality-reduction step.
embeddings_tokens = np.array(
    RNN_embedding(table, attr_list, model_type, char_level)
)

elapsed = time.time() - start
print("TIME: {0}".format(elapsed))

attrs: ['name']
TIME: 20.92107319831848


# Dimensionality reduction

## TSNE

In [15]:
from dimensionality_reduction_algorithms.tsne import tsne_dim_reduction

start = time.time()

# t-SNE settings, written once and used both for the call and for the record
# stored in params, so the two cannot drift apart.
tsne_settings = {
    'num_components': 2,
    'early_exaggeration': 12,
    'verbose': 1,
    'perplexity': 40,
    'method': "barnes_hut",
}

tsne_embeddings = tsne_dim_reduction(embeddings_tokens, **tsne_settings)

print("TIME: {0}".format(time.time() - start))

# 'name' goes first so the recorded dict keeps its original key order.
params['reduction'] = {'name': 'TSNE', **tsne_settings}

starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 12
TIME: 68.996169090271


# Blocking

Hierarchical clustering (`hierarchy_cluster`)

In [19]:
from cluster_algorithms.hierarchy_cluster import hierarchy_cluster

# Number of clusters requested from hierarchy_cluster; the result is kept in
# `blocks` and scored by the evaluation cells.
num_clusters = 13
blocks = hierarchy_cluster(tsne_embeddings, dict(num_clusters=num_clusters))

# Record the blocking settings next to the other pipeline parameters.
params.update(blocking={'name': 'hierarchy', 'num_clusters': num_clusters})

clustering with NUM_CLUSTERS = 13, 


# Evaluation

## bilstm, rnn_dim 2048

In [8]:
from evaluation import calc_index

# calc_index prints the blocking metrics (RR, PC, RM, PQ, FM) for the current blocks.
calc_index(blocks, table, pairs)
print()

# Echo the recorded pipeline settings, one section per line.
for section in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[section])

(RR) Reduction ratio is: 0.8855987470538731
(PC) Pair completeness is: 0.4749316317228806
(RM) Reference metric (Harmonic mean RR and PC) is: 0.6182866102088701
(PQ) Pair quality - Precision is: 0.0019298226118908187
(FM) Fmeasure is: 0.003844025528461283

[]
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 10}


In [13]:
from evaluation import calc_index

# calc_index prints the blocking metrics (RR, PC, RM, PQ, FM) for the current blocks.
calc_index(blocks, table, pairs)
print()

# Echo the recorded pipeline settings, one section per line.
for section in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[section])

(RR) Reduction ratio is: 0.896654827071569
(PC) Pair completeness is: 0.5278030993618961
(RM) Reference metric (Harmonic mean RR and PC) is: 0.6644733944106204
(PQ) Pair quality - Precision is: 0.0023740989494919675
(FM) Fmeasure is: 0.004726935778168741

['name', 'description']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 10}


In [20]:
from evaluation import calc_index

# calc_index prints the blocking metrics (RR, PC, RM, PQ, FM) for the current blocks.
calc_index(blocks, table, pairs)
print()

# Echo the recorded pipeline settings, one section per line.
for section in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[section])

(RR) Reduction ratio is: 0.9078215060270065
(PC) Pair completeness is: 0.9134001823154057
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9106023000084252
(PQ) Pair quality - Precision is: 0.004606261205350986
(FM) Fmeasure is: 0.009166296934962286

['name']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 13}


## Searching

In [18]:
# Sweep the number of clusters (5 to 19) on the current t-SNE embeddings and
# print the metrics for each value. Loop-local names are used so the sweep does
# not overwrite `num_clusters` / `blocks`, which the Blocking and Evaluation
# cells pair with params['blocking'].
for candidate_clusters in range(5, 20):
    candidate_blocks = hierarchy_cluster(
        tsne_embeddings, {'num_clusters': candidate_clusters})
    calc_index(candidate_blocks, table, pairs)

clustering with NUM_CLUSTERS = 5, 
(RR) Reduction ratio is: 0.7754578838397578
(PC) Pair completeness is: 0.9279854147675478
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8448929372150412
(PQ) Pair quality - Precision is: 0.001921146195828584
(FM) Fmeasure is: 0.003834354383989122
clustering with NUM_CLUSTERS = 6, 
(RR) Reduction ratio is: 0.8295178818566045
(PC) Pair completeness is: 0.9243391066545124
(RM) Reference metric (Harmonic mean RR and PC) is: 0.874365267968845
(PQ) Pair quality - Precision is: 0.002520400579642421
(FM) Fmeasure is: 0.005027093754802758
clustering with NUM_CLUSTERS = 7, 
(RR) Reduction ratio is: 0.8500041103819773
(PC) Pair completeness is: 0.9170464904284412
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8822534973452876
(PQ) Pair quality - Precision is: 0.002842032703151662
(FM) Fmeasure is: 0.005666504256919078
clustering with NUM_CLUSTERS = 8, 
(RR) Reduction ratio is: 0.8593461187400365
(PC) Pair completeness is: 0.9170464904284412
(RM) R

In [21]:
# Sweep the t-SNE perplexity with the other settings held fixed, clustering
# each embedding into 13 blocks and printing the metrics. Results are kept in
# sweep-local names so the loop does not overwrite `tsne_embeddings`, `blocks`
# or `num_clusters`, which must stay in sync with the settings recorded in
# `params`.
# NOTE(review): no random seed is passed here; whether tsne_dim_reduction
# accepts one is not visible from this notebook, so run-to-run differences may
# be as large as the differences between settings — confirm.
sweep_num_clusters = 13
for perplexity in [30, 35, 40, 45, 50]:
    print('>>>>>>>>>>>>>>>>>>>>: '+str(perplexity))
    sweep_embeddings = tsne_dim_reduction(
        embeddings_tokens,
        num_components=2,
        early_exaggeration=12,
        verbose=1,
        perplexity=perplexity,
        method="barnes_hut")
    sweep_blocks = hierarchy_cluster(
        sweep_embeddings, {'num_clusters': sweep_num_clusters})
    calc_index(sweep_blocks, table, pairs)

>>>>>>>>>>>>>>>>>>>>: 30
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 30
early_exaggeration: 12
clustering with NUM_CLUSTERS = 13, 
(RR) Reduction ratio is: 0.9146828776741849
(PC) Pair completeness is: 0.8997265268915223
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9071430589657029
(PQ) Pair quality - Precision is: 0.004902204253543792
(FM) Fmeasure is: 0.009751278188060366
>>>>>>>>>>>>>>>>>>>>: 35
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 35
early_exaggeration: 12
clustering with NUM_CLUSTERS = 13, 
(RR) Reduction ratio is: 0.9098927995430273
(PC) Pair completeness is: 0.8960802187784868
(RM) Reference metric (Harmonic mean RR and PC) is: 0.902933687943208
(PQ) Pair quality - Precision is: 0.0046227932393412404
(FM) Fmeasure is: 0.009198134172986679
>>>>>>>>>>>>>>>>>>>>: 40
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 12
clustering with NUM_CLUSTERS = 13, 
(RR) Reduc

In [22]:
# Sweep the t-SNE early_exaggeration (10, 15, 18) with perplexity fixed at 40,
# clustering each embedding into 13 blocks and printing the metrics. Results
# are kept in sweep-local names so the loop does not overwrite
# `tsne_embeddings`, `blocks` or `num_clusters`, which must stay in sync with
# the settings recorded in `params`.
sweep_num_clusters = 13
for early_exaggeration in [10, 15, 18]:
    print('>>>>>>>>>>>>>>>>>>>>: '+str(early_exaggeration))
    sweep_embeddings = tsne_dim_reduction(
        embeddings_tokens,
        num_components=2,
        early_exaggeration=early_exaggeration,
        verbose=1,
        perplexity=40,
        method="barnes_hut")
    sweep_blocks = hierarchy_cluster(
        sweep_embeddings, {'num_clusters': sweep_num_clusters})
    calc_index(sweep_blocks, table, pairs)

>>>>>>>>>>>>>>>>>>>>: 10
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 10
clustering with NUM_CLUSTERS = 13, 
(RR) Reduction ratio is: 0.9115123747922562
(PC) Pair completeness is: 0.9143117593436645
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9129099210360069
(PQ) Pair quality - Precision is: 0.004803179772052486
(FM) Fmeasure is: 0.009556157910031107
>>>>>>>>>>>>>>>>>>>>: 15
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 15
clustering with NUM_CLUSTERS = 13, 
(RR) Reduction ratio is: 0.9154579177398153
(PC) Pair completeness is: 0.8413855970829535
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8768602328244759
(PQ) Pair quality - Precision is: 0.004626357708173566
(FM) Fmeasure is: 0.009202117583721324
>>>>>>>>>>>>>>>>>>>>: 18
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 18
clustering with NUM_CLUSTERS = 13, 
(RR) Reduc

In [23]:
# Extend the early_exaggeration sweep to lower values (5, 8) with perplexity
# fixed at 40, clustering each embedding into 13 blocks and printing the
# metrics. Results are kept in sweep-local names so the loop does not overwrite
# `tsne_embeddings`, `blocks` or `num_clusters`, which must stay in sync with
# the settings recorded in `params`.
sweep_num_clusters = 13
for early_exaggeration in [5, 8]:
    print('>>>>>>>>>>>>>>>>>>>>: '+str(early_exaggeration))
    sweep_embeddings = tsne_dim_reduction(
        embeddings_tokens,
        num_components=2,
        early_exaggeration=early_exaggeration,
        verbose=1,
        perplexity=40,
        method="barnes_hut")
    sweep_blocks = hierarchy_cluster(
        sweep_embeddings, {'num_clusters': sweep_num_clusters})
    calc_index(sweep_blocks, table, pairs)

>>>>>>>>>>>>>>>>>>>>: 5
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 5
clustering with NUM_CLUSTERS = 13, 
(RR) Reduction ratio is: 0.9107038584197997
(PC) Pair completeness is: 0.9088422971741112
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9097721255237029
(PQ) Pair quality - Precision is: 0.004731217493641092
(FM) Fmeasure is: 0.009413430898147055
>>>>>>>>>>>>>>>>>>>>: 8
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 8
clustering with NUM_CLUSTERS = 13, 
(RR) Reduction ratio is: 0.9147701703223641
(PC) Pair completeness is: 0.9115770282588879
(RM) Reference metric (Harmonic mean RR and PC) is: 0.913170807883755
(PQ) Pair quality - Precision is: 0.004971859276495038
(FM) Fmeasure is: 0.009889778419514512


# Best

In [28]:
# NOTE(review): the settings recorded below list attributes ['title', 'authors']
# and num_clusters 1000, but the table displayed in this notebook has the
# columns name/description/price and the sweeps above only cover 5-19 clusters.
# These figures may have been carried over from a different dataset's notebook;
# confirm before citing them as the best result for this dataset.
#(RR) Reduction ratio is: 0.9990375547800934
#(PC) Pair completeness is: 0.9775179856115108
#(RM) Reference metric (Harmonic mean RR and PC) is: 0.9881606240170739
#(PQ) Pair quality - Precision is: 0.18742995085783257
#(FM) Fmeasure is: 0.3145482167402156

#['title', 'authors']
#{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
#{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
#{'name': 'hierarchy', 'num_clusters': 1000}