<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Import-dataset" data-toc-modified-id="Import-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import dataset</a></span></li><li><span><a href="#Sentence-embedding" data-toc-modified-id="Sentence-embedding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence embedding</a></span><ul class="toc-item"><li><span><a href="#Infersent" data-toc-modified-id="Infersent-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Infersent</a></span></li></ul></li><li><span><a href="#Dimensionality-reduction" data-toc-modified-id="Dimensionality-reduction-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Dimensionality reduction</a></span><ul class="toc-item"><li><span><a href="#TSNE" data-toc-modified-id="TSNE-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>TSNE</a></span></li></ul></li><li><span><a href="#Blocking" data-toc-modified-id="Blocking-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Blocking</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#bilstm,-rnn_dim-2048" data-toc-modified-id="bilstm,-rnn_dim-2048-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>bilstm, rnn_dim 2048</a></span></li><li><span><a href="#Searching" data-toc-modified-id="Searching-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Searching</a></span></li></ul></li><li><span><a href="#Best" data-toc-modified-id="Best-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Best</a></span></li></ul></div>

# Setup 

In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

# Put the parent of the current working directory on the import path so the
# project packages imported by later cells can be resolved
# (presumably they live one level above this notebook -- confirm).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Collects the configuration of every pipeline stage (attributes, embedding,
# reduction, blocking) so it can be printed next to the evaluation metrics.
params = dict()

# Import dataset

In [2]:
from preprocessing_datasets.preprocessing_amzn_gp import clean_amzn_gp
# clean_amzn_gp() returns two objects: `table` is the record table that is
# embedded and blocked below, and `pairs` is handed to calc_index() together
# with the blocks (presumably the ground-truth matching pairs -- confirm).
table, pairs = clean_amzn_gp()

In [3]:
# Show the table as this cell's output.
# NOTE(review): the rendered output has an 'Unnamed: 0' column, which looks
# like an index column carried over from the source file -- confirm whether
# it should be dropped in the loader.
table

Unnamed: 0,title,description,manufacturer,price
0,clickart 950 000 - premier image pack (dvd-rom),unk,broderbund,0
1,ca international - arcserve lap/desktop oem 30pk,oem arcserve backup v11.1 win 30u for laptops ...,computer associates,0
2,noah's ark activity center (jewel case ages 3-8),unk,victory multimedia,0
3,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99
4,singing coach unlimited,singing coach unlimited - electronic learning ...,carry-a-tune technologies,99.99
...,...,...,...,...
4584,jumpstart(r) advanced 1st grade,prepare your child for the 1st grade and beyon...,unk,19.99
4585,ibm(r) viavoice(r) advanced edition 10,ibm viavoice advanced edition release 10 is a ...,unk,78.95
4586,xbox 360: gears of war,as marcus fenix you fight a war against the im...,unk,59.99
4587,documents to go premium 7.0,this pda software enables you to use your docu...,unk,49.99


# Sentence embedding

## Infersent

Set up the model

In [4]:
from embedding_algorithms.inferSent import set_RNN_embedding

start = time.time()

# Encoder settings. `model_type` and `char_level` are reused by the embedding
# cell further down; all five values are recorded in params['embedding'].
model_type, char_level = "bilstm", False
model_version, rnn_dim, verbose = 2, 2048, 1

# Loading the model is slow: the recorded run reports roughly 183 seconds.
set_RNN_embedding(model_type, char_level, model_version, rnn_dim, verbose)
print("TIME: {0}".format(time.time() - start))

# Keep the configuration next to the other stages for the evaluation report.
params['embedding'] = dict(
    name='Infersent',
    model_type=model_type,
    char_level=char_level,
    model_version=model_version,
    rnn_dim=rnn_dim,
    verbose=verbose,
)

Vocab size : 2196017
TIME: 183.27148509025574


Embedding

In [39]:
from embedding_algorithms.inferSent import RNN_embedding

start = time.time()

# Columns of `table` that are fed to the encoder; the same list object is
# stored in params so the evaluation cells can report it.
attr_list = ['title', 'manufacturer']
params.update(attr_list=attr_list)

# Wrap the encoder output in a NumPy array for the reduction step.
embeddings_tokens = np.array(
    RNN_embedding(table, attr_list, model_type, char_level)
)
print("TIME: {0}".format(time.time() - start))

attrs: ['title', 'manufacturer']
TIME: 44.206966161727905


# Dimensionality reduction

## TSNE

In [48]:
from dimensionality_reduction_algorithms.tsne import tsne_dim_reduction

# Single source of truth for the t-SNE settings. The same dict drives the
# call and is recorded in params['reduction'], so the logged configuration
# can no longer drift from what was actually run (previously the values were
# written twice and had to be kept in sync by hand).
tsne_kwargs = {
    'num_components': 2,
    'early_exaggeration': 12,
    'verbose': 1,
    'perplexity': 5,
    'method': "barnes_hut",
}

start = time.time()

# Project the sentence embeddings down to `num_components` dimensions.
tsne_embeddings = tsne_dim_reduction(embeddings_tokens, **tsne_kwargs)

print("TIME: {0}".format(time.time() - start))

# Record the stage configuration; 'name' comes first to keep the printed
# layout used by the evaluation cells.
reduction_params = {'name': 'TSNE'}
reduction_params.update(tsne_kwargs)
params['reduction'] = reduction_params

starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 5
early_exaggeration: 12
TIME: 193.18674898147583


# Blocking

Hierarchical clustering (`hierarchy_cluster`)

In [49]:
from cluster_algorithms.hierarchy_cluster import hierarchy_cluster

# Number of blocks requested from the clustering step.
num_clusters = 15

# Cluster the reduced embeddings; `blocks` is what the evaluation cells score.
blocking_options = {'num_clusters': num_clusters}
blocks = hierarchy_cluster(tsne_embeddings, blocking_options)

# Record the stage configuration for the evaluation report.
params['blocking'] = dict(name='hierarchy', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 15, 


# Evaluation

## bilstm, rnn_dim 2048

In [10]:
from evaluation import calc_index

# Score the `blocks` currently held in the kernel against `pairs`.
calc_index(blocks, table, pairs)

# Blank separator, then the configuration recorded in `params`, one stage per line.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.8896593822116987
(PC) Pair completeness is: 0.3484615384615385
(RM) Reference metric (Harmonic mean RR and PC) is: 0.5007783518651138
(PQ) Pair quality - Precision is: 0.0003899880679147433
(FM) Fmeasure is: 0.0007791041849761883

[]
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 10}


In [19]:
from evaluation import calc_index

# Score the `blocks` currently held in the kernel against `pairs`.
calc_index(blocks, table, pairs)

# Blank separator, then the configuration recorded in `params`, one stage per line.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9381128786227937
(PC) Pair completeness is: 0.6492307692307693
(RM) Reference metric (Harmonic mean RR and PC) is: 0.7673848654475537
(PQ) Pair quality - Precision is: 0.0012954799415499096
(FM) Fmeasure is: 0.002585800158089204

['title']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 17}


In [26]:
from evaluation import calc_index

# Score the `blocks` currently held in the kernel against `pairs`.
calc_index(blocks, table, pairs)

# Blank separator, then the configuration recorded in `params`, one stage per line.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9147197830831204
(PC) Pair completeness is: 0.6284615384615385
(RM) Reference metric (Harmonic mean RR and PC) is: 0.745040383928709
(PQ) Pair quality - Precision is: 0.000910043786806927
(FM) Fmeasure is: 0.0018174558065710928

['title', 'price']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 13}


In [30]:
from evaluation import calc_index

# Score the `blocks` currently held in the kernel against `pairs`.
calc_index(blocks, table, pairs)

# Blank separator, then the configuration recorded in `params`, one stage per line.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9114443526396373
(PC) Pair completeness is: 0.3030769230769231
(RM) Reference metric (Harmonic mean RR and PC) is: 0.454891578232552
(PQ) Pair quality - Precision is: 0.0004226379473097057
(FM) Fmeasure is: 0.0008440988066928039

['title', 'description']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 13}


In [38]:
from evaluation import calc_index

# Score the `blocks` currently held in the kernel against `pairs`.
calc_index(blocks, table, pairs)

# Blank separator, then the configuration recorded in `params`, one stage per line.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.933286793425695
(PC) Pair completeness is: 0.7084615384615385
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8054800905278088
(PQ) Pair quality - Precision is: 0.0013114035150170653
(FM) Fmeasure is: 0.0026179610318916545

['title', 'manufacturer', 'price']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 17}


In [45]:
from evaluation import calc_index

# Score the `blocks` currently held in the kernel against `pairs`.
calc_index(blocks, table, pairs)

# Blank separator, then the configuration recorded in `params`, one stage per line.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9257687206604323
(PC) Pair completeness is: 0.7230769230769231
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8119644195422745
(PQ) Pair quality - Precision is: 0.0012028997562208473
(FM) Fmeasure is: 0.0024018039080415717

['title', 'manufacturer']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 15}


In [50]:
from evaluation import calc_index

# Score the `blocks` currently held in the kernel against `pairs`.
calc_index(blocks, table, pairs)

# Blank separator, then the configuration recorded in `params`, one stage per line.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9277423762482704
(PC) Pair completeness is: 0.8407692307692308
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8821171893191397
(PQ) Pair quality - Precision is: 0.0014368949397108857
(FM) Fmeasure is: 0.0028688868823887616

['title', 'manufacturer']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 15}


## Searching

In [52]:
# Sweep the number of clusters from 10 to 29 on the current t-SNE embedding
# and print the blocking metrics for each setting.
# Note: `blocks` and `num_clusters` are overwritten on every iteration, while
# params['blocking'] is left untouched by this cell.
for num_clusters in range(10, 30):
    cluster_options = {'num_clusters': num_clusters}
    blocks = hierarchy_cluster(tsne_embeddings, cluster_options)
    calc_index(blocks, table, pairs)

clustering with NUM_CLUSTERS = 10, 
(RR) Reduction ratio is: 0.8914286143108221
(PC) Pair completeness is: 0.85
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8702215134601626
(PQ) Pair quality - Precision is: 0.0009667972936675214
(FM) Fmeasure is: 0.0019313977989056576
clustering with NUM_CLUSTERS = 11, 
(RR) Reduction ratio is: 0.9019257414578624
(PC) Pair completeness is: 0.8492307692307692
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8747854192725354
(PQ) Pair quality - Precision is: 0.0010693073910061951
(FM) Fmeasure is: 0.0021359253354795776
clustering with NUM_CLUSTERS = 12, 
(RR) Reduction ratio is: 0.90739046007254
(PC) Pair completeness is: 0.8492307692307692
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8773478147086777
(PQ) Pair quality - Precision is: 0.0011324052533756754
(FM) Fmeasure is: 0.002261794520884722
clustering with NUM_CLUSTERS = 13, 
(RR) Reduction ratio is: 0.9139962265247836
(PC) Pair completeness is: 0.8484615384615385
(RM) Referen

In [46]:
# Sweep the t-SNE perplexity with the cluster count held at 15 and print the
# blocking metrics for each setting.
# Note: `tsne_embeddings` and `blocks` are overwritten on every iteration,
# while params['reduction'] is left untouched by this cell.
num_clusters = 15
for perplexity in (30, 35, 40, 45, 50):
    print('>>>>>>>>>>>>>>>>>>>>: ' + str(perplexity))
    tsne_embeddings = tsne_dim_reduction(
        embeddings_tokens,
        num_components=2,
        early_exaggeration=12,
        verbose=1,
        perplexity=perplexity,
        method="barnes_hut",
    )
    blocks = hierarchy_cluster(tsne_embeddings, {'num_clusters': num_clusters})
    calc_index(blocks, table, pairs)

>>>>>>>>>>>>>>>>>>>>: 30
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 30
early_exaggeration: 12
clustering with NUM_CLUSTERS = 15, 
(RR) Reduction ratio is: 0.9292758373906139
(PC) Pair completeness is: 0.7376923076923076
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8224747893227237
(PQ) Pair quality - Precision is: 0.0012880695745609617
(FM) Fmeasure is: 0.0025716488452384944
>>>>>>>>>>>>>>>>>>>>: 35
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 35
early_exaggeration: 12
clustering with NUM_CLUSTERS = 15, 
(RR) Reduction ratio is: 0.9271282508511788
(PC) Pair completeness is: 0.7215384615384616
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8115147673460212
(PQ) Pair quality - Precision is: 0.0012227345193076037
(FM) Fmeasure is: 0.002441331905319006
>>>>>>>>>>>>>>>>>>>>: 40
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 12
clustering with NUM_CLUSTERS = 15, 
(RR) Re

In [47]:
# Same sweep as the previous cell, for the lower perplexities 10 and 15,
# again with the cluster count held at 15.
# Note: `tsne_embeddings` and `blocks` are overwritten on every iteration,
# while params['reduction'] is left untouched by this cell.
num_clusters = 15
for perplexity in (10, 15):
    print('>>>>>>>>>>>>>>>>>>>>: ' + str(perplexity))
    tsne_embeddings = tsne_dim_reduction(
        embeddings_tokens,
        num_components=2,
        early_exaggeration=12,
        verbose=1,
        perplexity=perplexity,
        method="barnes_hut",
    )
    blocks = hierarchy_cluster(tsne_embeddings, {'num_clusters': num_clusters})
    calc_index(blocks, table, pairs)

>>>>>>>>>>>>>>>>>>>>: 10
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 10
early_exaggeration: 12
clustering with NUM_CLUSTERS = 15, 
(RR) Reduction ratio is: 0.9272540206927486
(PC) Pair completeness is: 0.8084615384615385
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8637926970927967
(PQ) Pair quality - Precision is: 0.0013724048685768906
(FM) Fmeasure is: 0.0027401581783032137
>>>>>>>>>>>>>>>>>>>>: 15
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 15
early_exaggeration: 12
clustering with NUM_CLUSTERS = 15, 
(RR) Reduction ratio is: 0.9256769580720965
(PC) Pair completeness is: 0.7853846153846153
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8497794035747445
(PQ) Pair quality - Precision is: 0.0013049407536448235
(FM) Fmeasure is: 0.0026055523018051296


# Best

In [28]:
#(RR) Reduction ratio is: 0.9990375547800934
#(PC) Pair completeness is: 0.9775179856115108
#(RM) Reference metric (Harmonic mean RR and PC) is: 0.9881606240170739
#(PQ) Pair quality - Precision is: 0.18742995085783257
#(FM) Fmeasure is: 0.3145482167402156

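#NOTE(review): 'authors' is not a column of the table loaded in this notebook (title, description, manufacturer, price), so this "best" record appears to come from a different dataset -- confirm or replace.
#['title', 'authors']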
#{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
#{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
#{'name': 'hierarchy', 'num_clusters': 1000}