<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Import-dataset" data-toc-modified-id="Import-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import dataset</a></span></li><li><span><a href="#Sentence-embedding" data-toc-modified-id="Sentence-embedding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence embedding</a></span><ul class="toc-item"><li><span><a href="#Infersent" data-toc-modified-id="Infersent-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Infersent</a></span></li></ul></li><li><span><a href="#Dimensionality-reduction" data-toc-modified-id="Dimensionality-reduction-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Dimensionality reduction</a></span><ul class="toc-item"><li><span><a href="#TSNE" data-toc-modified-id="TSNE-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>TSNE</a></span></li></ul></li><li><span><a href="#Blocking" data-toc-modified-id="Blocking-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Blocking</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#bilstm,-rnn_dim-2048" data-toc-modified-id="bilstm,-rnn_dim-2048-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>bilstm, rnn_dim 2048</a></span></li><li><span><a href="#Searching" data-toc-modified-id="Searching-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Searching</a></span></li></ul></li><li><span><a href="#Result" data-toc-modified-id="Result-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Result</a></span></li></ul></div>

# Setup 

In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

# Put the directory two levels above the working directory on the import path
# (only once), so the project packages imported below can be resolved.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from preprocessing_datasets.preprocessing_utilities import get_labels_by

# Registry of the settings used by each pipeline stage; later cells fill in the
# 'attr_list', 'embedding', 'reduction' and 'blocking' entries and the
# evaluation cells print them next to the metrics.
params = dict()

# Import dataset

In [2]:
from preprocessing_datasets.preprocessing_census import clean_census
# clean_census() returns two values: `table` (the records displayed in the next
# cell) and `pairs`, which is later passed to calc_index together with the blocks.
# NOTE(review): `pairs` is presumably the ground-truth set of matching record
# pairs -- confirm against preprocessing_census.
table, pairs = clean_census()

In [3]:
# Bare expression: the notebook renders the cleaned census table as this cell's output.
table

Unnamed: 0,last_name,first_name,middle_name,zip_code,street_address
0,anderson,unk,unk,4848,basswood
1,anderson,unk,unk,4848,basswood
2,anderson,unk,unk,4848,basswood
3,anderson,unk,unk,4848,basswood
4,aquendo,clara,j,666,starkey
...,...,...,...,...,...
836,william,sherry,v,510,woodhaven
837,wright,bryan,unk,307,woodhaven
838,wright,maxine,h,307,woodhaven
839,yates,chanse,e,403,woodhaven


# Sentence embedding

## Infersent

Setup model

In [4]:
from embedding_algorithms.inferSent import set_RNN_embedding

# InferSent encoder configuration. `model_type` and `char_level` are reused by
# the embedding cell that follows.
model_type = "bilstm"
char_level = False
model_version = 2
rnn_dim = 2048
verbose = 1

# Set up the model and report how long it took (elapsed wall-clock seconds).
start = time.time()
set_RNN_embedding(model_type, char_level, model_version, rnn_dim, verbose)
print("TIME: {0}".format(time.time() - start))

# Record the encoder settings so the evaluation cells can print them with the metrics.
params['embedding'] = dict(
    name='Infersent',
    model_type=model_type,
    char_level=char_level,
    model_version=model_version,
    rnn_dim=rnn_dim,
    verbose=verbose,
)

Vocab size : 2196017
TIME: 177.38499689102173


Embedding

In [5]:
from embedding_algorithms.inferSent import RNN_embedding

# Columns of `table` handed to the embedder; `middle_name` is not included.
attr_list = ['last_name', 'first_name', 'zip_code', 'street_address']
params['attr_list'] = attr_list

# Embed the selected attributes and keep the result as a NumPy array for the
# dimensionality-reduction step.
embeddings_tokens = np.array(
    RNN_embedding(table, attr_list, model_type, char_level)
)

attrs: ['last_name', 'first_name', 'zip_code', 'street_address']


# Dimensionality reduction

## TSNE

In [6]:
from dimensionality_reduction_algorithms.tsne import tsne_dim_reduction

# Single source of truth for the t-SNE settings: the same dict drives the call
# and is recorded in `params`, so the logged configuration can never drift from
# the one that was actually run.
tsne_kwargs = {
    'num_components': 2,
    'early_exaggeration': 12,
    'verbose': 1,
    'perplexity': 40,
    'method': "barnes_hut",
}

# NOTE(review): t-SNE is stochastic; if tsne_dim_reduction exposes a seed
# argument, pin it here so the metrics below are reproducible -- confirm.
start = time.time()
tsne_embeddings = tsne_dim_reduction(embeddings_tokens, **tsne_kwargs)
print("TSNE: {0}".format(time.time() - start))

# 'name' first, followed by the hyperparameters in the order given above.
params['reduction'] = dict({'name': 'TSNE'}, **tsne_kwargs)

starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 12
TSNE: 19.06475019454956


# Blocking

In [18]:
from cluster_algorithms.kMeans_cluster import kMean_cluster

num_clusters = 13

# Block the 2-D t-SNE points with k-means and report the elapsed seconds.
start = time.time()
blocks = kMean_cluster(
    tsne_embeddings,
    dict(num_clusters=num_clusters, distance_algorithm='euclidean'),
)
print("BLOCKS: {0}".format(time.time() - start))

# Record which blocking method produced `blocks` for the evaluation printout.
params['blocking'] = dict(name='k_means', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 13, distance_algorithm = <function euclidean_distance at 0x7f833d050830>
BLOCKS: 13.915744066238403


In [13]:
from cluster_algorithms.birch_cluster import birch_cluster

num_clusters = 12

# Block the t-SNE points with BIRCH and report the elapsed seconds.
start = time.time()
blocks = birch_cluster(tsne_embeddings, dict(num_clusters=num_clusters))
print("BLOCKS: {0}".format(time.time() - start))

# Record which blocking method produced `blocks` for the evaluation printout.
params['blocking'] = dict(name='birch_cluster', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 12, 
BLOCKS: 0.1053628921508789


In [None]:
from cluster_algorithms.DBScan_cluster import DBSCAN_cluster

# DBSCAN settings passed through to DBSCAN_cluster and recorded in `params`.
eps, min_samples = 6, 2

# Block the t-SNE points with DBSCAN and report the elapsed seconds.
start = time.time()
blocks = DBSCAN_cluster(tsne_embeddings, dict(eps=eps, min_samples=min_samples))
print("BLOCKS: {0}".format(time.time() - start))

# Record which blocking method produced `blocks` for the evaluation printout.
params['blocking'] = dict(name='DBScan', eps=eps, min_samples=min_samples)

In [9]:
from cluster_algorithms.hierarchy_cluster import hierarchy_cluster

num_clusters = 11

# Block the t-SNE points with hierarchical clustering and report the elapsed seconds.
start = time.time()
blocks = hierarchy_cluster(tsne_embeddings, dict(num_clusters=num_clusters))
print("BLOCKS: {0}".format(time.time() - start))

# Record which blocking method produced `blocks` for the evaluation printout.
params['blocking'] = dict(name='hierarchy', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 11, 
BLOCKS: 0.021318912506103516


# Evaluation

In [8]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently, then print a blank line
# followed by the settings recorded for each pipeline stage.
calc_index(blocks, table, pairs)
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9006822943208199
(PC) Pair completeness is: 0.9622093023255814
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9304297508191931
(PQ) Pair quality - Precision is: 0.009435306861263933
(FM) Fmeasure is: 0.018687367678193366

['last_name', 'first_name', 'zip_code', 'street_address']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 11}


In [14]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently, then print a blank line
# followed by the settings recorded for each pipeline stage.
calc_index(blocks, table, pairs)
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9035869996036464
(PC) Pair completeness is: 0.8924418604651163
(RM) Reference metric (Harmonic mean RR and PC) is: 0.897979849819893
(PQ) Pair quality - Precision is: 0.009014828953164
(FM) Fmeasure is: 0.017849356085932733

['last_name', 'first_name', 'zip_code', 'street_address']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'birch_cluster', 'num_clusters': 12}


In [19]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently, then print a blank line
# followed by the settings recorded for each pipeline stage.
calc_index(blocks, table, pairs)
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.920381065624823
(PC) Pair completeness is: 0.8895348837209303
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9046951207711825
(PQ) Pair quality - Precision is: 0.010880773743910678
(FM) Fmeasure is: 0.021498577300031612

['last_name', 'first_name', 'zip_code', 'street_address']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'k_means', 'num_clusters': 13}


In [17]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently, then print a blank line
# followed by the settings recorded for each pipeline stage.
calc_index(blocks, table, pairs)
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.8875658229998301
(PC) Pair completeness is: 0.5843023255813954
(RM) Reference metric (Harmonic mean RR and PC) is: 0.7046918910301379
(PQ) Pair quality - Precision is: 0.005061187490557486
(FM) Fmeasure is: 0.01003544859953068

['last_name', 'first_name', 'zip_code']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 11}


In [21]:
from evaluation import calc_index

# Score whichever `blocks` was assigned most recently, then print a blank line
# followed by the settings recorded for each pipeline stage.
calc_index(blocks, table, pairs)
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.9006822943208199
(PC) Pair completeness is: 0.9622093023255814
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9304297508191931
(PQ) Pair quality - Precision is: 0.009435306861263933
(FM) Fmeasure is: 0.018687367678193366

['last_name', 'first_name', 'zip_code', 'street_address']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'TSNE', 'num_components': 2, 'early_exaggeration': 12, 'verbose': 1, 'perplexity': 40, 'method': 'barnes_hut'}
{'name': 'hierarchy', 'num_clusters': 11}


## Searching

In [17]:
# Sweep the number of k-means clusters (5..14) over the current t-SNE embedding
# and print the blocking metrics for each candidate.
# Sweep-local names are used on purpose: the notebook-wide `blocks` and
# `num_clusters` stay tied to the run recorded in params['blocking'], so a
# later evaluation cell cannot silently score the last sweep iteration.
for candidate_k in range(5, 15):
    candidate_blocks = kMean_cluster(
        tsne_embeddings,
        {'num_clusters': candidate_k, 'distance_algorithm': 'euclidean'},
    )
    calc_index(candidate_blocks, table, pairs)

clustering with NUM_CLUSTERS = 5, distance_algorithm = <function euclidean_distance at 0x7f833d050830>
(RR) Reduction ratio is: 0.7341770001698658
(PC) Pair completeness is: 0.9854651162790697
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8414609248299366
(PQ) Pair quality - Precision is: 0.0036104543421304875
(FM) Fmeasure is: 0.007194549969226851
clustering with NUM_CLUSTERS = 6, distance_algorithm = <function euclidean_distance at 0x7f833d050830>
(RR) Reduction ratio is: 0.80833191778495
(PC) Pair completeness is: 0.9563953488372093
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8761522543547923
(PQ) Pair quality - Precision is: 0.004859603255491056
(FM) Fmeasure is: 0.009670071276361231
clustering with NUM_CLUSTERS = 7, distance_algorithm = <function euclidean_distance at 0x7f833d050830>
(RR) Reduction ratio is: 0.8411046939584395
(PC) Pair completeness is: 0.8343023255813954
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8376897005238663
(PQ) Pair quality - P

In [12]:
# Sweep the number of BIRCH clusters (5..49) over the current t-SNE embedding
# and print the blocking metrics for each candidate.
# Sweep-local names are used on purpose: the notebook-wide `blocks` and
# `num_clusters` stay tied to the run recorded in params['blocking'], so a
# later evaluation cell cannot silently score the last sweep iteration.
for candidate_k in range(5, 50):
    candidate_blocks = birch_cluster(tsne_embeddings, {'num_clusters': candidate_k})
    calc_index(candidate_blocks, table, pairs)

clustering with NUM_CLUSTERS = 5, 
(RR) Reduction ratio is: 0.749297888001812
(PC) Pair completeness is: 0.9505813953488372
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8380226041754438
(PQ) Pair quality - Precision is: 0.0036927038045012593
(FM) Fmeasure is: 0.007356828689382095
clustering with NUM_CLUSTERS = 6, 
(RR) Reduction ratio is: 0.7917643395051243
(PC) Pair completeness is: 0.9505813953488372
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8639346779212167
(PQ) Pair quality - Precision is: 0.004445773795766318
(FM) Fmeasure is: 0.00885015629863188
clustering with NUM_CLUSTERS = 7, 
(RR) Reduction ratio is: 0.8365239793896155
(PC) Pair completeness is: 0.8924418604651163
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8635786772431274
(PQ) Pair quality - Precision is: 0.005316661759866997
(FM) Fmeasure is: 0.010570351369497478
clustering with NUM_CLUSTERS = 8, 
(RR) Reduction ratio is: 0.860667572617632
(PC) Pair completeness is: 0.8924418604651163
(RM) Re

(RR) Reduction ratio is: 0.9669186342789197
(PC) Pair completeness is: 0.688953488372093
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8046055694106765
(PQ) Pair quality - Precision is: 0.020282413350449294
(FM) Fmeasure is: 0.039404771801479756
clustering with NUM_CLUSTERS = 35, 
(RR) Reduction ratio is: 0.967966140082668
(PC) Pair completeness is: 0.688953488372093
(RM) Reference metric (Harmonic mean RR and PC) is: 0.8049680109806632
(PQ) Pair quality - Precision is: 0.020945647370746796
(FM) Fmeasure is: 0.04065528776052835
clustering with NUM_CLUSTERS = 36, 
(RR) Reduction ratio is: 0.9686880697582243
(PC) Pair completeness is: 0.6802325581395349
(RM) Reference metric (Harmonic mean RR and PC) is: 0.7992296931489927
(PQ) Pair quality - Precision is: 0.02115732368896926
(FM) Fmeasure is: 0.041038232199228336
clustering with NUM_CLUSTERS = 37, 
(RR) Reduction ratio is: 0.9696817847234018
(PC) Pair completeness is: 0.6744186046511628
(RM) Reference metric (Harmonic mean RR an

In [27]:
# Sweep the t-SNE perplexity with the hierarchical blocker fixed at 29 clusters
# and print the blocking metrics for each setting.
# The sweep keeps its own embedding/blocks variables: overwriting the
# notebook-wide `tsne_embeddings`, `blocks` or `num_clusters` would leave them
# at the last iteration (perplexity 50) while params['reduction'] and
# params['blocking'] still describe the earlier run.
sweep_num_clusters = 29  # loop-invariant, so set once outside the loop
for candidate_perplexity in [30, 35, 40, 45, 50]:
    print('>>>>>>>>>>>>>>>>>>>>: ' + str(candidate_perplexity))
    candidate_embeddings = tsne_dim_reduction(
        embeddings_tokens,
        num_components=2,
        early_exaggeration=12,
        verbose=1,
        perplexity=candidate_perplexity,
        method="barnes_hut",
    )
    candidate_blocks = hierarchy_cluster(
        candidate_embeddings, {'num_clusters': sweep_num_clusters}
    )
    calc_index(candidate_blocks, table, pairs)

>>>>>>>>>>>>>>>>>>>>: 30
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 30
early_exaggeration: 12
clustering with NUM_CLUSTERS = 29, 
(RR) Reduction ratio is: 0.9620762294641738
(PC) Pair completeness is: 0.875
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9164744361497591
(PQ) Pair quality - Precision is: 0.47320220298977184
(FM) Fmeasure is: 0.61422823178578
>>>>>>>>>>>>>>>>>>>>: 35
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 35
early_exaggeration: 12
clustering with NUM_CLUSTERS = 29, 
(RR) Reduction ratio is: 0.9624832162699242
(PC) Pair completeness is: 0.87243947858473
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9152520241901533
(PQ) Pair quality - Precision is: 0.47693580199783675
(FM) Fmeasure is: 0.6167263153564524
>>>>>>>>>>>>>>>>>>>>: 40
starting dimension: 4096
setting TSNE with n_components: 2 & perplexity: 40
early_exaggeration: 12
clustering with NUM_CLUSTERS = 29, 
(RR) Reduction ratio is: 0.963588

# Result

In [28]:
# Census dataset: metrics copied from the evaluation outputs above, truncated to 4 decimals.
# Each list is [RR, PC, alpha]; the third value matches the "(RM) Reference metric
# (Harmonic mean RR and PC)" line printed by calc_index.
# Trailing comments give the elapsed seconds printed for the TSNE and BLOCKS
# steps and, where applicable, the number of clusters used.
# NOTE(review): the run behind list_embedding is not shown in this notebook --
# presumably blocking without dimensionality reduction; confirm.
list_embedding = [0.8945, 0.9542, 0.9234]

list_hierarchy = [0.9006, 0.9622, 0.9304] # TSNE: 19.064, BLOCKS: 0.021, n_cluster: 11
list_kmeans = [0.9203, 0.8895, 0.9046] # TSNE: 19.064, BLOCKS: 13.915, n_cluster: 13
list_birch = [0.9035, 0.8924, 0.8979] # TSNE: 19.064, BLOCKS: 0.1053, n_cluster: 12
# NOTE(review): the DBSCAN cell above has no saved output, so these numbers cannot be cross-checked here.
list_DBscan = [0.9633, 0.8717, 0.9152] # TSNE: 19.064, BLOCKS: 0.011