<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Import-dataset" data-toc-modified-id="Import-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import dataset</a></span></li><li><span><a href="#Sentence-embedding" data-toc-modified-id="Sentence-embedding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence embedding</a></span><ul class="toc-item"><li><span><a href="#Infersent" data-toc-modified-id="Infersent-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Infersent</a></span></li></ul></li><li><span><a href="#Dimensionality-reduction" data-toc-modified-id="Dimensionality-reduction-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Dimensionality reduction</a></span><ul class="toc-item"><li><span><a href="#PCA" data-toc-modified-id="PCA-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>PCA</a></span></li></ul></li><li><span><a href="#Blocking" data-toc-modified-id="Blocking-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Blocking</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#bilstm,-rnn_dim-2048" data-toc-modified-id="bilstm,-rnn_dim-2048-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>bilstm, rnn_dim 2048</a></span></li><li><span><a href="#Searching" data-toc-modified-id="Searching-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Searching</a></span></li></ul></li><li><span><a href="#Result" data-toc-modified-id="Result-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Result</a></span></li></ul></div>

# Setup 

In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

# Put the directory two levels above the notebook's working directory on
# sys.path (once), so the project packages imported below can be resolved.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local import: only resolvable after the sys.path tweak above.
from preprocessing_datasets.preprocessing_utilities import get_labels_by

# Accumulates the configuration of each pipeline stage as the notebook runs.
params = dict()

# Import dataset

In [2]:
# Load the DBLP-ACM dataset through the project's cleaning helper.
# NOTE(review): `table` looks like the record table and `pairs` the reference
# matches later passed to calc_index -- confirm against clean_dblp_acm.
from preprocessing_datasets.preprocessing_dblp_acm import clean_dblp_acm
table, pairs = clean_dblp_acm()

In [3]:
# Show the loaded table as the cell output (the notebook truncates the middle rows).
table

Unnamed: 0,title,authors,venue,year
0,semantic integration of environmental models f...,d. scott mackay,sigmod record,1999
1,estimation of query-result distribution and it...,"viswanath poosala, yannis e. ioannidis",vldb,1996
2,incremental maintenance for non-distributive a...,"themistoklis palpanas, richard sidle, hamid pi...",vldb,2002
3,cost-based selection of path expression proces...,"zhao-hui tang, georges gardarin, jean-robert g...",vldb,1996
4,benchmarking spatial join operations with spat...,"erik g. hoel, hanan samet",vldb,1995
...,...,...,...,...
4905,dual-buffering strategies in object bases,"alfons kemper, donald kossmann",very large data bases,1994
4906,guest editorial,"philip a. bernstein, yannis ioannidis, raghu r...",the vldb journal &mdash; the international jou...,2003
4907,graphdb: modeling and querying graphs in datab...,ralf hartmut g&#252;ting,very large data bases,1994
4908,review of the data warehouse toolkit: the comp...,alexander a. anisimov,acm sigmod record,2003


# Sentence embedding

## Infersent

Set up the InferSent model

In [4]:
from embedding_algorithms.inferSent import set_RNN_embedding

# InferSent encoder configuration; these names are reused by the embedding cell.
model_type = "bilstm"
char_level = False
model_version = 2
rnn_dim = 2048
verbose = 1

# Time the model setup and report the wall-clock duration in seconds.
start = time.time()
set_RNN_embedding(model_type, char_level, model_version, rnn_dim, verbose)
print("TIME: {}".format(time.time() - start))

# Record exactly the settings that were passed to set_RNN_embedding.
params['embedding'] = dict(
    name='Infersent',
    model_type=model_type,
    char_level=char_level,
    model_version=model_version,
    rnn_dim=rnn_dim,
    verbose=verbose,
)

Vocab size : 2196017
TIME: 160.28073811531067


Compute the sentence embeddings for the selected attributes

In [5]:
from embedding_algorithms.inferSent import RNN_embedding

# Attributes of `table` that are fed to the encoder; logged for the final report.
attr_list = ['title', 'authors']
params['attr_list'] = attr_list

# Embed the selected attributes and hold the result as a NumPy array
# for the dimensionality-reduction step.
embeddings_tokens = np.array(
    RNN_embedding(table, attr_list, model_type, char_level)
)

attrs: ['title', 'authors']


# Dimensionality reduction

## PCA

In [6]:
from dimensionality_reduction_algorithms.pca import pca_dim_reduction

# Single source of truth for the PCA settings: the same values are passed to
# pca_dim_reduction and recorded in `params`, so the logged configuration
# cannot drift from the one that actually ran.
num_components = 2
pca_verbose = 1  # kept separate from the InferSent `verbose` flag

# Time the reduction and report the wall-clock duration in seconds.
start = time.time()
pca_embeddings = pca_dim_reduction(
    embeddings_tokens,
    num_components=num_components,
    verbose=pca_verbose)

print("PCA: {0}".format(time.time() - start))

params['reduction'] = {
    'name': 'PCA',
    'num_components': num_components,
    'verbose': pca_verbose,
}

starting dimension: 4096
setting PCA with n_components: 2
PCA: 0.6182851791381836


# Blocking

In [16]:
from cluster_algorithms.hierarchy_cluster import hierarchy_cluster

# Number of clusters (blocks) requested from the hierarchical clustering.
num_clusters = 10

# Cluster the PCA-reduced embeddings and report the wall-clock duration in seconds.
start = time.time()
blocks = hierarchy_cluster(pca_embeddings, dict(num_clusters=num_clusters))
print("BLOCKS: {}".format(time.time() - start))

# Record the blocking configuration that produced `blocks`.
params['blocking'] = dict(name='hierarchy', num_clusters=num_clusters)

clustering with NUM_CLUSTERS = 10, 
BLOCKS: 0.5895111560821533


# Evaluation

## bilstm, rnn_dim 2048

In [17]:
from evaluation import calc_index

# Print the blocking metrics for the current `blocks`.
calc_index(blocks, table, pairs)

# Blank separator line, then the configuration of every stage, in pipeline order.
print()
for stage in ('attr_list', 'embedding', 'reduction', 'blocking'):
    print(params[stage])

(RR) Reduction ratio is: 0.869636093811649
(PC) Pair completeness is: 0.7351618705035972
(RM) Reference metric (Harmonic mean RR and PC) is: 0.7967648409334909
(PQ) Pair quality - Precision is: 0.0010406767772499783
(FM) Fmeasure is: 0.002078411407237067

['title', 'authors']
{'name': 'Infersent', 'model_type': 'bilstm', 'char_level': False, 'model_version': 2, 'rnn_dim': 2048, 'verbose': 1}
{'name': 'PCA', 'num_components': 2, 'verbose': 1}
{'name': 'hierarchy', 'num_clusters': 10}


## Searching

In [15]:
# Sweep the number of clusters (1, 6, 11, ..., 46) and print the blocking
# metrics for each setting.
# Loop-local names are used on purpose: rebinding `blocks` / `num_clusters`
# here would leave them out of sync with the run recorded in
# params['blocking'] and evaluated in the previous section.
for candidate_num_clusters in range(1, 50, 5):
    candidate_blocks = hierarchy_cluster(
        pca_embeddings, {'num_clusters': candidate_num_clusters})
    calc_index(candidate_blocks, table, pairs)

clustering with NUM_CLUSTERS = 1, 
(RR) Reduction ratio is: 0.0
(PC) Pair completeness is: 1.0
(RM) Reference metric (Harmonic mean RR and PC) is: 0
(PQ) Pair quality - Precision is: 0.00018453988870352844
(FM) Fmeasure is: 0.0003690116800326934
clustering with NUM_CLUSTERS = 6, 
(RR) Reduction ratio is: 0.75905488028763
(PC) Pair completeness is: 0.7877697841726619
(RM) Reference metric (Harmonic mean RR and PC) is: 0.7731458037333901
(PQ) Pair quality - Precision is: 0.0006033529480438037
(FM) Fmeasure is: 0.0012057823872495394
clustering with NUM_CLUSTERS = 11, 
(RR) Reduction ratio is: 0.8705478403481033
(PC) Pair completeness is: 0.7342625899280576
(RM) Reference metric (Harmonic mean RR and PC) is: 0.7966183417692245
(PQ) Pair quality - Precision is: 0.001046724419189734
(FM) Fmeasure is: 0.002090468780903382
clustering with NUM_CLUSTERS = 16, 
(RR) Reduction ratio is: 0.9241639799545205
(PC) Pair completeness is: 0.6933453237410072
(RM) Reference metric (Harmonic mean RR and PC)

# Result

In [None]:
# DBLP_ACM -- summary of the hierarchy-blocking runs.
# Each list holds, in order: RR, PC, alpha.
# attrs: ['title', 'authors']

# t-SNE + hierarchy.
# NOTE(review): these figures are not produced by the cells of this notebook;
# presumably copied from a separate t-SNE run -- confirm the source.
list_hierarchy_tsne = [
    0.9990,
    0.9775,
    0.9881,
]

# PCA + hierarchy with n_cluster: 10 -- the RR / PC / RM values printed in the
# Evaluation section, cut to four decimals.
# Timings in seconds -- PCA: 0.6182, BLOCKS: 0.589
list_hierarchy_pca = [
    0.8696,
    0.7351,
    0.7967,
]
