# IMPORT LIBRARY

In [6]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt
module_path = os.path.abspath(os.path.join('../../fusion'))
sys.path.append(module_path)

from preprocessing_datasets.preprocessing_utilities import ValueUtils
from preprocessing_datasets import load_dataset
from embedding_algorithms import sentence_embedding, set_embedding_model
from dimensionality_reduction_algorithms import dimension_reduction_algorithms
from cluster_algorithms import cluster_algorithm

from helper import load_by_index, get_author_candidates, getFinalAuthors, launchWithoutReductionFusion
from plot_tools import plotChart, plotCluster

# SETUP INFERSENT

In [7]:
# Settings for the single-book walkthrough; the same dict is handed to every pipeline call below.
key_values = dict(
    # InferSent encoder options (set_embedding_model echoes these in its output).
    model_type='bilstm',
    char_level=False,
    model_version=2,
    rnn_dim=1024,
    # Verbosity flag forwarded to load_by_index and get_author_candidates.
    verbose=1,
    # Presumably the columns fed to the sentence embedding -- TODO confirm in sentence_embedding.
    attributes_list=['author', 'title', 'big_cate', 'small_cate'],
    embedding_type='inferSent',
    dataset='merged_book-multiAuthors',
    # NOTE(review): no reduction function is called in the visible cells; confirm these two keys are still read.
    dimension_reduction='pca',
    num_components=2,
    cluster_method='hierarchy',
    num_clusters=10,
    # Multiplied by len(embeddings_tokens) to get the cut-off passed to get_author_candidates.
    block_length_thresold=0.2,
)

In [8]:
# Initialise the embedding model from key_values. This is slow: the recorded run
# below reports a setup time of roughly 165 seconds.
set_embedding_model(key_values)

Vocab size : 2196017
model_version: 2
rnn_dim: 1024
model_type: bilstm
char_level: False
Setup time is: 164.8251781463623


# LOAD DATASET AND RUN THE PIPELINE ON ONE BOOK

In [4]:
# Load the dataset selected by key_values, then pick the book at index 0 together
# with its list of author strings and its golden-truth author(s).
dataset_name, table_group_by_isbn, isbn_list, true_authors = load_dataset(key_values)
table_ISBN, list_authors, true_author = load_by_index(table_group_by_isbn, isbn_list, true_authors, 0, key_values['verbose'])

dataset: merged_book-multiAuthors
Loading time is: 5.196461915969849
ISBN: 1878685546
true author: Alan Shepard, Deke Slayton


In [6]:
# Embed the selected book's table using the embedding settings in key_values.
embeddings_tokens = sentence_embedding(table_ISBN, key_values)

embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate', 'small_cate']
model_type: bilstm
char_level: False
Embedding time is: 4.089197158813477


In [7]:
# Group the embeddings into blocks; cluster_method and num_clusters are read from key_values.
blocks = cluster_algorithm(embeddings_tokens, key_values)

cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.4616127014160156


In [9]:
# Split the blocks into kept and discarded candidates. The third argument is the
# cut-off: block_length_thresold times the number of embeddings (reported as
# 'lengthNecessary' in the verbose output).
listCandidates = get_author_candidates(list_authors, blocks, key_values['block_length_thresold'] * len(embeddings_tokens), key_values['verbose'])

Discarted candidate: [{'alan': 1, 'barbree jay': 86, 'jr': 2}, {'benedict howard': 85, 'benedict howard shepard': 1}, {'alan b shepard': 1, 'alan shepard': 21, 'alan shepherd': 1, 'armstrong neil': 1, 'barbree jay': 1}, {'alan barbree~howard benedict shepard~deke slayton~jay': 1, 'alan shepard': 6, 'armstrong neil': 2, 'barbree jay': 2, 'benedict howard': 2}, {'deke slayton': 8, 'deke slayton w': 1}, {'adams ben': 1, 'alan shepard': 5, 'alan shpard': 1, 'armstrong intro neil': 1}, {'alan shepard': 1, 'nan': 6}, {'armstrong neil': 7}]
Possible candidate: [{'armstrong deke neil slayton': 1, 'deke slayton': 113}, {'alan b shepard': 1, 'alan shepard': 90, 'alan shepherd': 5}]
lengthNecessary: 90.80000000000001


In [10]:
# Show the kept candidate blocks, then the authors chosen from them next to the golden truth.
print(listCandidates)
print("{0} VS true_author: {1}".format(getFinalAuthors(listCandidates), true_author))

[{'armstrong deke neil slayton': 1, 'deke slayton': 113}, {'alan b shepard': 1, 'alan shepard': 90, 'alan shepherd': 5}]
['deke slayton', 'alan shepard'] VS true_author: Alan Shepard, Deke Slayton


# Testing the full launch (all books) and scoring the result

In [9]:
# Settings for the all-books launch: same pipeline as the walkthrough, but silent (verbose=0).
key_values = dict(
    # InferSent encoder options.
    model_type='bilstm',
    char_level=False,
    model_version=2,
    rnn_dim=1024,
    # 0 suppresses the per-book prints inside launchWithoutReductionFusion.
    verbose=0,
    # Presumably the columns fed to the sentence embedding -- TODO confirm in sentence_embedding.
    attributes_list=['author', 'title', 'big_cate', 'small_cate'],
    embedding_type='inferSent',
    dataset='merged_book-multiAuthors',
    cluster_method='hierarchy',
    num_clusters=10,
    # Multiplied by len(embeddings_tokens) to get the cut-off passed to get_author_candidates.
    block_length_thresold=0.2,
)

In [10]:
# Reload the dataset using the configuration defined in the previous cell.
dataset_name, table_group_by_isbn, isbn_list, true_authors = load_dataset(key_values)

In [22]:
def launchWithoutReductionFusion(tableGroupByISBN, list_ISBN_10, golden_true, key_values):
    """Run embedding -> clustering -> candidate selection for every book.

    NOTE(review): this definition shadows the function of the same name
    imported from ``helper`` at the top of the notebook, and it calls
    ``filterGoldenTruth``, which is defined in a later cell; that cell has
    to be executed before this function is called.

    Parameters
    ----------
    tableGroupByISBN, list_ISBN_10, golden_true
        Forwarded unchanged to ``load_by_index``; one book is processed
        per position of ``list_ISBN_10``.
    key_values : dict
        Pipeline configuration. ``'verbose'`` and
        ``'block_length_thresold'`` are read here; the whole dict is also
        passed to ``sentence_embedding`` and ``cluster_algorithm``.

    Returns
    -------
    tuple of (list, list)
        The predicted authors per book and the cleaned golden-truth
        authors per book, in the same order as ``list_ISBN_10``.
    """
    predicted_per_book, golden_per_book = [], []

    for position in range(len(list_ISBN_10)):
        book_table, author_strings, raw_truth = load_by_index(
            tableGroupByISBN, list_ISBN_10, golden_true, position, key_values['verbose']
        )
        vectors = sentence_embedding(book_table, key_values)
        clusters = cluster_algorithm(vectors, key_values)

        # A block must reach this size (a fraction of all embedded rows) to be kept.
        size_cutoff = key_values['block_length_thresold'] * len(vectors)
        candidates = get_author_candidates(
            author_strings, clusters, size_cutoff, key_values['verbose']
        )

        predicted = getFinalAuthors(candidates)
        golden = filterGoldenTruth(raw_truth)

        if key_values['verbose'] > 0:
            print(candidates)
            print("{0} VS true_author: {1}".format(predicted, golden))

        predicted_per_book.append(predicted)
        golden_per_book.append(golden)

    return predicted_per_book, golden_per_book

In [23]:
from fuzzywuzzy import fuzz

def getTPFNFP(predAuthors, trueAuthors, threshold=90, scorer=None):
    """Count true positives, false positives and false negatives over all books.

    For every book the predicted author names are matched one-to-one against
    the true author names: a prediction is a true positive when its similarity
    to a still-unmatched true name is strictly greater than ``threshold``.
    Each true name can be matched only once, so two spellings of the same
    author can no longer both count as hits (which previously inflated TP and
    could make FN too small or even negative).

    Parameters
    ----------
    predAuthors : sequence of sequences of str
        Predicted author names, one inner sequence per book.
    trueAuthors : sequence of sequences of str
        Golden-truth author names, aligned with ``predAuthors``.
    threshold : int or float, optional
        Similarity a pair must exceed to count as a match (default 90).
    scorer : callable, optional
        ``scorer(pred, true) -> number``. Defaults to ``fuzz.ratio`` from
        fuzzywuzzy, which the notebook imports just above this function.

    Returns
    -------
    tuple of int
        ``(TP, FP, FN)`` -- note the order, which differs from the function name.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of books.
    """
    if len(predAuthors) != len(trueAuthors):
        raise ValueError("size not compatible")
    if scorer is None:
        scorer = fuzz.ratio

    TP, FP, FN = 0, 0, 0
    for predCandidates, trueCandidates in zip(predAuthors, trueAuthors):
        unmatched = list(trueCandidates)  # real copy: entries are removed as they get matched
        matched = 0
        for predCandidate in predCandidates:
            # Pick the best-scoring true name that is still free and above the threshold.
            best_pos, best_score = None, threshold
            for pos, trueCandidate in enumerate(unmatched):
                score = scorer(predCandidate, trueCandidate)
                if score > best_score:
                    best_pos, best_score = pos, score
            if best_pos is not None:
                del unmatched[best_pos]
                matched += 1
        TP += matched
        FP += len(predCandidates) - matched
        FN += len(trueCandidates) - matched

    return TP, FP, FN

In [69]:
def filterGoldenTruth(authors):
    """Turn a raw golden-truth author value into a list of cleaned names.

    The value is first split with ``ValueUtils.split_values`` and every piece
    is then passed through ``ValueUtils.clean_value``. In the recorded outputs
    'Alan Shepard, Deke Slayton' ends up as ['alan shepard', 'deke slayton'].
    """
    cleaned_names = []
    for raw_name in ValueUtils.split_values(authors):
        cleaned_names.append(ValueUtils.clean_value(raw_name))
    return cleaned_names

In [12]:
# Run the whole pipeline over every book in isbn_list.
# NOTE(review): launchWithoutReductionFusion is both imported from helper and
# redefined in this notebook; which version runs depends on cell execution order.
finalAuthors, trueAuthors = launchWithoutReductionFusion(table_group_by_isbn, isbn_list, true_authors, key_values)

In [13]:
finalAuthors

[['deke slayton', 'alan shepard'],
 ['carter jimmy', 'carter rosalynn'],
 ['petrov vadim', 'igor lysenko'],
 ['a byrne john', 'jack welch'],
 ['barbara kingsolver', 'camille kingsolver', 'hopp l steven'],
 ['colin l powell', 'e joseph persico'],
 ['barbara kingsolver', 'hopp l steven', 'camille kingsolver'],
 ['bowles sheldon', 'blanchard ken'],
 ['bob hope']]

In [14]:
trueAuthors

[['alan shepard', 'deke slayton'],
 ['carter jimmy', 'carter rosalynn'],
 ['petrov vadim', 'igor lysenho', 'egorgy georgy'],
 ['jack welch', 'a byrne john'],
 ['barbara kingsolver', 'hopp l steven', 'camille kingsolver'],
 ['colin powell', 'e joseph persico'],
 ['barbara kingsolver', 'hopp l steven', 'camille kingsolver'],
 ['blanchard ken', 'bowles sheldon'],
 ['bob hope', 'melville shavelson']]

In [27]:
# getTPFNFP returns (TP, FP, FN) in that order, despite its name.
# The print below shows the values as TP, FN, FP -- a different order from the unpacking.
TP, FP, FN = getTPFNFP(finalAuthors, trueAuthors)
print(TP,FN,FP)

19 2 0


In [28]:
FN

2

In [37]:
def _getPrecision(TP, FP):
    return TP / (TP + FP)

def _getRecall(TP, FN):
    return TP / (TP + FN)

def _getF1Score(precision, recall):
    return (2*precision*recall)/(precision+recall)

In [38]:
# Precision of the launch: share of predicted authors that matched a true author.
precision = _getPrecision(TP, FP)

In [39]:
# Recall of the launch: share of true authors that were predicted.
recall = _getRecall(TP, FN)

In [40]:
# Display the F1 score computed from the precision and recall above.
_getF1Score(precision, recall)

0.9500000000000001

# Testing different attribute lists

## ['author','title','big_cate', 'small_cate', 'publisher']

In [12]:
# Configuration for the run that also includes the 'publisher' attribute.
key_values = dict(
    # InferSent encoder options.
    model_type='bilstm',
    char_level=False,
    model_version=2,
    rnn_dim=1024,
    # Verbosity flag forwarded to load_by_index and get_author_candidates.
    verbose=1,
    # The setting under test in this section.
    attributes_list=['author', 'title', 'big_cate', 'small_cate', 'publisher'],
    embedding_type='inferSent',
    dataset='merged_book-multiAuthors',
    cluster_method='hierarchy',
    num_clusters=10,
    # Multiplied by len(embeddings_tokens) to get the cut-off passed to get_author_candidates.
    block_length_thresold=0.2,
)

In [13]:
# Run the blocking pipeline on every book with the current key_values and print,
# for each book, the kept candidate blocks and the chosen authors next to the golden truth.
# The loop bound follows isbn_list instead of a hard-coded 9.
for index in range(len(isbn_list)):
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>book number {0}<<<<<<<<<<<<<<<<<<<<<<<<".format(index+1))
    table_ISBN, list_authors, true_author = load_by_index(table_group_by_isbn, isbn_list, true_authors, index, key_values['verbose'])
    # Number of distinct author spellings recorded for this book.
    print("tot cases of author's name: {0}".format(len(table_ISBN['author'].value_counts())))
    embeddings_tokens = sentence_embedding(table_ISBN, key_values)
    blocks = cluster_algorithm(embeddings_tokens, key_values)
    # Cut-off = block_length_thresold times the number of embeddings.
    listCandidates = get_author_candidates(list_authors, blocks, key_values['block_length_thresold'] * len(embeddings_tokens), key_values['verbose'])
    print(listCandidates)
    print("{0} VS true_author: {1}".format(getFinalAuthors(listCandidates), true_author))

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>book number 1<<<<<<<<<<<<<<<<<<<<<<<<
ISBN: 1878685546
true author: Alan Shepard, Deke Slayton
tot cases of author's name: 16
embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate', 'small_cate', 'publisher']
model_type: bilstm
char_level: False
Embedding time is: 4.4399168491363525
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.23854374885559082
Discarted candidate: [{'alan': 1, 'barbree jay': 83, 'jr': 2}, {'benedict howard': 84, 'benedict howard shepard': 1}, {'alan shepard': 15, 'barbree jay': 3}, {'alan shepard': 7, 'armstrong intro neil': 1, 'armstrong neil': 1, 'barbree jay': 1, 'benedict howard': 1, 'deke slayton': 1, 'nan': 2}, {'alan barbree~howard benedict shepard~deke slayton~jay': 1, 'alan shepard': 6, 'armstrong neil': 2, 'barbree jay': 2, 'benedict howard': 2}, {'armstrong neil': 7, 'nan': 4}, {'deke slayton': 8,

embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate', 'small_cate', 'publisher']
model_type: bilstm
char_level: False
Embedding time is: 3.043212890625
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.1187748908996582
Discarted candidate: [{'barbara kingsolver': 4, 'camille kingsolver': 58}, {'barbara kingsolver': 58, 'kingsolver': 2}, {'barbara kingsolver': 29, 'camille kingsolver': 23, 'hopp l steven': 1}, {'a houser richard': 1, 'camille kin': 1, 'hopp l steven': 24}, {'barbara kingsolver': 16, 'camille kingsolver': 6}, {'barbara kingsolver': 5, 'camille kingsolver': 4, 'kingsolver l steven': 1}, {'barbara kingsolver': 3, 'camille': 1, 'hopp steven': 1, 'l': 1, 'nan': 1}, {'nan': 7}, {'barbara hopp': 1, 'hopp l steven': 3}]
Possible candidate: [{'a houser richard': 2, 'barbara kingslover': 1, 'harper perennial': 1, 'hopp l steven': 64, 'l steven': 1}]
lengthNecessary: 64.0
[{'a houser richard': 2, 'barbara kingslover': 1, 'harper perennial': 1, 'ho

## ['author','title','big_cate', 'small_cate']

In [14]:
# Configuration for the run without 'publisher' (author, title and both category attributes).
key_values = dict(
    # InferSent encoder options.
    model_type='bilstm',
    char_level=False,
    model_version=2,
    rnn_dim=1024,
    # Verbosity flag forwarded to load_by_index and get_author_candidates.
    verbose=1,
    # The setting under test in this section.
    attributes_list=['author', 'title', 'big_cate', 'small_cate'],
    embedding_type='inferSent',
    dataset='merged_book-multiAuthors',
    cluster_method='hierarchy',
    num_clusters=10,
    # Multiplied by len(embeddings_tokens) to get the cut-off passed to get_author_candidates.
    block_length_thresold=0.2,
)

In [15]:
# Run the blocking pipeline on every book with the current key_values and print,
# for each book, the kept candidate blocks and the chosen authors next to the golden truth.
# The loop bound follows isbn_list instead of a hard-coded 9.
for index in range(len(isbn_list)):
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>book number {0}<<<<<<<<<<<<<<<<<<<<<<<<".format(index+1))
    table_ISBN, list_authors, true_author = load_by_index(table_group_by_isbn, isbn_list, true_authors, index, key_values['verbose'])
    # Number of distinct author spellings recorded for this book.
    print("tot cases of author's name: {0}".format(len(table_ISBN['author'].value_counts())))
    embeddings_tokens = sentence_embedding(table_ISBN, key_values)
    blocks = cluster_algorithm(embeddings_tokens, key_values)
    # Cut-off = block_length_thresold times the number of embeddings.
    listCandidates = get_author_candidates(list_authors, blocks, key_values['block_length_thresold'] * len(embeddings_tokens), key_values['verbose'])
    print(listCandidates)
    print("{0} VS true_author: {1}".format(getFinalAuthors(listCandidates), true_author))

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>book number 1<<<<<<<<<<<<<<<<<<<<<<<<
ISBN: 1878685546
true author: Alan Shepard, Deke Slayton
tot cases of author's name: 16
embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate', 'small_cate']
model_type: bilstm
char_level: False
Embedding time is: 3.5261456966400146
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.23763108253479004
Discarted candidate: [{'alan': 1, 'barbree jay': 86, 'jr': 2}, {'benedict howard': 85, 'benedict howard shepard': 1}, {'alan b shepard': 1, 'alan shepard': 21, 'alan shepherd': 1, 'armstrong neil': 1, 'barbree jay': 1}, {'alan barbree~howard benedict shepard~deke slayton~jay': 1, 'alan shepard': 6, 'armstrong neil': 2, 'barbree jay': 2, 'benedict howard': 2}, {'deke slayton': 8, 'deke slayton w': 1}, {'adams ben': 1, 'alan shepard': 5, 'alan shpard': 1, 'armstrong intro neil': 1}, {'alan shepard': 

embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate', 'small_cate']
model_type: bilstm
char_level: False
Embedding time is: 2.759558916091919
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.25157999992370605
Discarted candidate: [{'barbara kingsolver': 28}, {'a houser richard': 1, 'camille kin': 1, 'hopp l steven': 25}, {'barbara kingsolver': 1, 'camille kingsolver': 23}, {'barbara kingsolver': 6, 'camille kingsolver': 4, 'kingsolver l steven': 1}, {'barbara kingsolver': 3, 'camille': 1, 'hopp steven': 1, 'l': 1, 'nan': 1}, {'nan': 7}, {'a houser richard': 1, 'barbara hopp': 1, 'hopp l steven': 3}]
Possible candidate: [{'barbara kingsolver': 77, 'kingsolver': 2}, {'a houser richard': 1, 'barbara kingslover': 1, 'harper perennial': 1, 'hopp l steven': 64, 'l steven': 1}, {'camille kingsolver': 64}]
lengthNecessary: 64.0
[{'barbara kingsolver': 77, 'kingsolver': 2}, {'a houser richard': 1, 'barbara kingslover': 1, 'harper perennial': 1, 'hopp l steven'

## ['author','title','big_cate']

In [16]:
# Configuration for the run restricted to author, title and big_cate.
key_values = {
    # InferSent encoder options.
    'model_type': 'bilstm',
    'char_level': False,
    'model_version': 2,
    'rnn_dim': 1024,
    # Verbosity flag forwarded to load_by_index and get_author_candidates.
    'verbose': 1,
    # The setting under test in this section.
    'attributes_list': ['author', 'title', 'big_cate'],
    'embedding_type': 'inferSent',
    'dataset': 'merged_book-multiAuthors',
    'cluster_method': 'hierarchy',
    'num_clusters': 10,
    # Multiplied by len(embeddings_tokens) to get the cut-off passed to get_author_candidates.
    'block_length_thresold': 0.2,
}

# Run the blocking pipeline on every book and print, for each one, the kept candidate
# blocks and the chosen authors next to the golden truth.
# The loop bound follows isbn_list instead of a hard-coded 9.
for index in range(len(isbn_list)):
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>book number {0}<<<<<<<<<<<<<<<<<<<<<<<<".format(index+1))
    table_ISBN, list_authors, true_author = load_by_index(table_group_by_isbn, isbn_list, true_authors, index, key_values['verbose'])
    # Number of distinct author spellings recorded for this book.
    print("tot cases of author's name: {0}".format(len(table_ISBN['author'].value_counts())))
    embeddings_tokens = sentence_embedding(table_ISBN, key_values)
    blocks = cluster_algorithm(embeddings_tokens, key_values)
    # Cut-off = block_length_thresold times the number of embeddings.
    listCandidates = get_author_candidates(list_authors, blocks, key_values['block_length_thresold'] * len(embeddings_tokens), key_values['verbose'])
    print(listCandidates)
    print("{0} VS true_author: {1}".format(getFinalAuthors(listCandidates), true_author))

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>book number 1<<<<<<<<<<<<<<<<<<<<<<<<
ISBN: 1878685546
true author: Alan Shepard, Deke Slayton
tot cases of author's name: 16
embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate']
model_type: bilstm
char_level: False
Embedding time is: 3.991178035736084
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.24894404411315918
Discarted candidate: [{'alan': 1, 'barbree jay': 86, 'jr': 2}, {'benedict howard': 85, 'benedict howard shepard': 1}, {'alan b shepard': 1, 'alan shepard': 21, 'alan shepherd': 1, 'armstrong neil': 1, 'barbree jay': 1}, {'alan barbree~howard benedict shepard~deke slayton~jay': 1, 'alan shepard': 6, 'armstrong neil': 2, 'barbree jay': 2, 'benedict howard': 2}, {'deke slayton': 8, 'deke slayton w': 1}, {'adams ben': 1, 'alan shepard': 5, 'alan shpard': 1, 'armstrong intro neil': 1}, {'alan shepard': 1, 'nan': 6}, {

embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate']
model_type: bilstm
char_level: False
Embedding time is: 2.3745601177215576
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.1181192398071289
Discarted candidate: [{'barbara kingsolver': 29}, {'hopp l steven': 25}, {'camille kingsolver': 23}, {'barbara kingsolver': 6, 'camille kingsolver': 4, 'kingsolver l steven': 1}, {'barbara kingsolver': 3, 'camille': 1, 'hopp steven': 1, 'l': 1, 'nan': 1}, {'nan': 7}, {'a houser richard': 1, 'barbara hopp': 1, 'hopp l steven': 3}]
Possible candidate: [{'barbara kingsolver': 77, 'kingsolver': 2}, {'a houser richard': 2, 'barbara kingslover': 1, 'camille kin': 1, 'harper perennial': 1, 'hopp l steven': 64, 'l steven': 1}, {'camille kingsolver': 64}]
lengthNecessary: 64.0
[{'barbara kingsolver': 77, 'kingsolver': 2}, {'a houser richard': 2, 'barbara kingslover': 1, 'camille kin': 1, 'harper perennial': 1, 'hopp l steven': 64, 'l steven': 1}, {'camille kingsolver':

## ['author','title']

In [17]:
# Configuration for the run restricted to author and title.
key_values = {
    # InferSent encoder options.
    'model_type': 'bilstm',
    'char_level': False,
    'model_version': 2,
    'rnn_dim': 1024,
    # Verbosity flag forwarded to load_by_index and get_author_candidates.
    'verbose': 1,
    # The setting under test in this section.
    'attributes_list': ['author', 'title'],
    'embedding_type': 'inferSent',
    'dataset': 'merged_book-multiAuthors',
    'cluster_method': 'hierarchy',
    'num_clusters': 10,
    # Multiplied by len(embeddings_tokens) to get the cut-off passed to get_author_candidates.
    'block_length_thresold': 0.2,
}

# Run the blocking pipeline on every book and print, for each one, the kept candidate
# blocks and the chosen authors next to the golden truth.
# The loop bound follows isbn_list instead of a hard-coded 9.
for index in range(len(isbn_list)):
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>book number {0}<<<<<<<<<<<<<<<<<<<<<<<<".format(index+1))
    table_ISBN, list_authors, true_author = load_by_index(table_group_by_isbn, isbn_list, true_authors, index, key_values['verbose'])
    # Number of distinct author spellings recorded for this book.
    print("tot cases of author's name: {0}".format(len(table_ISBN['author'].value_counts())))
    embeddings_tokens = sentence_embedding(table_ISBN, key_values)
    blocks = cluster_algorithm(embeddings_tokens, key_values)
    # Cut-off = block_length_thresold times the number of embeddings.
    listCandidates = get_author_candidates(list_authors, blocks, key_values['block_length_thresold'] * len(embeddings_tokens), key_values['verbose'])
    print(listCandidates)
    print("{0} VS true_author: {1}".format(getFinalAuthors(listCandidates), true_author))

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>book number 1<<<<<<<<<<<<<<<<<<<<<<<<
ISBN: 1878685546
true author: Alan Shepard, Deke Slayton
tot cases of author's name: 16
embedding_type: inferSent
attributes_list: ['author', 'title']
model_type: bilstm
char_level: False
Embedding time is: 3.7854409217834473
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.25205302238464355
Discarted candidate: [{'alan': 1, 'barbree jay': 86, 'jr': 2}, {'benedict howard': 85, 'benedict howard shepard': 1}, {'alan b shepard': 1, 'alan shepard': 21, 'alan shepherd': 1, 'armstrong neil': 1, 'barbree jay': 1}, {'alan barbree~howard benedict shepard~deke slayton~jay': 1, 'alan shepard': 6, 'armstrong neil': 2, 'barbree jay': 2, 'benedict howard': 2}, {'deke slayton': 8, 'deke slayton w': 1}, {'adams ben': 1, 'alan shepard': 5, 'alan shpard': 1, 'armstrong intro neil': 1}, {'alan shepard': 1, 'nan': 6}, {'armstrong 

embedding_type: inferSent
attributes_list: ['author', 'title']
model_type: bilstm
char_level: False
Embedding time is: 1.6197230815887451
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.0664219856262207
Discarted candidate: [{'blanchard': 1, 'blanchard h kenneth': 2, 'blanchard ken': 13, 'harvey mackay': 1}, {'blanchard h kenneth': 13, 'blanchard kenneth': 2}, {'bowles m sheldon': 15}, {'blanchard ken': 6, 'blanchard kenneth': 2, 'bowels sheldon': 1, 'bowles sheldon': 1, 'nan': 1}, {'bowles sheldon': 6, 'nan': 2}, {'blanchard': 3, 'blanchard ken': 2, 'bowles': 1, 'bowles ken sheldon': 1, 'bowles sheldon': 1}, {'nan': 7}, {'foreword harvey mackay': 1, 'foreword-harvey mackay': 1, 'harvey mackay': 4, 'harvey mackay sheldon': 1}]
Possible candidate: [{'blanchard': 1, 'blanchard bowles ken': 1, 'blanchard ken': 69}, {'bowles sheldon': 69}]
lengthNecessary: 45.6
[{'blanchard': 1, 'blanchard bowles ken': 1, 'blanchard ken': 69}, {'bowles sheldon': 69}]
['blanchard ken', 'bowle

## ['author']

In [18]:
# Configuration for the run that uses the author attribute only.
key_values = {
    # InferSent encoder options.
    'model_type': 'bilstm',
    'char_level': False,
    'model_version': 2,
    'rnn_dim': 1024,
    # Verbosity flag forwarded to load_by_index and get_author_candidates.
    'verbose': 1,
    # The setting under test in this section.
    'attributes_list': ['author'],
    'embedding_type': 'inferSent',
    'dataset': 'merged_book-multiAuthors',
    'cluster_method': 'hierarchy',
    'num_clusters': 10,
    # Multiplied by len(embeddings_tokens) to get the cut-off passed to get_author_candidates.
    'block_length_thresold': 0.2,
}

# Run the blocking pipeline on every book and print, for each one, the kept candidate
# blocks and the chosen authors next to the golden truth.
# The loop bound follows isbn_list instead of a hard-coded 9.
for index in range(len(isbn_list)):
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>book number {0}<<<<<<<<<<<<<<<<<<<<<<<<".format(index+1))
    table_ISBN, list_authors, true_author = load_by_index(table_group_by_isbn, isbn_list, true_authors, index, key_values['verbose'])
    # Number of distinct author spellings recorded for this book.
    print("tot cases of author's name: {0}".format(len(table_ISBN['author'].value_counts())))
    embeddings_tokens = sentence_embedding(table_ISBN, key_values)
    blocks = cluster_algorithm(embeddings_tokens, key_values)
    # Cut-off = block_length_thresold times the number of embeddings.
    listCandidates = get_author_candidates(list_authors, blocks, key_values['block_length_thresold'] * len(embeddings_tokens), key_values['verbose'])
    print(listCandidates)
    print("{0} VS true_author: {1}".format(getFinalAuthors(listCandidates), true_author))

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>book number 1<<<<<<<<<<<<<<<<<<<<<<<<
ISBN: 1878685546
true author: Alan Shepard, Deke Slayton
tot cases of author's name: 16
embedding_type: inferSent
attributes_list: ['author']
model_type: bilstm
char_level: False
Embedding time is: 1.4238479137420654
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.2919588088989258
Discarted candidate: [{'barbree jay': 89}, {'alan barbree~howard benedict shepard~deke slayton~jay': 1, 'benedict howard': 87, 'benedict howard shepard': 1}, {'armstrong intro neil': 1, 'armstrong neil': 10}, {'nan': 6}, {'alan shepherd': 6}, {'adams ben': 1, 'alan': 1, 'alan shpard': 1}, {'jr': 2}, {'armstrong deke neil slayton': 1}]
Possible candidate: [{'alan b shepard': 2, 'alan shepard': 123}, {'deke slayton': 121, 'deke slayton w': 1}]
lengthNecessary: 90.80000000000001
[{'alan b shepard': 2, 'alan shepard': 123}, {'deke slayto

embedding_type: inferSent
attributes_list: ['author']
model_type: bilstm
char_level: False
Embedding time is: 0.499845027923584
cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.04325413703918457
Discarted candidate: [{'nan': 2}, {'bob hope w': 2}, {'better homes': 1}, {'gardens': 1}, {'bob hope~melville shavelson': 1}, {'melville shavelson': 1}, {'melville shavelson': 1}, {'melville shavelson': 1}]
Possible candidate: [{'bob hope': 120}, {'melville shavelson': 61}]
lengthNecessary: 38.2
[{'bob hope': 120}, {'melville shavelson': 61}]
['bob hope', 'melville shavelson'] VS true_author: Bob Hope, Melville Shavelson
