# IMPORT LIBRARIES

In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt
# Make the local `fusion` directory (two levels above the working directory)
# importable; the project modules imported below are presumably located there.
# The path is resolved against the current working directory, so the notebook
# must be started from its own folder.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir, 'fusion'))
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

from preprocessing_datasets import load_dataset
from embedding_algorithms import sentence_embedding, set_embedding_model
from dimensionality_reduction_algorithms import dimension_reduction_algorithms
from cluster_algorithms import cluster_algorithm

from helper import load_by_index, get_author_candidates, getFinalAuthors
from plot_tools import plotChart, plotCluster

# SETUP INFERSENT

In [2]:
# Single configuration dict handed to every pipeline helper in this notebook.
key_values = {
    # Encoder settings; set_embedding_model echoes these four in its output.
    'model_type': 'bilstm',
    'char_level': False,
    'model_version': 2,
    'rnn_dim': 1024,

    # Forwarded as the last argument to load_by_index / get_author_candidates.
    'verbose': 1,

    # Record fields and embedding type reported by sentence_embedding.
    'attributes_list': ['author', 'title', 'big_cate', 'small_cate'],
    'embedding_type': 'inferSent',

    # Dataset name reported by load_dataset.
    'dataset': 'merged_book-multiAuthors',

    # NOTE(review): no cell visible in this notebook consumes these two keys
    # (dimension_reduction_algorithms is imported but never called) - confirm
    # whether they are still needed.
    'dimension_reduction': 'pca',
    'num_components': 2,

    # Clustering settings reported by cluster_algorithm.
    'cluster_method': 'hierarchy',
    'num_clusters': 10,

    # Multiplied by len(embeddings_tokens) before being passed to
    # get_author_candidates. The spelling "thresold" is intentional here:
    # the value is looked up under this exact key further down.
    'block_length_thresold': 0.2,
}

In [3]:
# Initialise the embedding model from key_values; the recorded output echoes
# model_version, rnn_dim, model_type and char_level. The saved run logged a
# setup time of ~217 s, so avoid re-running this cell unnecessarily.
set_embedding_model(key_values)

Vocab size : 2196017
model_version: 2
rnn_dim: 1024
model_type: bilstm
char_level: False
Setup time is: 216.9755938053131


# LOAD DATA AND RUN THE PIPELINE

In [5]:
# Load the dataset selected by key_values (the output reports key_values['dataset']).
(
    dataset_name,
    table_group_by_isbn,
    isbn_list,
    true_authors,
) = load_dataset(key_values)

# Pull out a single ISBN entry together with its author list and ground truth.
# NOTE(review): the literal 0 is presumably a position in isbn_list - confirm
# against helper.load_by_index.
table_ISBN, list_authors, true_author = load_by_index(
    table_group_by_isbn,
    isbn_list,
    true_authors,
    0,
    key_values['verbose'],
)

dataset: merged_book-multiAuthors
Loading time is: 6.243867874145508
ISBN: 1878685546
true author: Alan Shepard, Deke Slayton


In [6]:
# Embed the contents of table_ISBN. Per the recorded output, the helper reports
# the embedding_type, attributes_list, model_type and char_level from key_values.
# NOTE(review): presumably yields one embedding per record - confirm in
# embedding_algorithms.sentence_embedding.
embeddings_tokens = sentence_embedding(table_ISBN, key_values)

embedding_type: inferSent
attributes_list: ['author', 'title', 'big_cate', 'small_cate']
model_type: bilstm
char_level: False
Embedding time is: 4.089197158813477


In [7]:
# Cluster the embeddings into blocks; the recorded output shows the helper
# reporting cluster_method and num_clusters taken from key_values.
blocks = cluster_algorithm(embeddings_tokens, key_values)

cluster_method: hierarchy
num_clusters: 10
Blocking time is: 0.4616127014160156


In [9]:
# Filter the blocks down to author candidates.
# The third argument scales the configured fraction by the number of embeddings;
# it is presumably the value logged as "lengthNecessary" in the output - confirm
# against helper.get_author_candidates.
listCandidates = get_author_candidates(
    list_authors,
    blocks,
    key_values['block_length_thresold'] * len(embeddings_tokens),
    key_values['verbose'],
)

Discarted candidate: [{'alan': 1, 'barbree jay': 86, 'jr': 2}, {'benedict howard': 85, 'benedict howard shepard': 1}, {'alan b shepard': 1, 'alan shepard': 21, 'alan shepherd': 1, 'armstrong neil': 1, 'barbree jay': 1}, {'alan barbree~howard benedict shepard~deke slayton~jay': 1, 'alan shepard': 6, 'armstrong neil': 2, 'barbree jay': 2, 'benedict howard': 2}, {'deke slayton': 8, 'deke slayton w': 1}, {'adams ben': 1, 'alan shepard': 5, 'alan shpard': 1, 'armstrong intro neil': 1}, {'alan shepard': 1, 'nan': 6}, {'armstrong neil': 7}]
Possible candidate: [{'armstrong deke neil slayton': 1, 'deke slayton': 113}, {'alan b shepard': 1, 'alan shepard': 90, 'alan shepherd': 5}]
lengthNecessary: 90.80000000000001


In [10]:
# Show the retained candidates, then the names derived from them by
# getFinalAuthors next to the ground-truth author string.
print(listCandidates)
print(
    "{0} VS true_author: {1}".format(
        getFinalAuthors(listCandidates),
        true_author,
    )
)

[{'armstrong deke neil slayton': 1, 'deke slayton': 113}, {'alan b shepard': 1, 'alan shepard': 90, 'alan shepherd': 5}]
['deke slayton', 'alan shepard'] VS true_author: Alan Shepard, Deke Slayton


# Testing 

## Attributes: ['author','title','big_cate', 'small_cate', 'publisher']

In [None]:
# Configuration for this experiment: same settings as the InferSent setup
# earlier in the notebook, with 'publisher' added to attributes_list and
# without the 'dimension_reduction' / 'num_components' entries.
key_values = {
    # Encoder settings.
    'model_type': 'bilstm',
    'char_level': False,
    'model_version': 2,
    'rnn_dim': 1024,

    'verbose': 1,

    # Record fields and embedding type.
    'attributes_list': ['author', 'title', 'big_cate', 'small_cate', 'publisher'],
    'embedding_type': 'inferSent',

    'dataset': 'merged_book-multiAuthors',

    # Clustering settings.
    'cluster_method': 'hierarchy',
    'num_clusters': 10,

    # Key spelling ("thresold") kept to match the key used elsewhere in this notebook.
    'block_length_thresold': 0.2,
}