In [44]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Make the ../../fusion directory importable from this notebook. The path is
# resolved relative to the current working directory at the time the cell runs.
# (os.path.join with a single argument returns it unchanged, so it is omitted.)
module_path = os.path.abspath('../../fusion')
# Guard the append so re-running this cell does not pile up duplicate entries.
if module_path not in sys.path:
    sys.path.append(module_path)

In [45]:
from preprocessing_datasets import load_dataset
from embedding_algorithms import sentence_embedding, set_embedding_model
from dimensionality_reduction_algorithms import dimension_reduction_algorithms
from cluster_algorithms import cluster_algorithm

from helper import load_by_index, get_author_candidate
from plot_tools import plotChart, plotCluster

In [46]:
# Run configuration, grouped here so all tunable options live in one place.
# NOTE(review): none of the cells visible in this section read this dict —
# confirm it is still needed.
key_values = {
    'model_type': 'bilstm',
    'char_level': False,
    'model_version': 2,
    'rnn_dim': 1024,
    'verbose': 1,
    'attributes_list': ['authors', 'title', 'big_cate', 'small_cate'],
    'embedding_type': 'inferSent',
    'dataset': 'merged_book',
    # This key used to be listed twice ('tsne' here, 'hierarchy' further down).
    # Python keeps only the last value of a repeated key, so 'hierarchy' was
    # always the effective setting; it is now stated once, in the same position.
    # NOTE(review): if 'tsne' was meant for 'dimension_reduction', set it there.
    'cluster_method': 'hierarchy',
    'dimension_reduction': 'pca',
    'num_components': 2,
    'num_clusters': 10,
}

In [47]:
# Load the merged books table. Every column is read as a string, which keeps
# identifiers such as ISBN_10 intact, including leading zeros (e.g. 0001361589).
data = pd.read_csv(
    '../../fusion/source_datasets/books/books_merged.csv',
    dtype='str',
)

In [48]:
# Keep only the columns used by the author-cleaning steps below.
# Later cells add columns to this frame, so take an explicit copy: the subset
# is then an independent DataFrame and column assignment cannot raise a
# chained-assignment (SettingWithCopy) warning.
data = data[
    ['ISBN_10', 'title', 'authors', 'big_cate', 'small_cate', 'publisher']
].copy()

In [49]:
# Preview the six selected columns; the rendered output shows only the first
# and last few rows of the frame.
data

Unnamed: 0,ISBN_10,title,authors,big_cate,small_cate,publisher
0,0001361589,Noddys Rainy Day Activity Book,"Blyton, Enid",childrens-fiction-young-adult,enid-blyton,HarperCollinsChildrensBooks
1,0001361589,Fun With Noddy Summer Activity Book. Noddy's T...,Enid Blyton,childrens-fiction-young-adult,enid-blyton,BBC Childrens Books
2,0001380451,Fun And Games With Paddington,"BOND, Michael",childrens-fiction-young-adult,childrens-series,Collins
3,0001380451,FUN AND GAMES WITH PADDINGTON,Michael Bond,childrens-fiction-young-adult,childrens-series,"Collins, London"
4,0001380451,Fun and Games with Paddington,"Bond, Michael",childrens-fiction-young-adult,childrens-series,Collins
...,...,...,...,...,...,...
2245344,9977473579,Butterfly in the City: A Good Life in Costa Rica,"Stuart, Jo",travel-books,general-travel,"Self published, San Jose, Costa Rica"
2245345,9979940417,Iceland : The New Millennium Series,"Isberg, Jon Olafur; Granz, Olafur [Editors]",travel-books,adventure-travel,
2245346,9984592774,Riga - Journey Through The Centuries,Andris Lamsters,travel-books,travel-guides,Madris
2245347,9986830494,"Lithuania: Nature, Traditions, Culture, Cities","Kairiene, A.",travel-books,exploration,R Paknis leidykla


In [50]:
from preprocessing_datasets.preprocessing_utilities import ValueUtils

In [51]:
# Store, per row, how many items ValueUtils.split_values extracts from the raw
# `authors` string.
def _author_count(raw_authors):
    """Return the number of items split_values yields for one `authors` cell."""
    return len(ValueUtils.split_values(raw_authors))

data['numberOfAuthors'] = data['authors'].map(_author_count)

In [52]:
# Check the new numberOfAuthors column against the raw `authors` strings
# (in the output, the "Isberg, Jon Olafur; Granz, Olafur [Editors]" row gets 2).
data

Unnamed: 0,ISBN_10,title,authors,big_cate,small_cate,publisher,numberOfAuthors
0,0001361589,Noddys Rainy Day Activity Book,"Blyton, Enid",childrens-fiction-young-adult,enid-blyton,HarperCollinsChildrensBooks,1
1,0001361589,Fun With Noddy Summer Activity Book. Noddy's T...,Enid Blyton,childrens-fiction-young-adult,enid-blyton,BBC Childrens Books,1
2,0001380451,Fun And Games With Paddington,"BOND, Michael",childrens-fiction-young-adult,childrens-series,Collins,1
3,0001380451,FUN AND GAMES WITH PADDINGTON,Michael Bond,childrens-fiction-young-adult,childrens-series,"Collins, London",1
4,0001380451,Fun and Games with Paddington,"Bond, Michael",childrens-fiction-young-adult,childrens-series,Collins,1
...,...,...,...,...,...,...,...
2245344,9977473579,Butterfly in the City: A Good Life in Costa Rica,"Stuart, Jo",travel-books,general-travel,"Self published, San Jose, Costa Rica",1
2245345,9979940417,Iceland : The New Millennium Series,"Isberg, Jon Olafur; Granz, Olafur [Editors]",travel-books,adventure-travel,,2
2245346,9984592774,Riga - Journey Through The Centuries,Andris Lamsters,travel-books,travel-guides,Madris,1
2245347,9986830494,"Lithuania: Nature, Traditions, Culture, Cities","Kairiene, A.",travel-books,exploration,R Paknis leidykla,1


In [53]:
# Distribution of authors-per-record: how many rows have 1, 2, 3, ... authors.
author_count_column = data['numberOfAuthors']
author_count_column.value_counts()

1     1925004
2      249063
3       50821
4       12674
5        4013
6        1613
7         657
8         428
9         422
10        216
12         89
11         80
15         65
14         52
13         45
16         30
17         19
18         11
19          9
20          8
23          8
21          7
22          6
24          3
28          2
26          2
27          1
33          1
Name: numberOfAuthors, dtype: int64

In [54]:
# Keep the list of individual author strings that ValueUtils.split_values
# produces for each raw `authors` value (one list per row). The function is
# passed to map directly; no wrapper lambda is needed.
data['dirtyAuthor'] = data['authors'].map(ValueUtils.split_values)

In [55]:
# Inspect the dirtyAuthor lists next to the raw `authors` strings they came from.
data

Unnamed: 0,ISBN_10,title,authors,big_cate,small_cate,publisher,numberOfAuthors,dirtyAuthor
0,0001361589,Noddys Rainy Day Activity Book,"Blyton, Enid",childrens-fiction-young-adult,enid-blyton,HarperCollinsChildrensBooks,1,"[blyton, enid]"
1,0001361589,Fun With Noddy Summer Activity Book. Noddy's T...,Enid Blyton,childrens-fiction-young-adult,enid-blyton,BBC Childrens Books,1,[enid blyton]
2,0001380451,Fun And Games With Paddington,"BOND, Michael",childrens-fiction-young-adult,childrens-series,Collins,1,"[bond, michael]"
3,0001380451,FUN AND GAMES WITH PADDINGTON,Michael Bond,childrens-fiction-young-adult,childrens-series,"Collins, London",1,[michael bond]
4,0001380451,Fun and Games with Paddington,"Bond, Michael",childrens-fiction-young-adult,childrens-series,Collins,1,"[bond, michael]"
...,...,...,...,...,...,...,...,...
2245344,9977473579,Butterfly in the City: A Good Life in Costa Rica,"Stuart, Jo",travel-books,general-travel,"Self published, San Jose, Costa Rica",1,"[stuart, jo]"
2245345,9979940417,Iceland : The New Millennium Series,"Isberg, Jon Olafur; Granz, Olafur [Editors]",travel-books,adventure-travel,,2,"[isberg, jon olafur, granz, olafur]"
2245346,9984592774,Riga - Journey Through The Centuries,Andris Lamsters,travel-books,travel-guides,Madris,1,[andris lamsters]
2245347,9986830494,"Lithuania: Nature, Traditions, Culture, Cities","Kairiene, A.",travel-books,exploration,R Paknis leidykla,1,"[kairiene, a.]"


In [56]:
# Turn each element of the dirtyAuthor lists into its own row; all other column
# values are repeated. The original index labels are kept, so a record with
# several authors appears under the same index label more than once.
data = data.explode(column='dirtyAuthor')

In [57]:
# After explode there is one row per (record, author); multi-author records
# repeat their index label (see 2245345 in the output).
data

Unnamed: 0,ISBN_10,title,authors,big_cate,small_cate,publisher,numberOfAuthors,dirtyAuthor
0,0001361589,Noddys Rainy Day Activity Book,"Blyton, Enid",childrens-fiction-young-adult,enid-blyton,HarperCollinsChildrensBooks,1,"blyton, enid"
1,0001361589,Fun With Noddy Summer Activity Book. Noddy's T...,Enid Blyton,childrens-fiction-young-adult,enid-blyton,BBC Childrens Books,1,enid blyton
2,0001380451,Fun And Games With Paddington,"BOND, Michael",childrens-fiction-young-adult,childrens-series,Collins,1,"bond, michael"
3,0001380451,FUN AND GAMES WITH PADDINGTON,Michael Bond,childrens-fiction-young-adult,childrens-series,"Collins, London",1,michael bond
4,0001380451,Fun and Games with Paddington,"Bond, Michael",childrens-fiction-young-adult,childrens-series,Collins,1,"bond, michael"
...,...,...,...,...,...,...,...,...
2245345,9979940417,Iceland : The New Millennium Series,"Isberg, Jon Olafur; Granz, Olafur [Editors]",travel-books,adventure-travel,,2,"isberg, jon olafur"
2245345,9979940417,Iceland : The New Millennium Series,"Isberg, Jon Olafur; Granz, Olafur [Editors]",travel-books,adventure-travel,,2,"granz, olafur"
2245346,9984592774,Riga - Journey Through The Centuries,Andris Lamsters,travel-books,travel-guides,Madris,1,andris lamsters
2245347,9986830494,"Lithuania: Nature, Traditions, Culture, Cities","Kairiene, A.",travel-books,exploration,R Paknis leidykla,1,"kairiene, a."


In [58]:
# Normalise every individual author string with ValueUtils.clean_value and keep
# the result in a new `author` column. The function is passed to map directly.
# NOTE(review): judging by the displayed output, clean_value strips punctuation
# and puts the name tokens in a canonical order — confirm against ValueUtils.
data['author'] = data['dirtyAuthor'].map(ValueUtils.clean_value)

In [59]:
# Compare dirtyAuthor with the cleaned `author` column: in this output both
# "blyton, enid" and "enid blyton" end up as "blyton enid".
data

Unnamed: 0,ISBN_10,title,authors,big_cate,small_cate,publisher,numberOfAuthors,dirtyAuthor,author
0,0001361589,Noddys Rainy Day Activity Book,"Blyton, Enid",childrens-fiction-young-adult,enid-blyton,HarperCollinsChildrensBooks,1,"blyton, enid",blyton enid
1,0001361589,Fun With Noddy Summer Activity Book. Noddy's T...,Enid Blyton,childrens-fiction-young-adult,enid-blyton,BBC Childrens Books,1,enid blyton,blyton enid
2,0001380451,Fun And Games With Paddington,"BOND, Michael",childrens-fiction-young-adult,childrens-series,Collins,1,"bond, michael",bond michael
3,0001380451,FUN AND GAMES WITH PADDINGTON,Michael Bond,childrens-fiction-young-adult,childrens-series,"Collins, London",1,michael bond,bond michael
4,0001380451,Fun and Games with Paddington,"Bond, Michael",childrens-fiction-young-adult,childrens-series,Collins,1,"bond, michael",bond michael
...,...,...,...,...,...,...,...,...,...
2245345,9979940417,Iceland : The New Millennium Series,"Isberg, Jon Olafur; Granz, Olafur [Editors]",travel-books,adventure-travel,,2,"isberg, jon olafur",isberg jon olafur
2245345,9979940417,Iceland : The New Millennium Series,"Isberg, Jon Olafur; Granz, Olafur [Editors]",travel-books,adventure-travel,,2,"granz, olafur",granz olafur
2245346,9984592774,Riga - Journey Through The Centuries,Andris Lamsters,travel-books,travel-guides,Madris,1,andris lamsters,andris lamsters
2245347,9986830494,"Lithuania: Nature, Traditions, Culture, Cities","Kairiene, A.",travel-books,exploration,R Paknis leidykla,1,"kairiene, a.",a kairiene


In [60]:
# Drop the helper columns (authors, numberOfAuthors, dirtyAuthor) and keep the
# cleaned `author` in the position the raw `authors` column used to occupy.
data = data[
    [
        'ISBN_10',
        'title',
        'author',
        'big_cate',
        'small_cate',
        'publisher',
    ]
]

In [61]:
# Final layout before export: one cleaned author per row, helper columns removed.
data

Unnamed: 0,ISBN_10,title,author,big_cate,small_cate,publisher
0,0001361589,Noddys Rainy Day Activity Book,blyton enid,childrens-fiction-young-adult,enid-blyton,HarperCollinsChildrensBooks
1,0001361589,Fun With Noddy Summer Activity Book. Noddy's T...,blyton enid,childrens-fiction-young-adult,enid-blyton,BBC Childrens Books
2,0001380451,Fun And Games With Paddington,bond michael,childrens-fiction-young-adult,childrens-series,Collins
3,0001380451,FUN AND GAMES WITH PADDINGTON,bond michael,childrens-fiction-young-adult,childrens-series,"Collins, London"
4,0001380451,Fun and Games with Paddington,bond michael,childrens-fiction-young-adult,childrens-series,Collins
...,...,...,...,...,...,...
2245345,9979940417,Iceland : The New Millennium Series,isberg jon olafur,travel-books,adventure-travel,
2245345,9979940417,Iceland : The New Millennium Series,granz olafur,travel-books,adventure-travel,
2245346,9984592774,Riga - Journey Through The Centuries,andris lamsters,travel-books,travel-guides,Madris
2245347,9986830494,"Lithuania: Nature, Traditions, Culture, Cities",a kairiene,travel-books,exploration,R Paknis leidykla


In [62]:
# Write the cleaned, one-author-per-row table to the current working directory.
# index=False leaves the index labels (repeated after explode) out of the file.
data.to_csv(
    path_or_buf='books_merged_cleaned.csv',
    index=False,
)