## Data set analysis

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import itertools
import math
import seaborn as sns
import data_preprocessing as proc
import visualisations as vis
import networkx as nx
from igraph import *
# Disable output truncation: NumPy prints every array element and pandas
# shows all rows, all columns and the full content of each cell.
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Keep only the shelves (tags) that are used at least 200 times in the whole data set.
limit_of_tag_frequency = 200
df, shelves = proc.get_all_data(limit_of_tag_frequency)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# First rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# Reload the data with a lower tag-frequency limit (50).
# NOTE: this rebinds the notebook-wide `df` and `shelves` that were loaded
# earlier with a limit of 200.
df, shelves = proc.get_all_data(50)

# `shelves` maps each book to its tags; flatten the values into one list that
# keeps duplicates, so that the total number of tag usages can be counted.
all_tags = [tag for tags in shelves.values() for tag in tags]
unique_tags = set(all_tags)

print(unique_tags)
print(f"how many books: {len(shelves)}")
print(f"how many usages of all tags [with duplicates]: {len(all_tags)}")
print(f"how many unique tags: {len(unique_tags)}")

In [None]:
# Draw the word cloud for the currently loaded shelves.
vis.show_wordcloud(shelves)

In [None]:
# Transpose the frame's rows so that each entry of `columns` holds the values of one column.
data_arr = df.values
columns = list(zip(*data_arr))

# Columns are picked by their position counted from the end of the frame.
# NOTE(review): presumably -1 is the author, -5 the year, -4 the rating,
# -3 the page count and -2 the popularity -- confirm against the column order
# returned by proc.get_all_data.
authors = columns[-1]
years = columns[-5]
# Entries equal to the string "None" are dropped before discretising.
# NOTE(review): dropping them may leave years_disc / pages with fewer values
# than df has rows; a later cell substitutes -1 instead, which keeps one value
# per row -- confirm which form vis.visualise_binomials expects.
years_disc = vis.discretize_data([int(i) for i in columns[-5] if i != "None"])
pages= vis.discretize_data([int(i) for i in columns[-3] if i != "None"])
rates = vis.discretize_data(columns[-4])
popularity= vis.discretize_data(columns[-2], strategy = "quantile")

In [None]:
# Layout names, in the order of the integer keys used by the parameter cells
# to pick one (mylayout[0] is "circular", mylayout[5] is "Bipartite").
_LAYOUT_NAMES = (
    "circular",   # 0: circular layout
    "fr",         # 1: layout_fruchterman_reingold
    "grid_fr",    # 2: layout_grid_fruchterman_reingold
    "kk",         # 3: layout_kamada_kawai
    "lgl",        # 4: layout_lgl
    "Bipartite",  # 5: Bipartite layout - only for bipartite graphs
)
mylayout = dict(enumerate(_LAYOUT_NAMES))

## Visualisations

For the visualisations we used the igraph library.
Below we present a graph in which an edge means that the two connected books have the same author.
<br>Parameters:
- degree -> Minimum degree a node must have to be shown in the graph. It corresponds to the minimum number of books written by a single author.
- use_layout -> Layout that will be used to visualise the graph (best: 0 - circular, 2 - grid_fruchterman_reingold)

In [None]:
# Parameters
degree = 12  # minimum node degree for a node to be drawn
use_layout = mylayout[0]  # "circular"

In [None]:
# Build the same-author graph together with the keyword arguments for plot(), then render it.
g, visual_style = vis.visualise_normal_graph(authors, degree, use_layout)
plot(g, **visual_style)

Another way to visualise this is to add the authors to the graph as nodes and then create a bipartite network in which each book is connected to one author.
<br>Parameters:
- how_many -> How many top authors should be visualised.
- use_layout -> Layout that will be used to visualise the graph (best: 1 - fruchterman_reingold, 5 - Bipartite)

In [None]:
# Parameters
how_many = 10  # number of top authors to draw
use_layout = mylayout[1]  # "fr"

In [None]:
# Build the book/author graph for the selected number of authors and render it.
g, visual_style = vis.visualise_binomials(df,authors,"AUTHOR", how_many, use_layout)
plot(g, **visual_style)

The visualisation below shows when the books were published.
<br>Parameters:
- how_many -> How many of the most frequent years should be visualised
- use_layout -> Layout that will be used to visualise the graph (best: 1 - fruchterman_reingold, 5 - Bipartite)
- discretized -> Decides whether the year should be discretized into 5 bins (bin edges: 1888, 1913.8, 1939.6, 1965.4, 1991.2, 2017)

In [None]:
# Parameters
how_many = 30  # number of most frequent years to draw
use_layout = mylayout[1]  # "fr"
discretized = True  # True: use the binned years (years_disc); False: use the raw years

In [None]:
# Choose the year data and the string passed as third argument according to
# `discretized` ("None" goes with the binned years, "YEAR" with the raw ones),
# then build and render the graph.
if discretized:
    year_values, year_key = years_disc, "None"
else:
    year_values, year_key = years, "YEAR"

g, visual_style = vis.visualise_binomials(df, year_values, year_key, how_many, use_layout)
plot(g, **visual_style)

This visualisation shows the relation between tags and books.
<br>Parameters:
- tag_limit -> sets the minimal frequency a tag must have in the entire data set to be considered as a node
- how_many_books -> sets the number of books to be presented on the graph
- use_layout -> Layout that will be used to visualise the graph (best: 1 - fruchterman_reingold, 5 - Bipartite)

In [None]:
# Parameters
tag_limit = 100  # minimal tag frequency for a tag to become a node
how_many_books = 50  # number of books to draw
use_layout = mylayout[5]  # "Bipartite"

In [None]:
# Build the tag/book graph and render it.
g, visual_style = vis.visualise_tags_binomial(tag_limit, use_layout, how_many_books)
plot(g, **visual_style)

The visualisation below shows how the books were rated (1, 2, 3, 4, 5).
<br>Parameters:
- use_layout -> Layout that will be used to visualise the graph (best: 1 - fruchterman_reingold, 5 - Bipartite)

In [None]:
# Parameters
use_layout = mylayout[1]  # "fr"

In [None]:
# Build the book/rating graph from the discretised ratings and render it.
# NOTE(review): "None" and -1 are passed where other cells pass a column name
# and a node limit -- presumably "no column" / "no limit"; confirm in
# visualisations.visualise_binomials.
g, visual_style = vis.visualise_binomials(df,rates,"None", -1, use_layout)
plot(g, **visual_style)

The visualisation below shows the popularity of the books. Popularity is discretized into the following bins: <br>[0, 5, 20, 83, 613, 1538774]
<br>Parameters:
- use_layout -> Layout that will be used to visualise the graph (best: 1 - fruchterman_reingold, 5 - Bipartite)

In [None]:
# Parameters
use_layout = mylayout[5]  # "Bipartite"

In [None]:
# Build the book/popularity graph from the quantile-binned popularity and render it.
# NOTE(review): "None" and -1 are passed where other cells pass a column name
# and a node limit -- presumably "no column" / "no limit"; confirm in
# visualisations.visualise_binomials.
g, visual_style = vis.visualise_binomials(df,popularity,"None", -1, use_layout)
plot(g, **visual_style)

The visualisation below shows the size of the books. The number of pages is discretized into the following bins: <br>[   0. ,  566.4, 1132.8, 1699.2, 2265.6, 2832. ]
<br>Parameters:
- use_layout -> Layout that will be used to visualise the graph (best: 1 - fruchterman_reingold, 2 - grid_fruchterman_reingold, 5 - Bipartite)

In [None]:
# Parameters
use_layout = mylayout[2]  # "grid_fr"

In [None]:
# Build the book/page-count graph from the discretised page counts and render it.
# NOTE(review): "None" and -1 are passed where other cells pass a column name
# and a node limit -- presumably "no column" / "no limit"; confirm in
# visualisations.visualise_binomials.
g, visual_style = vis.visualise_binomials(df,pages,"None", -1, use_layout)

plot(g, **visual_style)

In [None]:
def _int_or_missing(raw, missing=-1):
    """Return ``int(raw)``, or ``missing`` when ``raw`` is the string "None"."""
    return missing if raw == "None" else int(raw)


# Per-book inputs for the similarity graph. Unlike the earlier extraction cell,
# "None" entries are replaced by -1 instead of being dropped, so every row of
# df keeps a value.
# NOTE: `years` is rebound here from the raw year column to its discretised form.
authors = columns[-1]
titles = list(columns[-6])
ids = list(columns[0])
size = df.shape[0]

years = vis.discretize_data([_int_or_missing(year) for year in columns[-5]])
pages = vis.discretize_data([_int_or_missing(count) for count in columns[-3]])
rates = vis.discretize_data(columns[-4])
popularity = vis.discretize_data(columns[-2], strategy="quantile")

# One feature matrix per attribute, built in the same order as before.
matrix_authors, matrix_years, matrix_pages, matrix_rates, matrix_popularity = (
    vis.make_feature_matrix(size, feature)
    for feature in (authors, years, pages, rates, popularity)
)



In [None]:
# Hand-picked book titles for the "list of books" mode of the similarity graph.
# Of these lists only tlist2 is referenced in the visible cells (as
# custom_title_list); tlist1, tlist2 and tlist3 are reassigned later by the
# interactive widget cell.
list1 = ["The Old Man and the Sea", "Misery", "It's Not about the Bike: My Journey Back to Life",
       "The Universe in a Single Atom: The Convergence of Science and Spirituality", 
        "The Seven Habits of Highly Effective People", "Poems New and Collected", "Trump: How to Get Rich",
        "The Odyssey","The Complete Chronicles of Narnia", "Hamlet (Norton Critical Edition)",
        "The History of the Lord of the Rings (The History of Middle-earth #6-9)", "The Iliad", 
        "The Sorrows of Young Werther / Die Leiden Des Jungen Werther",'A Briefer History of Time']


list2 = ['Rose Madder','The Eyes of the Dragon', 'Eleven Minutes', 'Twelfth Night',
'The Old Man and the Sea','Misery','A Briefer History of Time','The Universe in a Nutshell',
 "It's Not about the Bike: My Journey Back to Life",
 "The Universe in a Single Atom: The Convergence of Science and Spirituality",
 "In the Name of Jesus: Reflections on Christian Leadership","The Innocents Abroad",
 "To Kill a Mockingbird",
 "Harry Potter Schoolbooks Box Set: Two Classic Books from the Library of Hogwarts School of Witchcraft and Wizardry",
 "Trump: The Art of the Deal", "Dracula",
 "Preface to the Presidency: Selected Speeches of Bill Clinton 1974-1992",
 "Lysis/Phaedrus/Symposium: Plato on Homosexuality", "Poems New and Collected"]

tlist1 = ['Rose Madder','The Eyes of the Dragon', 'Eleven Minutes',
          "Lysis/Phaedrus/Symposium: Plato on Homosexuality", "The Complete Chronicles of Narnia",
         "Harry Potter Schoolbooks Box Set: Two Classic Books from the Library of Hogwarts School of Witchcraft and Wizardry"]

tlist2 = ['Rose Madder','The Eyes of the Dragon','Misery',
          'A Briefer History of Time','The Universe in a Nutshell',
          "Dracula" ]

tlist3 = ["The Universe in a Single Atom: The Convergence of Science and Spirituality",
 "In the Name of Jesus: Reflections on Christian Leadership",
          'A Briefer History of Time','The Universe in a Nutshell',
          'Twelfth Night', "Poems New and Collected",
          'Rose Madder','The Eyes of the Dragon','Misery'
         ]

tlist11 = ['Rose Madder','The Eyes of the Dragon','The Old Man and the Sea','Misery',"To Kill a Mockingbird",
              "Trump: The Art of the Deal", "Dracula","Preface to the Presidency: Selected Speeches of Bill Clinton 1974-1992",]

tlist22 = ['A Briefer History of Time','The Universe in a Nutshell', 
               "In the Name of Jesus: Reflections on Christian Leadership",
              "Poems New and Collected", 'Twelfth Night',
              "Harry Potter Schoolbooks Box Set: Two Classic Books from the Library of Hogwarts School of Witchcraft and Wizardry",
              "It's Not about the Bike: My Journey Back to Life", 
              "The Universe in a Single Atom: The Convergence of Science and Spirituality", "Misery"]

This graph shows the correlation between books. All nodes are books and every pair of nodes is connected by an edge (complete graph). A darker and wider edge means a bigger correlation between the books; a thin and bright edge means a small correlation.
- tag_limit -> sets the minimal frequency a tag must have in the entire data set to be considered as a node
- how_many -> how many books should be presented on the graph [random mode and nearest books mode only]
- use_layout -> Layout that will be used to visualise the graph [best: 0]
- scal -> sets the weight of each attribute that may be common to two books: author, publication year, book size, book rating, book popularity, shelves
- custom_title_list -> list of books mode - list of books to show on the graph [if None, random mode or nearest books mode is used]
- find_best_for -> nearest books mode - give a book title and a graph containing the books most similar to that title will be generated [if None, random mode or list of books mode is used]

In [None]:
# Parameters
tag_limit = 50  # minimal tag frequency for a tag to be taken into account
how_many = 10  # number of books to draw (random / nearest books mode)
use_layout = mylayout[0]  # "circular"
# Weight of each attribute in the similarity between two books.
scal = {"auth":1, "years":0, "pages":0, "rates":0, "popularity":0, "shelves":5}
# List of books mode: draw exactly these titles (uncomment the next line to switch it off).
custom_title_list = tlist2
#custom_title_list = None
# Nearest books mode: uncomment the title line (and drop the None line) to draw the books most similar to that title.
#find_best_for ="A Briefer History of Time"
find_best_for = None

In [None]:
# Reload the data with the tag limit chosen in the parameter cell and show the
# resulting word cloud. NOTE: this rebinds the notebook-wide `df` and `shelves`.
df, shelves = proc.get_all_data(tag_limit)
vis.show_wordcloud(shelves)

In [None]:
# Build the book-similarity graph from the parameters and the precomputed
# feature matrices, then render it.
g, visual_style = vis.visualuse_similarity(use_layout, how_many, tag_limit, ids, scal, titles, authors,
                         matrix_authors, matrix_years, matrix_pages,
                         matrix_rates, matrix_popularity, title_list=custom_title_list, find_best=find_best_for)

plot(g, **visual_style)

In [None]:
# Labels of the vertices of the graph drawn last.
print(g.vs['label'])
# Scratch lookups kept for debugging:
#print(titles.index('A Briefer History of Time'))
#df.loc[df["TITLE"] == 'A Briefer History of Time']
#df.sort_values('YEAR')[:1000]

In [None]:
# Rows ordered by ascending POPULARITY; the [:-1] slice leaves out the last row
# of the sorted frame.
# NOTE(review): if the intent was "most popular first", that would be [::-1]
# or ascending=False -- confirm.
df.sort_values('POPULARITY')[:-1]

In [44]:

from ipywidgets import interact, fixed, FloatSlider, IntSlider, interact_manual, VBox, HBox, interactive
import ipywidgets as widgets
from IPython.display import display, clear_output, Image
import data_preprocessing as proc
import visualisations as vis
from igraph import * 
import warnings
warnings.filterwarnings('ignore')


# Widgets of the interactive similarity explorer.
# NOTE: this cell rebinds the notebook globals `tag_limit`, `how_many`, `pages`,
# `rates`, `popularity`, `shelves`, `scal`, `tlist1`, `tlist2` and `tlist3`,
# which earlier cells use for data and parameters. Re-run those cells before
# relying on the old values again.

# Free-text input: one title, or several titles separated by commas.
text = widgets.Text(description='Books:')
display(text)

# Predefined title lists; with 'None' selected the text box is used instead.
select = widgets.Dropdown(
    options=['None', 'List_1', 'List_2', 'List_3'],
    value='None',
    description='Selected list:',
)
display(select)

# One weight slider per attribute that two books may have in common.
auth = FloatSlider(min=0, max=10, value=1, description="Author: ")
year = FloatSlider(min=0, max=10, value=1, description="Publication year: ")
pages = FloatSlider(min=0, max=10, value=0.4, description="Number of pages: ")
# This slider was labelled "Author: " (copy-paste error); it weights the rating.
rates = FloatSlider(min=0, max=10, value=2, description="Rating: ")
popularity = FloatSlider(min=0, max=10, value=0, description="Popularity: ")
shelves = FloatSlider(min=0, max=10, value=4, description="Subject: ")

# Tag-frequency limit and number of neighbouring books to draw.
tag_limit = IntSlider(min=1, max=300, value=20, description="Tags:")
how_many = IntSlider(min=1, max=300, value=2, description="Neighbors:")

# All sliders in one row; built here but not displayed by this cell.
items = HBox([auth, year, pages, rates, popularity, shelves, tag_limit, how_many])

# Title lists offered by the dropdown: 'List_1' -> tlist1, 'List_2' -> tlist2,
# 'List_3' -> tlist3.
tlist3 = [
    'Rose Madder', 'The Eyes of the Dragon', 'Eleven Minutes', 'Twelfth Night',
    'The Old Man and the Sea', 'Misery', 'A Briefer History of Time', 'The Universe in a Nutshell',
    "It's Not about the Bike: My Journey Back to Life",
    "The Universe in a Single Atom: The Convergence of Science and Spirituality",
    "In the Name of Jesus: Reflections on Christian Leadership", "The Innocents Abroad",
    "To Kill a Mockingbird",
    "Harry Potter Schoolbooks Box Set: Two Classic Books from the Library of Hogwarts School of Witchcraft and Wizardry",
    "Trump: The Art of the Deal", "Dracula",
    "Preface to the Presidency: Selected Speeches of Bill Clinton 1974-1992",
    "Lysis/Phaedrus/Symposium: Plato on Homosexuality", "Poems New and Collected",
]

tlist1 = [
    'Rose Madder', 'The Eyes of the Dragon', 'The Old Man and the Sea', 'Misery', "To Kill a Mockingbird",
    "Trump: The Art of the Deal", "Dracula",
    "Preface to the Presidency: Selected Speeches of Bill Clinton 1974-1992",
]

tlist2 = [
    'A Briefer History of Time', 'The Universe in a Nutshell',
    "In the Name of Jesus: Reflections on Christian Leadership",
    "Poems New and Collected", 'Twelfth Night',
    "Harry Potter Schoolbooks Box Set: Two Classic Books from the Library of Hogwarts School of Witchcraft and Wizardry",
    "It's Not about the Bike: My Journey Back to Life",
    "The Universe in a Single Atom: The Convergence of Science and Spirituality", "Misery",
]

# Default weights; update_graph does not read this, it builds its own weights
# from the slider values.
scal = {"auth":1, "years":1, "pages":1, "rates":1, "popularity":1, "shelves":1}

@interact_manual(auth=auth, year=year, pages=pages, rates=rates, popularity=popularity, shelves=shelves, tag_limit=tag_limit, how_many=how_many)
def update_graph(auth, year, pages, rates, popularity, shelves, tag_limit, how_many):
    """Redraw the book-similarity graph from the current widget values.

    The slider values arrive as arguments. The books to draw come from the
    ``select`` dropdown or, when it is set to 'None', from the ``text`` box:
    a single title is passed as ``find_best`` (and as a one-element title
    list), while comma-separated titles are passed as the title list only.

    Args:
        auth, year, pages, rates, popularity, shelves: weight of the
            corresponding attribute in the similarity between two books.
        tag_limit: value of the "Tags:" slider, forwarded unchanged.
        how_many: value of the "Neighbors:" slider, forwarded unchanged.

    Returns:
        An IPython ``Image`` of the rendered graph, which is also written to
        'graph.png' in the working directory.
    """
    scal = {"auth":auth, "years":year, "pages":pages, "rates":rates, "popularity":popularity, "shelves":shelves}

    if select.value == "None":
        # Strip surrounding whitespace so that "Misery, Dracula" yields the
        # title "Dracula" rather than " Dracula", which matches no book.
        tlist = [title.strip() for title in text.value.split(",")]
        # A single title (no comma) selects the nearest books mode.
        find_best_for = tlist[0] if len(tlist) == 1 else None
    else:
        find_best_for = None
        # Direct lookup: an unexpected dropdown value fails here with a
        # KeyError instead of leaving `tlist` unbound.
        tlist = {"List_1": tlist1, "List_2": tlist2, "List_3": tlist3}[select.value]

    g, visual_style = vis.visualuse_similarity("circular", how_many, tag_limit, ids, scal, titles, authors,
                         matrix_authors, matrix_years, matrix_pages,
                         matrix_rates, matrix_popularity, title_list=tlist, find_best=find_best_for)
    # Save the plot to disk and hand the file back as the cell output.
    p = plot(g, **visual_style)
    p.save('graph.png')
    return Image(filename='graph.png')
    


Text(value='', description='Books:')

Dropdown(description='Selected list:', options=('None', 'List_1', 'List_2', 'List_3'), value='None')

interactive(children=(FloatSlider(value=1.0, description='Author: ', max=10.0), FloatSlider(value=1.0, descrip…