## Data set analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import itertools
import math
import seaborn as sns
import data_preprocessing as proc
import visualisations as vis
import networkx as nx
from igraph import *
# Never truncate printed numpy arrays.
np.set_printoptions(threshold=sys.maxsize)

# Lift every pandas display limit (rows, columns, total width, cell width).
for _option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_option, None)

In [None]:
# Keep only 'shelves' (tags) that are used at least 200 times in the whole data set.
# proc.get_all_data returns two objects: `df` (presumably the table of books) and
# `shelves`, a mapping that is later iterated as book -> list of tags.
limit_of_tag_frequency = 200
df, shelves = proc.get_all_data(limit_of_tag_frequency)

In [None]:
# Print a concise summary of df (assumes df is a pandas DataFrame - TODO confirm against proc.get_all_data).
df.info()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Descriptive statistics for all columns, not only the numeric ones (include='all').
df.describe(include='all')

In [None]:
# Flatten the per-book tag collections into one list of tag usages (duplicates kept).
all_tags = list(itertools.chain.from_iterable(shelves.values()))

book_count = len(shelves)
usage_count = len(all_tags)
unique_tag_count = len(set(all_tags))

print(f"how many books: {book_count}")
print(f"how many usages of all tags [with duplicates]: {usage_count}")
print(f"how many unique tags: {unique_tag_count}")

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize_data(data, n_bins=5, strategy="uniform"):
    """Bin a one-dimensional sequence of numbers into ordinal categories.

    The computed bin edges are printed as a side effect.

    Args:
        data: Iterable of numeric values, one value per sample.
        n_bins: Number of bins requested from ``KBinsDiscretizer``
            (defaults to 5, the value previously hard-coded).
        strategy: Binning strategy forwarded to ``KBinsDiscretizer``:
            ``"uniform"``, ``"quantile"`` or ``"kmeans"``
            (defaults to ``"uniform"``, the value previously hard-coded).

    Returns:
        A list with one 1-based bin index (``int``) per input value, in
        input order. An empty input yields an empty list.
    """
    # The discretizer expects a 2-D input: one single-feature row per sample.
    samples = [[value] for value in data]
    if not samples:
        # Nothing to bin; the discretizer cannot be fitted on zero samples.
        return []

    enc = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
    grid_encoded = enc.fit_transform(samples)
    print(enc.bin_edges_)
    # Ordinal codes start at 0; shift them so that bins are numbered from 1.
    return [int(code) + 1 for row in grid_encoded for code in row]

In [None]:
def _known_ints(raw_values):
    """Convert the entries to int, skipping the "None" placeholder entries."""
    return [int(entry) for entry in raw_values if entry != "None"]


# Transpose the row-wise values of df into one tuple per column.
data_arr = df.values
columns = list(zip(*data_arr))

authors = columns[-1]
years = columns[-5]
# NOTE(review): skipping the "None" entries makes years_disc and pages shorter
# than the number of rows whenever such entries exist, so their positions may
# no longer line up with the rows of df - confirm this is what
# vis.visualise_binomials expects.
years_disc = discretize_data(_known_ints(years))
pages = discretize_data(_known_ints(columns[-3]))
rates = discretize_data(columns[-4])
popularity = discretize_data(columns[-2])

In [None]:
# Layout names selectable by a short numeric index in the parameter cells below.
_LAYOUT_NAMES = (
    "circular",   # circular layout
    "fr",         # layout_fruchterman_reingold
    "grid_fr",    # layout_grid_fruchterman_reingold
    "kk",         # layout_kamada_kawai
    "lgl",        # layout_lgl
    "Bipartite",  # Bipartite layout - only for Bipartites
)
mylayout = dict(enumerate(_LAYOUT_NAMES))

## Visualisations

For the visualisations we used the igraph library.
Below we present a graph in which an edge means that the two connected books have the same author.

In [None]:
#Parameters
#degree -> Node degree threshold: only nodes with a degree strictly greater than this value are shown in the graph.
#          It corresponds to the minimum number of books written by a single author.
#use_layout -> Layout that will be used to visualise the graph [best 0,2 - circular, grid_fruchterman_reingold]
degree = 7
use_layout = mylayout[0]

In [None]:
size = len(authors)
unique_authors = list(set(authors))
matrix = vis.make_feature_matrix(size, authors)

# Every positive matrix entry becomes an edge between two books.
adjacency = (matrix > 0).tolist()
g = Graph.Adjacency(adjacency, mode=ADJ_MAX)

# Keep only the vertices whose degree exceeds the 'degree' parameter.
to_delete_ids = [vertex.index for vertex in g.vs if vertex.degree() <= degree]
g.delete_vertices(to_delete_ids)

visual_style = vis.visualise_gaph(g, use_layout)
plot(g, **visual_style)

Another way to visualise this is to add the authors to the graph as nodes and then create a bipartite network in which each book is connected to one author.

In [None]:
#Parameters
#how_many -> How many of the top authors should be visualised.
#use_layout -> Layout that will be used to visualise the graph [best 1 - fruchterman_reingold, 5 - Bipartite]
how_many = 10
use_layout = mylayout[1]

In [None]:
# Build the book/author graph for the `how_many` top authors and draw it.
g, visual_style = vis.visualise_binomials(df,authors,"AUTHOR", how_many, use_layout)

plot(g, **visual_style)

In [None]:
#Parameters
#how_many -> How many of the top years should be visualised.
#use_layout -> Layout that will be used to visualise the graph [best 1 - fruchterman_reingold, 5 - Bipartite]
how_many = 50
use_layout = mylayout[1]

In [None]:
# Build the book/year graph for the `how_many` top years (raw, non-discretised `years` values) and draw it.
g, visual_style = vis.visualise_binomials(df,years,"YEAR", how_many, use_layout)

plot(g, **visual_style)

In [None]:
#Parameters
#use_layout -> Layout that will be used to visualise the graph
#              [best 1 - fruchterman_reingold, 2 - grid_fruchterman_reingold, 5 - Bipartite]
use_layout = mylayout[2]

In [None]:
# Draw the graph built from the discretised years (years_disc).
# The column name "None" and the count -1 are passed through unchanged; presumably they
# mean "no named column / no top-N limit" - TODO confirm in vis.visualise_binomials.
g, visual_style = vis.visualise_binomials(df,years_disc,"None", -1, use_layout)

plot(g, **visual_style)

In [None]:
#Parameters
#how_many -> How many top entries should be visualised.
#            NOTE(review): this text used to say "top years", but the section works on
#            books and their shelves (tags) - confirm what the value is meant to limit.
#use_layout -> Layout that will be used to visualise the graph [best 1 - fruchterman_reingold, 5 - Bipartite]
how_many = 10
use_layout = mylayout[5]
# Reload the data with a tag frequency limit of 1000 instead of 200.
# This rebinds df and shelves; variables derived from the earlier load
# (e.g. columns, authors, size) are not recomputed here.
df, shelves = proc.get_all_data(1000)

In [None]:
# Build and draw a bipartite graph that links books to their shelves (tags).

# Book identifiers (the keys of `shelves`) and the flat list of all tag usages.
all_keys = list(shelves)
all_tags = list(itertools.chain.from_iterable(shelves.values()))

# Vertex numbering: books take the ids 0..n_books-1, the unique tags follow.
# The book count comes from `shelves` itself rather than from the global
# `size`, which was computed from an earlier data load and may not match the
# data that is being plotted here.
n_books = len(all_keys)
unique = list(set(all_tags))
dic_key_unique = {book: idx for idx, book in enumerate(all_keys)}
dic_val_unique = {tag: idx + n_books for idx, tag in enumerate(unique)}

# Connect only the first `limit` books to their tags so that the plot stays
# readable. The limit is taken from the `how_many` parameter (10 by default)
# and exactly `limit` books are processed; the previous counter-based loop
# processed one book more than the limit.
limit = how_many
edges = [
    (dic_key_unique[key], dic_val_unique[tag])
    for key, tags in itertools.islice(shelves.items(), limit)
    for tag in tags
]

# Vertex type 0 marks a book, vertex type 1 marks a tag.
g = Graph.Bipartite([0] * n_books + [1] * len(unique), edges)

# Only the tag vertices are labelled; book vertices get an empty label.
labels = [""] * n_books + unique
g.vs["label"] = labels

# Drop isolated vertices: books that were not processed and tags without any edge.
to_delete_ids = [v.index for v in g.vs if v.degree() == 0]
g.delete_vertices(to_delete_ids)

if use_layout == "Bipartite":
    # "Bipartite" is handled here by computing the layout on the graph itself.
    visual_style = vis.visualise_gaph(g)
    visual_style["layout"] = g.layout_bipartite()
else:
    visual_style = vis.visualise_gaph(g, use_layout)

# Show the last remaining vertex as a quick sanity check of the labels.
seq = g.vs
print([seq[-1]])
plot(g, **visual_style)

In [None]:
#Parameters
#use_layout -> Layout that will be used to visualise the graph [best 1 - fruchterman_reingold, 5 - Bipartite]
use_layout = mylayout[1]

In [None]:
# Draw the graph built from the discretised ratings (rates).
# NOTE(review): `rates` was derived from the data loaded with the 200-usage limit, while df
# has since been reloaded with a limit of 1000 - confirm that both contain the same rows.
g, visual_style = vis.visualise_binomials(df,rates,"None", -1, use_layout)

plot(g, **visual_style)

In [None]:
#Parameters
#use_layout -> Layout that will be used to visualise the graph
#              [best 1 - fruchterman_reingold, 2 - grid_fruchterman_reingold, 5 - Bipartite]
use_layout = mylayout[2]

In [None]:
# Draw the graph built from the discretised popularity values.
# NOTE(review): `popularity` was derived from the data loaded with the 200-usage limit, while df
# has since been reloaded with a limit of 1000 - confirm that both contain the same rows.
g, visual_style = vis.visualise_binomials(df,popularity,"None", -1, use_layout)

plot(g, **visual_style)

In [None]:
#Parameters
#use_layout -> Layout that will be used to visualise the graph
#              [best 1 - fruchterman_reingold, 2 - grid_fruchterman_reingold, 5 - Bipartite]
use_layout = mylayout[2]

In [None]:
# Draw the graph built from the discretised page counts (pages).
# NOTE(review): `pages` was built after dropping "None" entries and from the data loaded with
# the 200-usage limit - confirm that it still lines up with the rows of the current df.
g, visual_style = vis.visualise_binomials(df,pages,"None", -1, use_layout)

plot(g, **visual_style)

In [None]:
#Parameters
#degree -> Node degree threshold: only nodes with a degree strictly greater than this value are kept in the graph.
#          NOTE(review): the graph below combines several feature matrices (authors, years, pages,
#          rates, popularity), so the degree no longer corresponds only to books by a single author.
#use_layout -> Layout that will be used to visualise the graph [best 0,2 - circular, grid_fruchterman_reingold]
degree = 7
use_layout = mylayout[0]

In [None]:
def _ints_with_sentinel(raw_values):
    """Convert the entries to int, mapping the "None" placeholder to -1."""
    return [-1 if entry == "None" else int(entry) for entry in raw_values]


# Unlike the earlier extraction, missing entries are kept (as -1) so that every
# list has one value per entry of its column.
authors = columns[-1]
years = discretize_data(_ints_with_sentinel(columns[-5]))
pages = discretize_data(_ints_with_sentinel(columns[-3]))
rates = discretize_data(columns[-4])
popularity = discretize_data(columns[-2])

# One matrix per feature, each built for `size` books.
size = df.shape[0]
matrix_authors = vis.make_feature_matrix(size, authors)
matrix_years = vis.make_feature_matrix(size, years)
matrix_pages = vis.make_feature_matrix(size, pages)
matrix_rates = vis.make_feature_matrix(size, rates)
matrix_popularity = vis.make_feature_matrix(size, popularity)

In [None]:
# Add the per-feature matrices together, left to right, without modifying any of them.
feature_matrices = [
    matrix_authors,
    matrix_years,
    matrix_pages,
    matrix_rates,
    matrix_popularity,
]
matrix = feature_matrices[0]
for extra_matrix in feature_matrices[1:]:
    matrix = matrix + extra_matrix

print(matrix_authors.shape)
print(matrix.shape)

# Every positive entry of the combined matrix becomes an edge between two books.
adjacency = (matrix > 0).tolist()
g = Graph.Adjacency(adjacency, mode=ADJ_MAX)

# Keep only the vertices whose degree exceeds the 'degree' parameter.
to_delete_ids = [vertex.index for vertex in g.vs if vertex.degree() <= degree]
g.delete_vertices(to_delete_ids)

visual_style = vis.visualise_gaph(g, use_layout)
# Drawing is left disabled for this graph:
#plot(g, **visual_style)