<a href="https://colab.research.google.com/github/JohannPalheiros/gutendex-analytics/blob/main/gutendex_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#%% Imports and constants
import requests as req
import numpy as np
import pandas as pd
# NOTE(review): this binds the top-level matplotlib package (not matplotlib.pyplot)
# to the name `plt` — confirm this is intended before any plotting call relies on it.
import matplotlib as plt
import json as js
import math

# Encoding used to decode the raw response bytes before they are parsed as JSON.
decodeUtf8Str = 'utf-8'
# Base endpoint of the Gutendex books listing.
gutendexBooksUrl = 'https://gutendex.com/books/'
# Query-string prefix appended to the base URL, followed by the page number.
gutendexPageParam = '?page='

In [None]:
#%% Request to check how many books are available
# Fail fast instead of hanging or parsing an error page: the timeout bounds the
# wait and raise_for_status() turns an HTTP 4xx/5xx answer into an exception.
firstPageResponse = req.get(gutendexBooksUrl, timeout=30)
firstPageResponse.raise_for_status()
booksDataBytes = firstPageResponse.content
allBooksReqData = js.loads(booksDataBytes.decode(decodeUtf8Str))
# 'count' is the total reported by the API; 'results' holds the books of this page.
allBooksCount = allBooksReqData['count']
# The size of this first page is used later as the page size for the sample math.
totalItensPerPage = len(allBooksReqData['results'])

print('Total de todos os livros: ', allBooksCount)
print('Total de livros por pagina: ', totalItensPerPage)

Total de todos os livros:  72007
Total de livros por pagina:  32


In [None]:
#%% Taking only a 1% sample of the total number of books
# Sample size: 1% of the catalogue, rounded to a whole number of books.
onePercentOfAllBooks = round(0.01 * allBooksCount)
# Pages needed to cover the sample; a partially used page still costs one request,
# hence the ceiling.
pagesNeededForSample = onePercentOfAllBooks / totalItensPerPage
totalRequestsToOnePercent = math.ceil(pagesNeededForSample)

print('Tota de 1% de livros: ', onePercentOfAllBooks)
print('Tota de requisições: ', totalRequestsToOnePercent)

Tota de 1% de livros:  720
Tota de requisições:  23


In [None]:
#%% Starting the requests that load the data
booksListData = []

# Pages are 1-based, so the range runs from 1 to totalRequestsToOnePercent inclusive.
for pageNumber in range(1, totalRequestsToOnePercent + 1):
  urlToReq = gutendexBooksUrl + gutendexPageParam + str(pageNumber)
  # Bound the wait and stop on HTTP errors rather than parsing an error body.
  pageResponse = req.get(urlToReq, timeout=30)
  pageResponse.raise_for_status()
  dataFromReq = pageResponse.content
  dataDecoded = js.loads(dataFromReq.decode(decodeUtf8Str))
  # extend() appends in place; rebuilding the list with `+` re-copies it every page.
  booksListData.extend(dataDecoded['results'])

print('Total de livros retornados: ', len(booksListData))

Total de livros retornados:  736


In [None]:
#%% Bloco para exibir dataframe
booksDataDf = pd.DataFrame(booksListData)

booksDataDf
booksDataDf

Unnamed: 0,id,title,authors,translators,subjects,bookshelves,languages,copyright,media_type,formats,download_count
0,84,"Frankenstein; Or, The Modern Prometheus","[{'name': 'Shelley, Mary Wollstonecraft', 'bir...",[],[Frankenstein's monster (Fictitious character)...,"[Gothic Fiction, Movie Books, Precursors of Sc...",[en],False,Text,{'text/plain': 'https://www.gutenberg.org/eboo...,84070
1,1513,Romeo and Juliet,"[{'name': 'Shakespeare, William', 'birth_year'...",[],"[Conflict of generations -- Drama, Juliet (Fic...",[],[en],False,Text,{'application/x-mobipocket-ebook': 'https://ww...,60644
2,1342,Pride and Prejudice,"[{'name': 'Austen, Jane', 'birth_year': 1775, ...",[],"[Courtship -- Fiction, Domestic fiction, Engla...","[Best Books Ever Listings, Harvard Classics]",[en],False,Text,{'application/octet-stream': 'https://www.gute...,53375
3,25344,The Scarlet Letter,"[{'name': 'Hawthorne, Nathaniel', 'birth_year'...",[],"[Adultery -- Fiction, Boston (Mass.) -- Histor...",[Banned Books from Anne Haight's list],[en],False,Text,{'text/plain; charset=us-ascii': 'https://www....,38073
4,11,Alice's Adventures in Wonderland,"[{'name': 'Carroll, Lewis', 'birth_year': 1832...",[],[Alice (Fictitious character from Carroll) -- ...,[Children's Literature],[en],False,Text,{'text/plain': 'https://www.gutenberg.org/eboo...,30353
...,...,...,...,...,...,...,...,...,...,...,...
731,28299,The Orbis Pictus,"[{'name': 'Comenius, Johann Amos', 'birth_year...","[{'name': 'Hoole, Charles', 'birth_year': 1610...","[Latin language -- Readers, Natural history --...",[],[en],False,Text,{'application/x-mobipocket-ebook': 'https://ww...,1042
732,2131,An Account of Egypt,"[{'name': 'Herodotus', 'birth_year': -484, 'de...","[{'name': 'Macaulay, G. C. (George Campbell)',...",[Egypt -- History -- To 332 B.C.],[Egypt],[en],False,Text,{'application/octet-stream': 'https://www.gute...,1040
733,15859,The Piazza Tales,"[{'name': 'Melville, Herman', 'birth_year': 18...",[],"[Manners and customs -- Fiction, Short stories]",[],[en],False,Text,{'text/plain; charset=us-ascii': 'https://www....,1040
734,71814,The railhead at Kysyl Khoto,"[{'name': 'Lang, Allen Kim', 'birth_year': 192...",[],"[Cold War -- Fiction, Science fiction, Short s...",[],[en],False,Text,{'application/octet-stream': 'https://www.gute...,1039


In [None]:
#%% Processing the authors data, extracting only the names
def _firstAuthorName(item):
  """Return the 'name' of the first entry when `item` is a non-empty list.

  Any other value (an empty list, or a value that is no longer a list because
  the cell was already run) is returned unchanged, so re-running is harmless.
  """
  if isinstance(item, list) and len(item) != 0:
    return item[0]['name']
  return item


# The previous condition (`item == []` and `len(item) != 0`) could never be true,
# so no name was ever extracted. Mapping element-wise builds a new one-column
# frame and avoids positional writes into a frame derived from booksDataDf.
authorsDict = booksDataDf["authors"].map(_firstAuthorName).to_frame()

colunsMap = authorsDict.keys().values

# Not used by this cell; kept defined in case another cell references it.
authorsValuesArray = []

authorsDict.head()

Unnamed: 0,authors
0,"[{'name': 'Shelley, Mary Wollstonecraft', 'bir..."
1,"[{'name': 'Shakespeare, William', 'birth_year'..."
2,"[{'name': 'Austen, Jane', 'birth_year': 1775, ..."
3,"[{'name': 'Hawthorne, Nathaniel', 'birth_year'..."
4,"[{'name': 'Carroll, Lewis', 'birth_year': 1832..."


In [None]:
#%% Writing the processed authors data back into the main dataframe
# Assign the column explicitly as a Series (aligned on the row index).
booksDataDf["authors"] = authorsDict["authors"]

booksDataDf.head()

Unnamed: 0,id,title,authors,translators,subjects,bookshelves,languages,copyright,media_type,formats,download_count
0,84,"Frankenstein; Or, The Modern Prometheus","[{'name': 'Shelley, Mary Wollstonecraft', 'bir...",[],[Frankenstein's monster (Fictitious character)...,"[Gothic Fiction, Movie Books, Precursors of Sc...",[en],False,Text,{'text/plain': 'https://www.gutenberg.org/eboo...,84070
1,1513,Romeo and Juliet,"[{'name': 'Shakespeare, William', 'birth_year'...",[],"[Conflict of generations -- Drama, Juliet (Fic...",[],[en],False,Text,{'application/x-mobipocket-ebook': 'https://ww...,60644
2,1342,Pride and Prejudice,"[{'name': 'Austen, Jane', 'birth_year': 1775, ...",[],"[Courtship -- Fiction, Domestic fiction, Engla...","[Best Books Ever Listings, Harvard Classics]",[en],False,Text,{'application/octet-stream': 'https://www.gute...,53375
3,25344,The Scarlet Letter,"[{'name': 'Hawthorne, Nathaniel', 'birth_year'...",[],"[Adultery -- Fiction, Boston (Mass.) -- Histor...",[Banned Books from Anne Haight's list],[en],False,Text,{'text/plain; charset=us-ascii': 'https://www....,38073
4,11,Alice's Adventures in Wonderland,"[{'name': 'Carroll, Lewis', 'birth_year': 1832...",[],[Alice (Fictitious character from Carroll) -- ...,[Children's Literature],[en],False,Text,{'text/plain': 'https://www.gutenberg.org/eboo...,30353


In [None]:
#%% Checking the dtype of each column
booksDataDf.dtypes

id                 int64
title             object
authors           object
translators       object
subjects          object
bookshelves       object
languages         object
copyright           bool
media_type        object
formats           object
download_count     int64
dtype: object

In [None]:
#%% Converting the values of the object-typed columns to str
# Columns whose values are cast to their string representation.
columnsToCast = ['title', 'authors', 'translators', 'subjects', 'bookshelves',
                 'languages', 'media_type', 'formats']
booksDataDf[columnsToCast] = booksDataDf[columnsToCast].astype(str)

In [None]:
#%% Checking the number of unique values in each column
booksDataDf.nunique()

id                736
title             720
authors           415
translators       138
subjects          677
bookshelves       182
languages          19
copyright           2
media_type          1
formats           736
download_count    634
dtype: int64

In [None]:
#%% There are more unique ids than unique titles, so some titles repeat; investigating that
# Boolean mask marking rows whose title already appeared in an earlier row.
repeatedTitleMask = booksDataDf.duplicated(subset=['title'])
duplicatedDf = booksDataDf[repeatedTitleMask]
duplicatedDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 41 to 664
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              16 non-null     int64 
 1   title           16 non-null     object
 2   authors         16 non-null     object
 3   translators     16 non-null     object
 4   subjects        16 non-null     object
 5   bookshelves     16 non-null     object
 6   languages       16 non-null     object
 7   copyright       16 non-null     bool  
 8   media_type      16 non-null     object
 9   formats         16 non-null     object
 10  download_count  16 non-null     int64 
dtypes: bool(1), int64(2), object(8)
memory usage: 1.4+ KB


In [None]:
#%% Titles really were repeated, which would be a problem when stating how many
# downloads each title actually has, so the counts are summed per title
booksGroupedByTitle = booksDataDf.groupby(by=['title'], as_index=False)
groupbyDF = booksGroupedByTitle['download_count'].sum()
groupbyDF.sort_values(by='download_count', ascending=False).head()

Unnamed: 0,title,download_count
171,"Frankenstein; Or, The Modern Prometheus",101031
338,Romeo and Juliet,60644
321,Pride and Prejudice,55729
575,The Scarlet Letter,39258
43,Alice's Adventures in Wonderland,32728


In [None]:
#%% The view above shows the 5 most downloaded books;
# the one right below shows the 5 least downloaded books
titlesByDownloadsDesc = groupbyDF.sort_values(by='download_count', ascending=False)
titlesByDownloadsDesc.tail()

Unnamed: 0,title,download_count
158,"Eve's Diary, Complete",1042
47,An Account of Egypt,1040
541,The Piazza Tales,1040
661,The railhead at Kysyl Khoto,1039
251,Milton: Minor Poems,1037


In [None]:
#%% Checking some summary statistics of the per-title frame, rounded to 2 decimals
groupbyDF.describe().round(2)

Unnamed: 0,download_count
count,720.0
mean,3563.16
std,6201.01
min,1037.0
25%,1333.25
50%,1751.5
75%,3205.5
max,101031.0
