# Libraries

In [88]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Amazon database 2009 - 2022

In [89]:
def importarCsv(path):
    return pd.read_csv(path)

In [90]:
df_amazon = importarCsv("../data/bestsellers_with_categories_2022_03_27.csv")

## Explore table

In [91]:
df_amazon.sample()

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
463,First 100 Words,Roger Priddy,4.7,17323,4,2018,Non Fiction


In [92]:
df_amazon.shape

(700, 7)

In [93]:
df_amazon.columns

Index(['Name', 'Author', 'User Rating', 'Reviews', 'Price', 'Year', 'Genre'], dtype='object')

In [94]:
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         700 non-null    object 
 1   Author       700 non-null    object 
 2   User Rating  700 non-null    float64
 3   Reviews      700 non-null    int64  
 4   Price        700 non-null    int64  
 5   Year         700 non-null    int64  
 6   Genre        700 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 38.4+ KB


## Explore data and its cohesiveness

In [95]:
df_amazon.duplicated().sum()

0

In [98]:
df_amazon.duplicated(subset=['Name', 'Year']).sum()

3

In [96]:
df_amazon.isna().sum()

Name           0
Author         0
User Rating    0
Reviews        0
Price          0
Year           0
Genre          0
dtype: int64

In [97]:
df_amazon.dtypes

Name            object
Author          object
User Rating    float64
Reviews          int64
Price            int64
Year             int64
Genre           object
dtype: object

In [99]:
df_amazon.describe()

Unnamed: 0,User Rating,Reviews,Price,Year
count,700.0,700.0,700.0,700.0
mean,4.639857,19255.195714,12.7,2015.5
std,0.218586,23613.443875,9.915162,4.034011
min,3.3,37.0,0.0,2009.0
25%,4.5,4987.25,7.0,2012.0
50%,4.7,10284.0,11.0,2015.5
75%,4.8,23358.0,15.0,2019.0
max,4.9,208917.0,105.0,2022.0


## Cleaning

In [100]:
amazon_csv = df_amazon
amazon_csv.columns = [i.lower() for i in amazon_csv.columns]

In [101]:
amazon_csv.rename({"name":"title"}, axis = 1, inplace = True)
amazon_csv.rename({"year":"year_bought"}, axis = 1, inplace = True)

In [103]:
amazon_csv.drop_duplicates(subset=['title', 'year_bought'], inplace = True)
# Some titles are, repeated, I guess because they were bestsellers in two different years.
# I think it's not the publication date, but the selling date

In [104]:
amazon_csv.drop(amazon_csv["year_bought"][amazon_csv["year_bought"]< 2020].index, axis = 0, inplace = True)

dtype('int64')

In [17]:
# We don't have the actual sales, and the order given is alphabetical by name.
# An approximation to order by number of sales would be the ammount of interactions/reviews
amazon_csv.sort_values(by = "reviews", ascending = False, inplace=True)

In [18]:
amazon_csv.reset_index(inplace = True)

In [19]:
amazon_csv.drop(["user rating", "reviews", "index"], axis = 1, inplace = True)

In [20]:
amazon_csv.to_csv("../data/amazon_projectII.csv", index=False)

In [21]:
amazon_csv

Unnamed: 0,title,author,price,year_bought,genre
0,Where the Crawdads Sing,Delia Owens,10,2022,Fiction
1,Where the Crawdads Sing,Delia Owens,12,2020,Fiction
2,The Midnight Library: A Novel,Matt Haig,13,2022,Fiction
3,The Midnight Library: A Novel,Matt Haig,13,2021,Fiction
4,A Promised Land,Barack Obama,16,2020,Non Fiction
...,...,...,...,...,...
145,Call Us What We Carry: Poems,Amanda Gorman,14,2021,Fiction
146,Maus I: A Survivor's Tale: My Father Bleeds Hi...,Art Spiegelman,14,2022,Fiction
147,Principles for Dealing with the Changing World...,Ray Dalio,21,2022,Non Fiction
148,Life Force: How New Breakthroughs in Precision...,Tony Robbins,20,2022,Non Fiction


# Spanish indie database 2020

Top Books in Spain: Ranking & Details, dataset from Kaggle (https://www.kaggle.com/datasets/thedevastator/top-books-in-spain-ranking-details)

In [22]:
df_spanish = importarCsv("../data/libros_mas_vendidos.csv")

## Explore table

In [23]:
df_spanish.sample()

Unnamed: 0,Puesto,Url,Título,Autor,Materias,Editorial,Traductor,Colección,Encuadernación,País de publicación,Idioma de publicación,Idioma original,ISBN,EAN,Dimensiones,Peso,Nº páginas,Fecha publicación,Ilustrador
36,3,https://www.todostuslibros.com/libros/independ...,Independencia,"Cercas, Javier",Ficción moderna y contemporanea,Tusquets Editores,,Andanzas,Tapa blanda o Bolsillo,España,Español,Español,978-84-9066-929-7,9788490669297,225 x 148 mm.,559 gramos,400,03-03-2021,


In [24]:
df_spanish.shape

(100, 19)

In [25]:
df_spanish.columns

Index(['Puesto', 'Url', 'Título', 'Autor', 'Materias', 'Editorial',
       'Traductor', 'Colección', 'Encuadernación', 'País de publicación ',
       'Idioma de publicación ', 'Idioma original ', 'ISBN', 'EAN',
       'Dimensiones', 'Peso', 'Nº páginas', 'Fecha publicación ',
       'Ilustrador'],
      dtype='object')

In [26]:
df_spanish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Puesto                  100 non-null    int64 
 1   Url                     100 non-null    object
 2   Título                  100 non-null    object
 3   Autor                   100 non-null    object
 4   Materias                100 non-null    object
 5   Editorial               100 non-null    object
 6   Traductor               32 non-null     object
 7   Colección               98 non-null     object
 8   Encuadernación          100 non-null    object
 9   País de publicación     100 non-null    object
 10  Idioma de publicación   100 non-null    object
 11  Idioma original         100 non-null    object
 12  ISBN                    100 non-null    object
 13  EAN                     100 non-null    int64 
 14  Dimensiones             99 non-null     object
 15  Peso   

## Explore data and its cohesiveness

In [27]:
df_spanish.duplicated().sum()

0

In [28]:
df_spanish.isna().sum()

Puesto                     0
Url                        0
Título                     0
Autor                      0
Materias                   0
Editorial                  0
Traductor                 68
Colección                  2
Encuadernación             0
País de publicación        0
Idioma de publicación      0
Idioma original            0
ISBN                       0
EAN                        0
Dimensiones                1
Peso                      21
Nº páginas                 0
Fecha publicación          0
Ilustrador                93
dtype: int64

In [29]:
df_spanish.dtypes

Puesto                     int64
Url                       object
Título                    object
Autor                     object
Materias                  object
Editorial                 object
Traductor                 object
Colección                 object
Encuadernación            object
País de publicación       object
Idioma de publicación     object
Idioma original           object
ISBN                      object
EAN                        int64
Dimensiones               object
Peso                      object
Nº páginas                 int64
Fecha publicación         object
Ilustrador                object
dtype: object

In [30]:
df_spanish.describe()

Unnamed: 0,Puesto,EAN,Nº páginas
count,100.0,100.0,100.0
mean,50.5,9788445000000.0,335.54
std,29.011492,140069000.0,168.939617
min,1.0,9788401000000.0,24.0
25%,25.75,9788417000000.0,200.0
50%,50.5,9788420000000.0,312.0
75%,75.25,9788434000000.0,448.0
max,100.0,9789808000000.0,936.0


## Cleaning

In [107]:
spanish_csv = df_spanish.drop(["Url", "Traductor", "Colección", "Encuadernación", "ISBN", "EAN", "Dimensiones", "Peso", "Ilustrador"], axis = 1)

In [108]:
spanish_csv.rename({"Puesto":"order"}, axis = 1, inplace = True)
spanish_csv.rename({"Título":"title"}, axis = 1, inplace = True)
spanish_csv.rename({"Autor":"author"}, axis = 1, inplace = True)
spanish_csv.rename({"Materias":"genre"}, axis = 1, inplace = True)
spanish_csv.rename({"Editorial":"editorial"}, axis = 1, inplace = True)
spanish_csv.rename({"País de publicación ":"country"}, axis = 1, inplace = True)
spanish_csv.rename({"Idioma de publicación ":"language"}, axis = 1, inplace = True)
spanish_csv.rename({"Idioma original ":"orig_language"}, axis = 1, inplace = True)
spanish_csv.rename({"Nº páginas":"pages"}, axis = 1, inplace = True)
spanish_csv.rename({"Fecha publicación ":"date"}, axis = 1, inplace = True)

In [109]:
spanish_csv["date"] = pd.to_datetime(spanish_csv["date"])

  spanish_csv["date"] = pd.to_datetime(spanish_csv["date"])


In [110]:
spanish_csv["year_pub"] = pd.DatetimeIndex(spanish_csv["date"]).year

In [111]:
spanish_csv.to_csv("../data/spanish_projectII.csv", index=False)

# Web Scrapping

In [43]:
# Best sellers from this independent conglomerate https://www.todostuslibros.com/mas_vendidos
url = "https://www.todostuslibros.com/mas_vendidos"
res = requests.get(url)
html = res.content
soup = BeautifulSoup(html, 'html.parser')
body = soup.find("main", {"class": "main"})
body

<main class="main">
<div class="container d-none d-sm-block">
<div class="row justify-content-center">
<ul class="breadcrumb">
<li>
<a href="/" title="Todos tus libros">Todos tus libros <i class="fas fa-angle-right"></i></a>
</li>
<li>
<a href="" title="Más vendidos">
                Más vendidos            </a>
</li>
</ul>
</div>
</div>
<div class="container">
<div class="row justify-content-center">
<div class="col" id="main-php-flash-errors">
</div>
</div>
</div>
<div class="container">
<div class="row justify-content-center">
<div class="col">
<h1 class="page-title">Los 100 más vendidos</h1>
<div class="header-list row">
<div class="col-12 col-sm-9 text-center text-sm-left"></div>
<div class="alter-view col-4 col-sm-3 text-right d-none d-md-block" id="alter-view">
<a>
<i class="fa fa-th"></i>
</a>
<a class="selected">
<i class="fa fa-bars"></i>
</a>
</div>
</div>
<div class="row">
<ul class="books col-12">
<li class="book row" data-gtm-disponibilidad="" data-gtm-editorial="B" data-

## Get titles

In [44]:
titles_info = body.find_all("li", {"class":"book row"})
titles_info

[<li class="book row" data-gtm-disponibilidad="" data-gtm-editorial="B" data-gtm-index="1" data-gtm-isbn="978-84-666-7568-0" data-gtm-precio="22.90" data-gtm-titulo="Todo vuelve (Todo arde 2)" id="book_978-84-666-7568-0">
 <div class="col-12 col-sm-12 col-md-9 book-content">
 <div class="row">
 <div class="book-image col-4 col-sm-3 col-md-3 col-lg-3">
 <a href="https://www.todostuslibros.com/libros/todo-vuelve-todo-arde-2_978-84-666-7568-0">
 <img alt="Todo vuelve (Todo arde 2)" height="150" src="https://static.cegal.es/imagenes/marcadas/9788466/978846667568.gif" title="Todo vuelve (Todo arde 2)" width="100"/>
 <span> 1 </span>
 </a>
 <!-- Solo Vista grid y móvil -->
 <a class="btn bookshelves-link" data-book-isbn="978-84-666-7568-0" data-book-title="Todo vuelve (Todo arde 2)" href="https://www.todostuslibros.com/login?_next=https://www.todostuslibros.com/libros/todo-vuelve-todo-arde-2_978-84-666-7568-0" title="Añadir a estantería">
 <i class="far fa-heart"></i>
 </a>
 <!-- //Solo  Vis

In [45]:
titles = [titles_info[i].get("data-gtm-titulo") for i in range(len(titles_info))]

## Get authors

In [46]:
authors_info = body.find_all("li", {"class":"book row"})
authors_info

[<li class="book row" data-gtm-disponibilidad="" data-gtm-editorial="B" data-gtm-index="1" data-gtm-isbn="978-84-666-7568-0" data-gtm-precio="22.90" data-gtm-titulo="Todo vuelve (Todo arde 2)" id="book_978-84-666-7568-0">
 <div class="col-12 col-sm-12 col-md-9 book-content">
 <div class="row">
 <div class="book-image col-4 col-sm-3 col-md-3 col-lg-3">
 <a href="https://www.todostuslibros.com/libros/todo-vuelve-todo-arde-2_978-84-666-7568-0">
 <img alt="Todo vuelve (Todo arde 2)" height="150" src="https://static.cegal.es/imagenes/marcadas/9788466/978846667568.gif" title="Todo vuelve (Todo arde 2)" width="100"/>
 <span> 1 </span>
 </a>
 <!-- Solo Vista grid y móvil -->
 <a class="btn bookshelves-link" data-book-isbn="978-84-666-7568-0" data-book-title="Todo vuelve (Todo arde 2)" href="https://www.todostuslibros.com/login?_next=https://www.todostuslibros.com/libros/todo-vuelve-todo-arde-2_978-84-666-7568-0" title="Añadir a estantería">
 <i class="far fa-heart"></i>
 </a>
 <!-- //Solo  Vis

In [47]:
authors = [authors_info[i].find_all("h3", {"class":"author"})[0].getText().strip() for i in range(len(authors_info))]

## Get secondary links to obtain other data

In [48]:
link_info = body.find_all("div", {"class": "book-image col-4 col-sm-3 col-md-3 col-lg-3"})
link_info

[<div class="book-image col-4 col-sm-3 col-md-3 col-lg-3">
 <a href="https://www.todostuslibros.com/libros/todo-vuelve-todo-arde-2_978-84-666-7568-0">
 <img alt="Todo vuelve (Todo arde 2)" height="150" src="https://static.cegal.es/imagenes/marcadas/9788466/978846667568.gif" title="Todo vuelve (Todo arde 2)" width="100"/>
 <span> 1 </span>
 </a>
 <!-- Solo Vista grid y móvil -->
 <a class="btn bookshelves-link" data-book-isbn="978-84-666-7568-0" data-book-title="Todo vuelve (Todo arde 2)" href="https://www.todostuslibros.com/login?_next=https://www.todostuslibros.com/libros/todo-vuelve-todo-arde-2_978-84-666-7568-0" title="Añadir a estantería">
 <i class="far fa-heart"></i>
 </a>
 <!-- //Solo  Vista grid y móvil -->
 </div>,
 <div class="book-image col-4 col-sm-3 col-md-3 col-lg-3">
 <a href="https://www.todostuslibros.com/libros/el-infierno_978-84-08-27758-3">
 <img alt="El Infierno" height="150" src="https://static.cegal.es/imagenes/marcadas/9788408/978840827758.gif" title="El Infiern

In [49]:
link_unpross = [link_info[i].find_all("a") for i in range(len(link_info))]
links = [link_unpross[i][0].get("href") for i in range(len(link_info))]

## Get genres

In [50]:
genres = []
for i in links:
    url = i
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    genres_unpross = soup.find_all("dl", {"class":"materias"})[0].getText().strip().split("\n")[1:]
    genre_for_each_title = [i.strip() for i in genres_unpross if "|" not in i][1:]
    genres.append(genre_for_each_title)

## Get editorial

In [51]:
editorials = []
for i in links:
    url = i
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    editorials_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find("a").getText()
    editorials.append(editorials_unpross)
editorials

['B',
 'Editorial Planeta',
 'PLAZA & JANES',
 'Libros Cúpula',
 'Editorial Bruño',
 'EDIBESA',
 'ALFAGUARA',
 'Diana Editorial',
 'Seix Barral',
 'ALFAGUARA',
 'Editorial Funambulista',
 'EDIBESA']

## Get country

In [52]:
countries = []
for i in links:
    url = i
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    countries_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-8].getText()
    countries.append(countries_unpross)
countries

['España',
 'España',
 'España',
 'España',
 'Castellano',
 'España',
 'España',
 'España',
 'España',
 'España',
 'España',
 'España']

## Get language

In [53]:
languages = []
for i in links:
    url = i
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    languages_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-7].getText()
    languages.append(languages_unpross)
languages

['Castellano',
 'Castellano',
 'Castellano',
 'Castellano',
 'Francés',
 'Castellano',
 'Castellano',
 'Castellano',
 'Castellano',
 'Castellano',
 'Castellano',
 'Castellano']

## Get original language

In [54]:
orig_language = []
for i in links:
    url = i
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    orig_language_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-6].getText()
    orig_language.append(orig_language_unpross)
orig_language

['Castellano',
 'Castellano',
 'Inglés',
 'Castellano',
 '                                         295\n                                         x                                         225\n                                     mm.',
 'Castellano',
 'Castellano',
 'Inglés',
 'Castellano',
 'Castellano',
 'Francés',
 'Castellano']

## Get pages

In [55]:
pages = []
for i in links:
    url = i
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    pages_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-3].getText()
    pages.append(pages_unpross)
pages

['608',
 '480',
 '832',
 '424',
 '48',
 '512',
 '328',
 '336',
 '240',
 '208',
 '584',
 '512']

## Get dates

In [56]:
dates = []
for i in links:
    url = i
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    dates_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-2].getText()
    dates.append(dates_unpross)
dates

['24-10-2023',
 '04-10-2023',
 '26-09-2023',
 '18-10-2023',
 '26-10-2023',
 '01-06-2023',
 '05-09-2023',
 '08-09-2020',
 '30-08-2023',
 '05-10-2023',
 '04-10-2023',
 '01-06-2023']

## Get prices

In [57]:
prices_info = body.find_all("div", {"class":"book-price d-block d-md-none"})
prices = [prices_info[i].find("strong").getText().strip() for i in range(len(prices_info))]
prices

['22.90€',
 '22.90€',
 '24.90€',
 '19.90€',
 '10.50€',
 '2.65€',
 '21.90€',
 '19.90€',
 '19.90€',
 '18.90€',
 '24.90€',
 '4.60€']

## Web scrapping functions

In [58]:
def getBodies(url):
    res = requests.get(url)
    html = res.content
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find("main", {"class": "main"})
    return body
body = getBodies("https://www.todostuslibros.com/mas_vendidos")

In [59]:
def getTitles(body):
    titles_info = body.find_all("li", {"class":"book row"})
    titles = [titles_info[i].get("data-gtm-titulo") for i in range(len(titles_info))]
    return titles
titles = getTitles(body)

In [60]:
def getAuthors(body):
    authors_info = body.find_all("li", {"class":"book row"})
    authors = [authors_info[i].find_all("h3", {"class":"author"})[0].getText().strip() for i in range(len(authors_info))]
    return authors
authors = getAuthors(body)

In [61]:
def getLinks(body):
    link_info = body.find_all("div", {"class": "book-image col-4 col-sm-3 col-md-3 col-lg-3"})
    link_unpross = [link_info[i].find_all("a") for i in range(len(link_info))]
    links = [link_unpross[i][0].get("href") for i in range(len(link_info))]
    return links
links = getLinks(body)

In [62]:
def getGenres(links):
    genres = []
    for i in links:
        url = i
        res = requests.get(url)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        genres_unpross = soup.find_all("dl", {"class":"materias"})[0].getText().strip().split("\n")[1:]
        genre_for_each_title = [i.strip() for i in genres_unpross if "|" not in i][1:]
        genres.append(genre_for_each_title)
    return genres
genres = getGenres(links)

In [63]:
def getEditorials(links):
    editorials = []
    for i in links:
        url = i
        res = requests.get(url)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        editorials_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find("a").getText()
        editorials.append(editorials_unpross)
    return editorials
editorials = getEditorials(links)

In [64]:
def getCountries(links):
    countries = []
    for i in links:
        url = i
        res = requests.get(url)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        countries_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-8].getText()
        countries.append(countries_unpross)
    return countries
countries = getCountries(links)

In [65]:
def getLanguages(links):
    languages = []
    for i in links:
        url = i
        res = requests.get(url)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        languages_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-7].getText()
        languages.append(languages_unpross)
    return languages
languages = getLanguages(links)

In [66]:
def getLang_orig(links):
    orig_language = []
    for i in links:
        url = i
        res = requests.get(url)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        orig_language_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-6].getText()
        orig_language.append(orig_language_unpross)
    return orig_language
orig_language = getLang_orig(links)

In [67]:
def getPages(links):
    pages = []
    for i in links:
        url = i
        res = requests.get(url)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        pages_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-3].getText()
        pages.append(pages_unpross)
    return pages
pages = getPages(links)

In [68]:
def getDates(links):
    fechas = []
    for i in links:
        url = i
        res = requests.get(url)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        fechas_unpross = soup.find_all("div", {"id":"collapseFic"})[0].find_all("dd")[-2].getText()
        fechas.append(fechas_unpross)
    return fechas
dates = getDates(links)

In [69]:
def getPrices(body):
    prices_info = body.find_all("div", {"class":"book-price d-block d-md-none"})
    prices = [prices_info[i].find("strong").getText().strip() for i in range(len(prices_info))]
    return prices
prices = getPrices(body)

In [70]:
url_basal = "https://www.todostuslibros.com/mas_vendidos"

def getUrls():
    urls = [(url_basal + "?page=" + str(i)) for i in range(1, 11)]
    return urls
urls = getUrls()

## Looping through 10 pages

In [71]:
bodies = [getBodies(i) for i in urls]

In [72]:
links_secondary = [getLinks(i) for i in bodies]

In [73]:
titles_100 = []
for i in bodies:
    titles = getTitles(i)
    titles_100.extend(titles)

In [74]:
authors_100 = []
for i in bodies:
    authors = getAuthors(i)
    authors_100.extend(authors)

In [75]:
genres_100 = []
for i in links_secondary:
    genres = getGenres(i)
    genres_100.extend(genres)

In [76]:
editorials_100 = []
for i in links_secondary:
    editorials = getEditorials(i)
    editorials_100.extend(editorials)

In [77]:
countries_100 = []
for i in links_secondary:
    countries = getCountries(i)
    countries_100.extend(countries)

In [78]:
languages_100 = []
for i in links_secondary:
    languages = getLanguages(i)
    languages_100.extend(languages)

In [79]:
orig_lang_100 = []
for i in links_secondary:
    orig_lang = getLang_orig(i)
    orig_lang_100.extend(orig_lang)

In [80]:
pages_100 = []
for i in links_secondary:
    pages = getPages(i)
    pages_100.extend(pages)

In [81]:
dates_100 = []
for i in links_secondary:
    dates = getDates(i)
    dates_100.extend(dates)

In [82]:
prices_100 = []

for i in bodies:
    prices = getPrices(i)
    prices_100.extend(prices)

In [83]:
dictionary = {
    "titles":titles_100,
    "authors":authors_100,
    "genres": genres_100,
    "editorial":editorials_100,
    "countries":countries_100,
    "languages":languages_100,
    "orig_language": orig_lang_100,
    "pages":pages_100,
    "dates":dates_100
}

In [84]:
pd.DataFrame(dictionary)

Unnamed: 0,titles,authors,genres,editorial,countries,languages,orig_language,pages,dates
0,Todo vuelve (Todo arde 2),"Gómez-Jurado, Juan","[Género policíaco y misterio, Obra de espionaj...",B,España,Castellano,Castellano,608,24-10-2023
1,El Infierno,"Mola, Carmen","[Ficción histórica, Obra de misterio y suspens...",Editorial Planeta,España,Castellano,Castellano,480,04-10-2023
2,La armadura de la luz (Saga Los pilares de la ...,"Follett, Ken","[Ficción de las guerras napoleónicas, Ficción ...",PLAZA & JANES,España,Castellano,Inglés,832,26-09-2023
3,La chica del verano,La Vecina Rubia,[Ficción moderna y contemporanea],Libros Cúpula,España,Castellano,Castellano,424,18-10-2023
4,El Lirio Blanco,"Goscinny, René / Fabcaro","[HUMOR, Edad de interés: a partir de 10 años, ...",Editorial Bruño,Castellano,Francés,295\n...,48,26-10-2023
...,...,...,...,...,...,...,...,...,...
95,Las reglas del ratoncito Pérez (Cuentos para c...,"Moreno, Eloy","[Edad de interés: a partir de 6 años, Cuentos ...",Nube de Tinta,España,Castellano,Castellano,48,07-09-2023
96,Las garras del águila: una novela de Lisbeth S...,"Smirnoff, Karin","[Obra de misterio y suspense, Género policíaco...",Ediciones Destino,España,Castellano,Sueco,592,30-08-2023
97,Cómo vender una casa encantada,"Hendrix, Grady","[Ficción moderna y contemporanea, Ficción y te...",Minotauro,España,Castellano,Inglés,440,18-10-2023
98,Biografía del silencio,"d'Ors, Pablo",[Ficción moderna y contemporanea],Galaxia Gutenberg,España,Castellano,Castellano,112,12-02-2020


### Functions to gather the data and create the df

In [85]:
bodies = [getBodies(i) for i in urls]
links_secondary = [getLinks(i) for i in bodies]

def getTitles_df(bodies):
    titles_100 = []
    for i in bodies:
        titles = getTitles(i)
        titles_100.extend(titles)
    return titles_100

def getAuthors_df(bodies):
    authors_100 = []
    for i in bodies:
        authors = getAuthors(i)
        authors_100.extend(authors)
    return authors_100

def getGenres_df(links_secondary):
    genres_100 = []
    for i in links_secondary:
        genres = getGenres(i)
        genres_100.extend(genres)
    return genres_100

def getEditorials_df(links_secondary):
    editorials_100 = []
    for i in links_secondary:
        editorials = getEditorials(i)
        editorials_100.extend(editorials)
    return editorials_100

def getCountries_df(links_secondary):
    countries_100 = []
    for i in links_secondary:
        countries = getCountries(i)
        countries_100.extend(countries)
    return countries_100

def getLanguages_df(links_secondary):
    languages_100 = []
    for i in links_secondary:
        languages = getLanguages(i)
        languages_100.extend(languages)
    return languages_100

def getOrigLang_df(links_secondary):
    orig_lang_100 = []
    for i in links_secondary:
        orig_lang = getLang_orig(i)
        orig_lang_100.extend(orig_lang)
    return orig_lang_100

def getPages_df(links_secondary):
    pages_100 = []
    for i in links_secondary:
        pages = getPages(i)
        pages_100.extend(pages)
    return pages_100

def getDates_df(links_secondary):
    dates_100 = []
    for i in links_secondary:
        dates = getDates(i)
        dates_100.extend(dates)
    return dates_100

def getPrices_df(bodies):
    prices_100 = []
    for i in bodies:
        prices = getPrices(i)
        prices_100.extend(prices)
    return prices_100

In [105]:
def build_df(titl, auth, genr, edit, countr, lang, orig_lang, pag, date, price):
    dictionary = {
        "title":titl,
        "author":auth,
        "genre": genr,
        "editorial":edit,
        "country":countr,
        "language":lang,
        "orig_language": orig_lang,
        "pages":pag,
        "date":date,
        "price": price
    }
    library_csv = pd.DataFrame(dictionary)
    library_csv.to_csv("../data/web_projectII.csv", index=False)

In [112]:
titles_df = getTitles_df(bodies)
authors_df = getAuthors_df(bodies)
genres_df = getGenres_df(links_secondary)
editorials_df = getEditorials_df(links_secondary)
countries_df = getCountries_df(links_secondary)
languages_df = getLanguages_df(links_secondary)
orig_lang_df = getOrigLang_df(links_secondary)
pages_df = getPages_df(links_secondary)
dates_df = getDates_df(links_secondary)
prices_df = getPrices_df(bodies)

build_df(titles_df, authors_df, genres_df, editorials_df, countries_df, languages_df, orig_lang_df, pages_df, dates_df, prices_df)