# Web scraping. 2

In [None]:
# BeautifulSoup library documentation
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [1]:
# Basic libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Source page: https://bigdatawirtz.github.io/exemplo-web/08.html
url = 'https://bigdatawirtz.github.io/exemplo-web/08.html'
# A timeout keeps the cell from hanging forever if the server never answers.
paxina = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
paxina.raise_for_status()

# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(paxina.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Páxina web con seccións
  </title>
  <meta charset="utf-8"/>
 </head>
 <body>
  <article>
   <header class="art">
    <h1>
     Título do artigo
    </h1>
    <p>
     Introducción do artigo
    </p>
   </header>
   <section id="main_content">
    <p>
     Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
    </p>
   </section>
   <section id="aux_content">
    <p>
     Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    </p>
   </section>
   <footer class="art">
    <p>
     Pé do artigo
    </p>
   </footer>
  </article>
  <footer class="web">
   <p>
    Pé de toda a web
   </p>
  </footer>
 </body

In [3]:
# Show the footers
# The page has two footers (one for the article and another for the whole site)
soup.find_all('footer')

[<footer class="art">
 <p>Pé do artigo</p>
 </footer>,
 <footer class="web">
 <p>Pé de toda a web</p>
 </footer>]

In [4]:
# Besides selecting by tag name...
# we can also select by CSS class (the keyword is `class_` because `class` is a reserved word in Python)
soup.find_all(class_='art')

[<header class="art">
 <h1>Título do artigo</h1>
 <p>Introducción do artigo</p>
 </header>,
 <footer class="art">
 <p>Pé do artigo</p>
 </footer>]

In [5]:
# Combine both filters: tag name and class
# Select the footer(s) with the 'art' class
soup.find_all('footer', class_='art')

[<footer class="art">
 <p>Pé do artigo</p>
 </footer>]

In [6]:
# Same idea for the header: tag name 'header' plus the 'art' class
soup.find_all('header', class_='art')

[<header class="art">
 <h1>Título do artigo</h1>
 <p>Introducción do artigo</p>
 </header>]

In [7]:
# Select by id
# An id should be unique within a page, so use find (first match) instead of find_all (list of matches)
soup.find(id='main_content')

<section id="main_content">
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
</section>

In [8]:
# Attribute access is a shortcut for the first tag with that name: here, the first <article>
soup.article

<article>
<header class="art">
<h1>Título do artigo</h1>
<p>Introducción do artigo</p>
</header>
<section id="main_content">
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
</section>
<section id="aux_content">
<p>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</section>
<footer class="art">
<p>Pé do artigo</p>
</footer>
</article>

In [9]:
# .contents lists the direct children of <article>;
# the '\n' entries are the whitespace text nodes that sit between the child tags
soup.article.contents

['\n',
 <header class="art">
 <h1>Título do artigo</h1>
 <p>Introducción do artigo</p>
 </header>,
 '\n',
 <section id="main_content">
 <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
 </section>,
 '\n',
 <section id="aux_content">
 <p>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
 </section>,
 '\n',
 <footer class="art">
 <p>Pé do artigo</p>
 </footer>,
 '\n']

In [10]:

# Keep only the real child elements of <article>, skipping the '\n' text nodes.
# Build a new list rather than calling .remove() on .contents: that list is the
# tree's own child list, so editing it in place would alter the parsed document.
contidos = [fillo for fillo in soup.article.contents if fillo != '\n']
contidos

[<header class="art">
 <h1>Título do artigo</h1>
 <p>Introducción do artigo</p>
 </header>,
 <section id="main_content">
 <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
 </section>,
 <section id="aux_content">
 <p>Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
 </section>,
 <footer class="art">
 <p>Pé do artigo</p>
 </footer>]

In [11]:
# Access individual elements by position: the first one is the <header>
contidos[0]

<header class="art">
<h1>Título do artigo</h1>
<p>Introducción do artigo</p>
</header>

In [12]:
# Second element: the main_content <section>
contidos[1]

<section id="main_content">
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
</section>

In [13]:
# Each element is itself a tag, so it has its own .contents (whitespace text nodes included)
contidos[1].contents

['\n',
 <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>,
 '\n']

## Table Scraping

The information we want to download is stored in HTML tables, and our goal is to load those tables into pandas DataFrames. This conversion usually requires some manipulation of the text, numbers and other data contained in the original table.

The typical structure of a table in `html` is:

```
<table>
    <thead>
        <tr>
            <th>Columna A</th>
            <th>Columna B</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>A1</td>
            <td>B1</td>
        </tr>
        <tr>
            <td>A2</td>
            <td>B2</td>
        </tr>
    </tbody>
</table>   
```

In [14]:
# Source page: https://bigdatawirtz.github.io/exemplo-web/taboa_simple.html
url = 'https://bigdatawirtz.github.io/exemplo-web/taboa_simple.html'
# A timeout keeps the cell from hanging forever if the server never answers.
paxina = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
paxina.raise_for_status()

# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(paxina.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Taboa simple
  </title>
  <meta charset="utf-8"/>
 </head>
 <body>
  <p>
   Temperatura da Coruña
  </p>
  <table border="2" title="Taboa simple">
   <tr>
    <td>
     Mes
    </td>
    <td>
     Temperatura media
    </td>
   </tr>
   <tr>
    <td>
     Xaneiro
    </td>
    <td>
     11
    </td>
   </tr>
   <tr>
    <td>
     Febreiro
    </td>
    <td>
     11
    </td>
   </tr>
   <tr>
    <td>
     Marzo
    </td>
    <td>
     12
    </td>
   </tr>
   <tr>
    <td>
     Abril
    </td>
    <td>
     13
    </td>
   </tr>
   <tr>
    <td>
     Maio
    </td>
    <td>
     15
    </td>
   </tr>
   <tr>
    <td>
     Xuño
    </td>
    <td>
     18
    </td>
   </tr>
   <tr>
    <td>
     Xullo
    </td>
    <td>
     19
    </td>
   </tr>
   <tr>
    <td>
     Agosto
    </td>
    <td>
     20
    </td>
   </tr>
   <tr>
    <td>
     Setembro
    </td>
    <td>
     18
    </td>
   </tr>
   <tr>
    <td>
     Outubro
    </td>
    <td>


In [20]:
# Every <tr> is one table row
lista_filas = soup.find_all('tr')

# First row: the column titles (this table uses <td> for them too)
fila_0 = lista_filas[0].find_all('td')
for cela in fila_0:
    print(cela.text)

# Second row: the first data row
fila_1 = lista_filas[1].find_all('td')
for cela in fila_1:
    print(cela.text)


Mes
Temperatura media
Xaneiro
11


In [21]:
# Walk the rows in document order and print the text of every <td> cell
lista_filas = soup.find_all('tr')

todas_as_celas = (
    cela
    for fila in lista_filas
    for cela in fila.find_all('td')
)
for cela in todas_as_celas:
    print(cela.text)

Mes
Temperatura media
Xaneiro
11
Febreiro
11
Marzo
12
Abril
13
Maio
15
Xuño
18
Xullo
19
Agosto
20
Setembro
18
Outubro
16
Novembro
13
Decembro
11


In [22]:
# One inner list per <tr>, holding the text of each of its <td> cells.
# A nested comprehension replaces the manual "append, then reset the temporary
# list" pattern and leaves no scratch variable behind in the kernel.
lista_filas = soup.find_all('tr')
taboa = [[cela.text for cela in fila.find_all('td')] for fila in lista_filas]
taboa

[['Mes', 'Temperatura media'],
 ['Xaneiro', '11'],
 ['Febreiro', '11'],
 ['Marzo', '12'],
 ['Abril', '13'],
 ['Maio', '15'],
 ['Xuño', '18'],
 ['Xullo', '19'],
 ['Agosto', '20'],
 ['Setembro', '18'],
 ['Outubro', '16'],
 ['Novembro', '13'],
 ['Decembro', '11']]

In [23]:
# Load the scraped rows as-is; the header row is still ordinary data (row 0)
df = pd.DataFrame(taboa)
df

Unnamed: 0,0,1
0,Mes,Temperatura media
1,Xaneiro,11
2,Febreiro,11
3,Marzo,12
4,Abril,13
5,Maio,15
6,Xuño,18
7,Xullo,19
8,Agosto,20
9,Setembro,18


In [24]:
# Promote the first row to column names and drop it from the data.
# Start again from `taboa` (not from the current `df`) so that re-running this
# cell gives the same result instead of renaming/dropping a second time.
df = pd.DataFrame(taboa)
df = df.rename(columns=df.iloc[0]).drop(0)

In [25]:
# Check the result: the first scraped row now provides the column names
df

Unnamed: 0,Mes,Temperatura media
1,Xaneiro,11
2,Febreiro,11
3,Marzo,12
4,Abril,13
5,Maio,15
6,Xuño,18
7,Xullo,19
8,Agosto,20
9,Setembro,18
10,Outubro,16


In [26]:
# Use the month as the row index.
# The guard makes the cell safe to re-run: once 'Mes' is the index it is no
# longer a column, and a second set_index('Mes') would raise KeyError.
if df.index.name != 'Mes':
    df = df.set_index('Mes')

In [None]:
# Show the frame, now indexed by month
df

In [27]:
# Source page: https://bigdatawirtz.github.io/exemplo-web/05.html
url = 'https://bigdatawirtz.github.io/exemplo-web/05.html'
# A timeout keeps the cell from hanging forever if the server never answers.
paxina = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
paxina.raise_for_status()

# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(paxina.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Tablas
  </title>
  <meta charset="utf-8"/>
 </head>
 <body>
  <table border="2" title="Tabla de Gastos">
   <caption>
    Tabla de Gastos
   </caption>
   <thead>
    <tr>
     <th>
      Mes
     </th>
     <th>
      Ganacias
     </th>
    </tr>
   </thead>
   <tbody>
    <tr>
     <td>
      Enero
     </td>
     <td>
      100 €
     </td>
    </tr>
    <tr>
     <td>
      Febrero
     </td>
     <td>
      80 €
     </td>
    </tr>
   </tbody>
   <tfoot>
    <tr>
     <td>
      Total
     </td>
     <td>
      180 €
     </td>
    </tr>
   </tfoot>
  </table>
  <hr/>
  <table border="1" title="Tabla de pruebas">
   <caption>
    Tabla de pruebas de colspan y rowspan
   </caption>
   <tbody>
    <tr>
     <td colspan="2">
      Esto tiene un colspan de 2
     </td>
    </tr>
    <tr>
     <td rowspan="2">
      Esta tiene un rowspan de 2
     </td>
     <td>
      nada por aquí
     </td>
    </tr>
    <tr>
     <td>
      Una cosa
  

In [28]:
# Locate the table: find() returns only the first <table>,
# which on this page is the one titled "Tabla de Gastos"
taboa_gastos = soup.find('table')
taboa_gastos

<table border="2" title="Tabla de Gastos">
<caption>Tabla de Gastos</caption>
<thead>
<tr>
<th>Mes</th>
<th>Ganacias</th>
</tr>
</thead>
<tbody>
<tr>
<td>Enero</td>
<td>100 €</td>
</tr>
<tr>
<td>Febrero</td>
<td>80 €</td>
</tr>
</tbody>
<tfoot>
<tr>
<td>Total</td>
<td>180 €</td>
</tr>
</tfoot>
</table>

In [29]:
# e.g. access the text of the first cell of the table header
taboa_gastos.find('th').text


'Mes'

In [30]:
# One inner list per row. Passing a list of names to find_all matches any of
# them, so header cells (<th>) and data cells (<td>) are collected together.
lista = [
    [cela.text for cela in fila.find_all(['td', 'th'])]
    for fila in taboa_gastos.find_all('tr')
]
lista

[['Mes', 'Ganacias'],
 ['Enero', '100 €'],
 ['Febrero', '80 €'],
 ['Total', '180 €']]

In [31]:
import pandas as pd

In [32]:
# Create a DataFrame from the scraped rows (the header row is still row 0)
df = pd.DataFrame(lista)
df

Unnamed: 0,0,1
0,Mes,Ganacias
1,Enero,100 €
2,Febrero,80 €
3,Total,180 €


In [33]:
# Build the frame and promote its first row to column names in a single chain
df = pd.DataFrame(lista).pipe(
    lambda bruto: bruto.rename(columns=bruto.iloc[0]).drop(0)
)
df

Unnamed: 0,Mes,Ganacias
1,Enero,100 €
2,Febrero,80 €
3,Total,180 €


In [34]:
# Show the table indexed by month.
# NOTE: set_index returns a new frame and the result is not assigned here,
# so `df` itself still has 'Mes' as a regular column after this cell.
df.set_index('Mes')

Unnamed: 0_level_0,Ganacias
Mes,Unnamed: 1_level_1
Enero,100 €
Febrero,80 €
Total,180 €
