### Web Scraping é o termo utilizado para definir a prática de coletar automaticamente informações na Internet. Isto é feito, geralmente, por meio de programas que simulam a navegação humana na Web.

##  Ambiente e bibliotecas

In [1]:
import bs4
import urllib.request as urllib_request
import pandas

# Report the versions of the libraries used throughout this notebook.
print("BeautifulSoup ->", bs4.__version__)
# NOTE(review): urllib.request.__version__ appears to mirror the Python
# major.minor version (the recorded output shows 3.9) rather than a
# standalone package version -- confirm before relying on it.
print("urllib ->", urllib_request.__version__)
print("pandas ->", pandas.__version__)

BeautifulSoup -> 4.11.1
urllib -> 3.9
pandas -> 1.4.4


## Meu primeiro scraping

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Minimal end-to-end example: download a page, parse it and print two elements.
url = 'https://alura-site-scraping.herokuapp.com/hello-world.php'

# The context manager closes the HTTP connection as soon as the body is read.
with urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'html.parser')

# find() returns the first matching tag; get_text() extracts its text content.
print(soup.find('h1', id="hello-world").get_text())
print(soup.find('p').get_text())

In [None]:
from urllib.request import urlopen

url = 'https://alura-site-scraping.herokuapp.com/index.php'

# Read the raw response body and release the connection right away.
with urlopen(url) as response:
    html = response.read()
html

In [None]:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

url = 'https://www.alura.com.br'
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

try:
    req = Request(url, headers=headers)
    # The context manager closes the connection even if read() fails.
    with urlopen(req) as response:
        print(response.read())

# HTTPError is a subclass of URLError, so it has to be handled first.
except HTTPError as e:
    # `code` is the documented attribute that carries the HTTP status.
    print(e.code, e.reason)

except URLError as e:
    print(e.reason)
    

## Tratamento de string

In [None]:
from urllib.request import urlopen

url = 'https://alura-site-scraping.herokuapp.com/index.php'

# Read the raw response body and release the connection right away.
with urlopen(url) as response:
    html = response.read()
html

### Convertendo o tipo bytes para string

In [None]:
type(html)

In [None]:
html = html.decode('utf-8')

In [None]:
type(html)

In [None]:
html

### Eliminando os caracteres de tabulação, quebra de linha etc.

In [None]:
html.split()

In [None]:
" ".join(html.split())

### Eliminando os espaços em branco entre as TAGS

In [None]:
" ".join(html.split()).replace('> <', '><')

### Função de tratamento de strings

In [None]:
def trata_html(input):
    """Return *input* with whitespace normalised for HTML parsing.

    Every run of whitespace (spaces, tabs, line breaks) is collapsed into a
    single space, and the single space left between adjacent tags
    (``> <``) is then removed.
    """
    collapsed = " ".join(input.split())
    return collapsed.replace('> <', '><')

In [None]:
html

In [None]:
html = trata_html(html)
html

## Criando um objeto BeautifulSoup

In [None]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')

soup

In [None]:
print(soup.prettify())

## Acessando tags

In [None]:
soup.html

In [None]:
soup.html.head.title

In [None]:
soup.title

In [None]:
soup.div.div.div.div.h5

In [None]:
soup.h5

## Acessando o conteúdo das tags

In [None]:
soup.html.head.title

In [None]:
soup.title

In [None]:
soup.title.get_text()

In [None]:
soup.h5.getText()

In [None]:
soup.get_text()

## Acessando os atributos de uma tag

In [None]:
soup.img

In [None]:
soup.img.attrs

In [None]:
soup.img.attrs.keys()

In [None]:
soup.img.attrs.values()

In [None]:
soup.img['class']

In [None]:
soup.img.get('src')

### Método find()

In [None]:
soup.find('img')

In [None]:
soup.img

### Método findAll()

In [None]:
soup.findAll('img')

### Comando equivalente ao método find()

In [None]:
soup.findAll('img', limit = 1)[0]

### Atalho para o método findAll()

In [None]:
soup('img')

### Passando listas de TAGs

In [None]:
soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

### Utilizando o argumento attributes

In [None]:
soup.findAll('p', {"class": "txt-value"})

### Buscando por conteúdo de uma TAG

In [None]:
# `string` is the current name of the argument formerly called `text`.
soup.findAll('p', string = "Belo Horizonte - MG")

### Utilizando diretamente os atributos

In [None]:
soup.findAll('img', alt="Foto")

In [None]:
for item in soup.findAll('img', alt="Foto"):
    print(item.get('src'))

### Cuidado com o atributo "class"

In [None]:
soup.findAll('p', class_="txt-value")

### Obtendo todo o conteúdo de texto de uma página

In [None]:
# `string` is the current name of the argument formerly called `text`;
# string=True matches every text node in the document.
soup.findAll(string = True)

## Outros métodos de pesquisa

In [None]:
html_teste = """
    <html>
        <body>
            <div id="container-a">
                <h1>Título A</h1>
                <h2 class="ref-a">Sub título A</h2>
                <p>Texto de conteúdo A</p>
            </div>
            <div id="container-b">
                <h1>Título B</h1>
                <h2 class="ref-b">Sub título B</h2>
                <p>Texto de conteúdo B</p>
            </div>
        </body>
    </html>
"""

### Tratamentos para a string HTML

In [None]:
html_teste

In [None]:
html_teste = trata_html(html_teste)
html_teste

### Criando o objeto BeautifulSoup

In [None]:
soup = BeautifulSoup(html_teste, 'html.parser')
soup

### Parents

In [None]:
soup.find('h2')

In [None]:
soup.find('h2').find_parent('div')

In [None]:
soup.find('h2').find_parents()

In [None]:
soup.findAll('h2')

In [None]:
for item in soup.findAll('h2'):
    print(item.find_parent('div'))

## Siblings

In [None]:
soup.find('h2').findNextSibling()

In [None]:
soup.find('h2').findPreviousSibling()

In [None]:
soup.find('p').findPreviousSiblings()

## Next e Previous

In [None]:
soup.find('h2').findNext()

In [None]:
soup.find('h2').findPrevious()

In [None]:
soup.find('h2').findAllNext()

## Identificando e selecionando os dados no HTML

### Obtendo o HTML e criando o objeto BeautifulSoup

In [None]:
# Download the listing page and parse it; the connection is closed on exit
# from the `with` block.
with urlopen('https://alura-site-scraping.herokuapp.com/index.php') as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
soup

### Criando variáveis para armazenar informações

In [None]:
cards = []
card = {}

### Obtendo os dados do primeiro CARD

In [None]:
anuncio = soup.find('div', {'class': 'well card'})
anuncio

## Obtendo o VALOR do veículo anunciado

In [None]:
anuncio

In [None]:
anuncio.find('div', {'class': 'value-card'})

In [None]:
anuncio.find('p', {'class': 'txt-value'}).getText()

In [None]:
card['value'] = anuncio.find('p', {'class': 'txt-value'}).getText()

In [None]:
card

### Resumo

In [None]:
# Valor
card['value'] = anuncio.find('p', {'class': 'txt-value'}).getText()

## Obtendo as INFORMAÇÕES sobre o veículo anunciado

In [None]:
anuncio.find('div', {'class': 'body-card'}).findAll('p')

In [None]:
infos = anuncio.find('div', {'class': 'body-card'}).findAll('p')

In [None]:
for info in infos:
    print(info.get('class'), ' - ', info.get_text())

In [None]:
for info in infos:
    print(info.get('class')[0], ' - ', info.get_text())

In [None]:
for info in infos:
    print(info.get('class')[0].split('-'), ' - ', info.get_text())

In [None]:
for info in infos:
    print(info.get('class')[0].split('-')[-1], ' - ', info.get_text())

In [None]:
for info in infos:
    card[info.get('class')[0].split('-')[-1]] = info.get_text()

In [None]:
card

### Resumo

In [None]:
# Details: store the text of each <p> in the card body under the last
# dash-separated piece of its first CSS class name.
infos = anuncio.find('div', {'class': 'body-card'}).findAll('p')
for info in infos:
    field = info.get('class')[0].split('-')[-1]
    card[field] = info.get_text()

## Obtendo os ACESSÓRIOS do veículo anunciado

In [None]:
anuncio.find('div', {'class': 'body-card'}).ul.findAll('li')

In [None]:
items = anuncio.find('div', {'class': 'body-card'}).ul.findAll('li')
items

In [None]:
items.pop()

In [None]:
items

In [None]:
for item in items:
    print(item.getText().replace('► ', ''))

In [None]:
acessorios = []
for item in items:
    acessorios.append(item.getText().replace('► ', ''))
    
acessorios

In [None]:
card['items'] = acessorios

In [None]:
card

### Resumo

In [None]:
# Accessories: every <li> of the card body except the last one.
# NOTE(review): the last entry is presumably not a real accessory -- confirm
# against the page markup. Slicing (instead of pop()) also tolerates an
# empty list.
items = anuncio.find('div', {'class': 'body-card'}).ul.findAll('li')[:-1]
# Strip the bullet marker that prefixes each accessory.
acessorios = [item.getText().replace('► ', '') for item in items]
card['items'] = acessorios

## Criando um DataFrame com os dados coletados do Alura Motors

In [None]:
card

In [None]:
import pandas as pd

In [None]:
dataset = pd.DataFrame(card)

In [None]:
dataset

In [None]:
dataset = pd.DataFrame.from_dict(card, orient = 'index')
dataset

In [None]:
dataset = pd.DataFrame.from_dict(card, orient = 'index').T
dataset

In [None]:
import os

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs('./output/data', exist_ok=True)
dataset.to_csv('./output/data/dataset.csv', sep=';', index = False, encoding = 'utf-8-sig')

## Obtendo a FOTO do anúncio

In [None]:
image = anuncio.find('div', {'class': 'image-card'}).img
image

In [None]:
image.get('src')

In [None]:
print(image.get('src'))

In [None]:
image.get('src').split('/')[-1]

In [None]:
import os
from urllib.request import urlretrieve

# urlretrieve() writes straight to the given path, so the folder must exist.
os.makedirs('./output/img', exist_ok=True)

# The local file keeps the name found in the last segment of the image URL.
image_url = image.get('src')
urlretrieve(image_url, './output/img/' + image_url.split('/')[-1])

### Resumo

In [None]:
# Images
import os

image = anuncio.find('div', {'class': 'image-card'}).img
# urlretrieve() writes straight to the given path, so the folder must exist.
os.makedirs('./output/img', exist_ok=True)
# The local file keeps the name found in the last segment of the image URL.
image_url = image.get('src')
urlretrieve(image_url, './output/img/' + image_url.split('/')[-1])

## Identificando as informações no HTML

In [None]:
len(soup.find('div', {"id": "container-cards"}).findAll('div', class_="card"))

In [None]:
anuncios = soup.find('div', {"id": "container-cards"}).findAll('div', class_="card")

In [None]:
for anuncio in anuncios:
    print(str(anuncio) + "\n\n")

## Criando uma rotina de scraping

In [None]:
# Import libraries
import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import pandas as pd

# Make sure the output folders exist: neither urlretrieve() nor to_csv()
# creates missing directories.
os.makedirs('./output/img', exist_ok=True)
os.makedirs('./output/data', exist_ok=True)

# Accumulates one dict per ad
cards = []

# Fetch the HTML; the context manager closes the connection after reading
with urlopen('https://alura-site-scraping.herokuapp.com/index.php') as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')

# Select the tags of interest (find_all is the current name of findAll)
anuncios = soup.find('div', {"id": "container-cards"}).find_all('div', class_="card")

# Collect the data of every card
for anuncio in anuncios:
    card = {}

    # Price
    card['value'] = anuncio.find('p', {'class': 'txt-value'}).get_text()

    # Details: the key is the last dash-separated piece of the first CSS class
    infos = anuncio.find('div', {'class': 'body-card'}).find_all('p')
    for info in infos:
        card[info.get('class')[0].split('-')[-1]] = info.get_text()

    # Accessories: every <li> except the last one, without the bullet marker.
    # Slicing (instead of pop()) also tolerates an empty list.
    items = anuncio.find('div', {'class': 'body-card'}).ul.find_all('li')[:-1]
    acessorios = [item.get_text().replace('► ', '') for item in items]
    card['items'] = acessorios

    # Append the result to the cards list
    cards.append(card)

    # Images: the local file keeps the last segment of the image URL as name
    image = anuncio.find('div', {'class': 'image-card'}).img
    image_url = image.get('src')
    urlretrieve(image_url, './output/img/' + image_url.split('/')[-1])

# Build a DataFrame with the results and persist it as CSV
dataset = pd.DataFrame(cards)
dataset.to_csv('./output/data/dataset.csv', sep=';', index = False, encoding = 'utf-8-sig')
dataset

In [None]:
cards

## Identificando as informações no HTML

In [None]:
soup.find('span', class_="info-pages")

In [None]:
soup.find('span', class_="info-pages").get_text()

In [None]:
soup.find('span', class_="info-pages").get_text().split()

In [None]:
soup.find('span', class_="info-pages").get_text().split()[-1]

In [None]:
int(soup.find('span', class_="info-pages").get_text().split()[-1])

## Criando uma rotina de scraping

In [None]:
# Import libraries
import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import pandas as pd

# Make sure the output folders exist: neither urlretrieve() nor to_csv()
# creates missing directories.
os.makedirs('./output/img', exist_ok=True)
os.makedirs('./output/data', exist_ok=True)

# Accumulates one dict per ad, across all pages
cards = []

# Fetch the first page only to read the total number of pages, taken from
# the last whitespace-separated token of the "info-pages" span.
with urlopen('https://alura-site-scraping.herokuapp.com/index.php') as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
pages = int(soup.find('span', class_="info-pages").get_text().split()[-1])

# Iterate over every page of the site (page numbers start at 1)
for page in range(1, pages + 1):
    # Fetch the HTML; each connection is closed as soon as its body is read
    with urlopen('https://alura-site-scraping.herokuapp.com/index.php?page=' + str(page)) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    # Select the tags of interest (find_all is the current name of findAll)
    anuncios = soup.find('div', {"id": "container-cards"}).find_all('div', class_="card")

    # Collect the data of every card
    for anuncio in anuncios:
        card = {}

        # Price
        card['value'] = anuncio.find('p', {'class': 'txt-value'}).get_text()

        # Details: the key is the last dash-separated piece of the first CSS class
        infos = anuncio.find('div', {'class': 'body-card'}).find_all('p')
        for info in infos:
            card[info.get('class')[0].split('-')[-1]] = info.get_text()

        # Accessories: every <li> except the last one, without the bullet
        # marker. Slicing (instead of pop()) also tolerates an empty list.
        items = anuncio.find('div', {'class': 'body-card'}).ul.find_all('li')[:-1]
        acessorios = [item.get_text().replace('► ', '') for item in items]
        card['items'] = acessorios

        # Append the result to the cards list
        cards.append(card)

        # Images: the local file keeps the last segment of the image URL as name
        image = anuncio.find('div', {'class': 'image-card'}).img
        image_url = image.get('src')
        urlretrieve(image_url, './output/img/' + image_url.split('/')[-1])


# Build a DataFrame with the results and persist it as CSV
dataset = pd.DataFrame(cards)
dataset.to_csv('./output/data/dataset.csv', sep=';', index = False, encoding = 'utf-8-sig')
dataset

In [None]:
cards

In [None]:
# Preview the URL of each of the 25 listing pages (pages are numbered from 1).
for page in range(1, 26):
    print(f'https://alura-site-scraping.herokuapp.com/index.php?page={page}')

In [None]:
_