# Alura Course - scraping

## import libs

In [1]:
import urllib.request as urllib_request
from pathlib import Path

import bs4
import pandas as pd
from IPython.display import HTML, display

## My first scraping 

In [2]:
url = r'https://alura-site-scraping.herokuapp.com/hello-world.php'

In [3]:
response = urllib_request.urlopen(url)

In [4]:
html = response.read()

In [5]:
soup = bs4.BeautifulSoup(html, 'html.parser')

In [6]:
soup.find('h1', id='hello-world')

<h1 id="hello-world">Hello World!!!</h1>

In [7]:
soup.find('h1', id='hello-world').get_text()

'Hello World!!!'

In [8]:
soup.find('p').get_text()

'Web Scraping é o termo utilizado para definir a prática de coletar automaticamente informações na Internet. Isto é feito, geralmente, por meio de programas que simulam a navegação humana na Web.'

## Request with headers

In [9]:
url = r'https://www.alura.com.br/'
# Send a browser-like user-agent header with the request.
headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36' }

req = urllib_request.Request(url, headers=headers)
# The context manager closes the HTTP connection as soon as the body has been read.
with urllib_request.urlopen(req) as response:
    html = response.read()
soup = bs4.BeautifulSoup(html, 'html.parser')
soup.find('h1', class_='home__titles__main-title').get_text()

'Mergulhe em Tecnologia!'

## parse bytes to string

In [10]:
url = 'https://alura-site-scraping.herokuapp.com/index.php'

# Bind the result to `response` (it was misspelled before) so the next line
# reads the body of THIS request instead of a stale response from an earlier cell.
response = urllib_request.urlopen(url)
html = response.read()

type(html)

bytes

In [11]:
html = html.decode('utf-8')

type(html)

str

## Getting data from "Alura motors" web site

In [12]:
url = r'https://alura-site-scraping.herokuapp.com/index.php'

In [13]:
html = urllib_request.urlopen(url).read().decode('utf-8')

In [14]:
soup = bs4.BeautifulSoup(html, 'html.parser')

In [15]:
ad = soup.find('div', class_='well card')

In [16]:
ad

<div class="well card">
<div class="col-md-3 image-card">
<img alt="Foto" height="155" src="https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg" width="220"/>
</div>
<div class="col-md-6 body-card">
<p class="txt-name inline">LAMBORGHINI AVENTADOR</p>
<p class="txt-category badge badge-secondary inline">USADO</p>
<p class="txt-motor">Motor 1.8 16v</p>
<p class="txt-description">Ano 1993 - 55.286 km</p>
<ul class="lst-items">
<li class="txt-items">► 4 X 4</li>
<li class="txt-items">► Câmera de estacionamento</li>
<li class="txt-items">► Controle de tração</li>
<li class="txt-items">► Sensor de estacionamento</li>
<li class="txt-items">...</li>
</ul>
<p class="txt-location">Belo Horizonte - MG</p>
</div>
<div class="col-md-3 value-card">
<div class="value">
<p class="txt-value">R$ 338.000</p>
</div>
</div>
</div>

In [17]:
info_div = ad.find('div', class_='body-card')

# Output key -> CSS class of the <p> element that carries its text.
field_classes = {
    'name': 'txt-name',
    'category': 'txt-category',
    'engine': 'txt-motor',
    'description': 'txt-description',
    'location': 'txt-location',
}

car_info = {
    key: info_div.find('p', class_=css_class).get_text()
    for key, css_class in field_classes.items()
}

# The last <li> is the "..." placeholder, so it is skipped; [2:] removes the
# leading "► " marker from each remaining entry.
item_tags = info_div.find('ul', class_='lst-items').find_all('li')
car_info['items'] = [ tag.get_text()[2:] for tag in item_tags[:-1] ]

car_info

{'name': 'LAMBORGHINI AVENTADOR',
 'category': 'USADO',
 'engine': 'Motor 1.8 16v',
 'description': 'Ano 1993 - 55.286 km',
 'location': 'Belo Horizonte - MG',
 'items': ['4 X 4',
  'Câmera de estacionamento',
  'Controle de tração',
  'Sensor de estacionamento']}

In [18]:
# Use get_text(), the spelling used everywhere else in this notebook, instead of the getText() alias.
car_value = ad.find('div', class_='value-card').find('p').get_text()
car_value

'R$ 338.000'

In [19]:
car_image = ad.find('div', class_='image-card').find('img').get('src')
print(car_image)

# Create the download folder if it is missing so the cell also works on a
# fresh checkout (urlretrieve does not create parent directories).
Path('./data').mkdir(parents=True, exist_ok=True)
urllib_request.urlretrieve(car_image, f'./data/{car_image.split("/")[-1]}')

# Render the ad's <img> tag inline; display/HTML are imported from
# IPython.display in the notebook's import cell.
display(HTML(str(ad.find('div', class_='image-card').find('img'))))

https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg


In [20]:
car_info['value'] = int(car_value[3:].replace('.', ''))

In [21]:
car_info['image'] = car_image

In [22]:
car_info

{'name': 'LAMBORGHINI AVENTADOR',
 'category': 'USADO',
 'engine': 'Motor 1.8 16v',
 'description': 'Ano 1993 - 55.286 km',
 'location': 'Belo Horizonte - MG',
 'items': ['4 X 4',
  'Câmera de estacionamento',
  'Controle de tração',
  'Sensor de estacionamento'],
 'value': 338000,
 'image': 'https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg'}

In [23]:
pd.DataFrame.from_dict(car_info, orient='index').T

Unnamed: 0,name,category,engine,description,location,items,value,image
0,LAMBORGHINI AVENTADOR,USADO,Motor 1.8 16v,Ano 1993 - 55.286 km,Belo Horizonte - MG,"[4 X 4, Câmera de estacionamento, Controle de ...",338000,https://caelum-online-public.s3.amazonaws.com/...


## Get all data from page

In [24]:
def car_ad_to_map(ad):
    """Convert one ad card element into a flat dictionary.

    Parameters
    ----------
    ad
        Element for a single ``<div class="well card">`` exposing the
        BeautifulSoup ``find`` / ``find_all`` / ``get_text`` / ``get`` API.

    Returns
    -------
    dict
        Keys ``name``, ``category``, ``engine``, ``description`` and
        ``location`` (text of the matching ``<p>`` elements), ``items``
        (list of feature strings), ``value`` (price as ``int``) and
        ``image`` (``src`` attribute of the card's ``<img>``).

    Raises
    ------
    AttributeError
        If an expected child element is missing (``find`` returned ``None``).
    ValueError
        If the price text cannot be parsed as an integer.
    """
    image_div = ad.find('div', class_='image-card')
    info_div = ad.find('div', class_='body-card')
    value_div = ad.find('div', class_='value-card')

    items = []
    for item_tag in info_div.find('ul', class_='lst-items').find_all('li'):
        text = item_tag.get_text()
        # "..." is the "more items" placeholder, not a feature. Filtering by
        # content (instead of dropping the last <li> blindly) keeps a real
        # last item when the placeholder is absent.
        if text.strip() == '...':
            continue
        # Remove the bullet marker only when it is actually there, so an
        # unprefixed item does not lose its first two characters.
        if text.startswith('► '):
            text = text[2:]
        items.append(text)

    # Prices look like "R$ 338.000": drop the currency symbol and the
    # thousands separators before converting.
    value_text = value_div.find('p').get_text()
    value = int(value_text.replace('R$', '').replace('.', '').strip())

    return {
        'name': info_div.find('p', class_='txt-name').get_text(),
        'category': info_div.find('p', class_='txt-category').get_text(),
        'engine': info_div.find('p', class_='txt-motor').get_text(),
        'description': info_div.find('p', class_='txt-description').get_text(),
        'location': info_div.find('p', class_='txt-location').get_text(),
        'items': items,
        'value': value,
        'image': image_div.find('img').get('src')
    }
    

In [25]:
car_ad_to_map(ad)

{'name': 'LAMBORGHINI AVENTADOR',
 'category': 'USADO',
 'engine': 'Motor 1.8 16v',
 'description': 'Ano 1993 - 55.286 km',
 'location': 'Belo Horizonte - MG',
 'items': ['4 X 4',
  'Câmera de estacionamento',
  'Controle de tração',
  'Sensor de estacionamento'],
 'value': 338000,
 'image': 'https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg'}

In [26]:
page_data = [ car_ad_to_map(ad) for ad in soup.find_all('div', class_='well card') ]

In [27]:
page_data[:2]

[{'name': 'LAMBORGHINI AVENTADOR',
  'category': 'USADO',
  'engine': 'Motor 1.8 16v',
  'description': 'Ano 1993 - 55.286 km',
  'location': 'Belo Horizonte - MG',
  'items': ['4 X 4',
   'Câmera de estacionamento',
   'Controle de tração',
   'Sensor de estacionamento'],
  'value': 338000,
  'image': 'https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/lamborghini-aventador/lamborghini-aventador-2932196__340.jpg'},
 {'name': 'BMW M2',
  'category': 'USADO',
  'engine': 'Motor 3.0 32v',
  'description': 'Ano 2018 - 83.447 km',
  'location': 'Belo Horizonte - MG',
  'items': ['Câmera de estacionamento',
   'Controle de estabilidade',
   'Travas elétricas',
   'Freios ABS'],
  'value': 346000,
  'image': 'https://caelum-online-public.s3.amazonaws.com/1381-scraping/01/img-cars/bmw-m2/bmw-m2-2970882__340.jpg'}]

## Getting all data from site

In [28]:
url = 'https://alura-site-scraping.herokuapp.com/index.php?page={page}'

In [29]:
car_data = []
page_number = 0

# Walk the paginated listing (page=1, 2, ...) until a page has no ad cards.
while True:
    page_number += 1
    # Close each HTTP response as soon as its body has been read.
    with urllib_request.urlopen(url.format(page=page_number)) as response:
        html = response.read().decode('utf-8')
    soup = bs4.BeautifulSoup(html, 'html.parser')

    ad_list = soup.find_all('div', class_='well card')

    if not ad_list:
        break

    # extend() appends in place instead of rebuilding the whole list on every page.
    car_data.extend(car_ad_to_map(ad) for ad in ad_list)

len(car_data)

246

In [30]:
pd.DataFrame.from_records(car_data)

Unnamed: 0,name,category,engine,description,location,items,value,image
0,LAMBORGHINI AVENTADOR,USADO,Motor 1.8 16v,Ano 1993 - 55.286 km,Belo Horizonte - MG,"[4 X 4, Câmera de estacionamento, Controle de ...",338000,https://caelum-online-public.s3.amazonaws.com/...
1,BMW M2,USADO,Motor 3.0 32v,Ano 2018 - 83.447 km,Belo Horizonte - MG,"[Câmera de estacionamento, Controle de estabil...",346000,https://caelum-online-public.s3.amazonaws.com/...
2,ALFA,USADO,Motor 1.8 16v,Ano 2004 - 19.722 km,Rio de Janeiro - RJ,"[Central multimídia, Bancos de couro, Rodas de...",480000,https://caelum-online-public.s3.amazonaws.com/...
3,PUECH,USADO,Motor Diesel V8,Ano 1992 - 34.335 km,São Paulo - SP,"[Bancos de couro, Freios ABS, Rodas de liga, C...",133000,https://caelum-online-public.s3.amazonaws.com/...
4,LAMBORGHINI MURCIELAGO,USADO,Motor 1.0 8v,Ano 1991 - 464 km,Belo Horizonte - MG,"[Central multimídia, Teto panorâmico, Sensor c...",175000,https://caelum-online-public.s3.amazonaws.com/...
...,...,...,...,...,...,...,...,...
241,SUV REAR TIRE,USADO,Motor 3.0 32v,Ano 1998 - 74.292 km,São Paulo - SP,"[Câmera de estacionamento, Rodas de liga, Sens...",489000,https://caelum-online-public.s3.amazonaws.com/...
242,ANTIQUE,NOVO,Motor 2.0 16v,Ano 2019 - 0 km,Belo Horizonte - MG,"[Bancos de couro, Freios ABS, Sensor de estaci...",427000,https://caelum-online-public.s3.amazonaws.com/...
243,SPORT,USADO,Motor 2.0 16v,Ano 2001 - 102.776 km,Belo Horizonte - MG,"[Sensor crepuscular, Sensor de chuva, Vidros e...",203000,https://caelum-online-public.s3.amazonaws.com/...
244,IMPERIAL,USADO,Motor 1.8 16v,Ano 2011 - 101.787 km,Belo Horizonte - MG,"[Painel digital, Travas elétricas, Sensor de c...",474000,https://caelum-online-public.s3.amazonaws.com/...


In [31]:
pd.DataFrame.from_records(car_data).to_csv('./data/cars_ad.csv', sep=';', index=False)