In [1]:
import requests # Peticiones HTTP

## Requests intro
---

In [2]:
url = 'https://www.pagina12.com.ar/'
p12 = requests.get(url) # Petición por método GET

In [3]:
p12.status_code # 200 = Bien hecho

200

In [4]:
p12_html = p12.text # Texto HTML (string)

In [5]:
p12_content = p12.content # Texto HTML (bytes)

In [6]:
p12.headers # HTTP response headers

{'Date': 'Thu, 12 Mar 2020 13:40:07 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Set-Cookie': '__cfduid=d2f22f3a992ff501c62a1ba5a42db4cd01584020407; expires=Sat, 11-Apr-20 13:40:07 GMT; path=/; domain=.pagina12.com.ar; HttpOnly; SameSite=Lax', 'Vary': 'Accept-Encoding', 'X-DNS-Prefetch-Control': 'off', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains', 'X-Download-Options': 'noopen', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Backend': 'prod_frontend_1', 'X-Backend-TTL': '180.000', 'X-Type': 'Dynamic URI', 'Age': '2', 'grace': '86400.000 none', 'ttl': '117.713', 'x-debug': '', 'X-Instance': 'cache-front-prod-varnish-76c5f88cf6-6j89c', 'x-restarts': '0', 'X-Cache': 'HIT (20)', 'CF-Cache-Status': 'DYNAMIC', 'Expect-CT': 'max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"', 'Server': 'cloudflare', 'CF-RAY': '572decd7d8d8ec8a-DFW', 'Conte

In [7]:
p12.request.headers # HTTP request headers

{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}

In [8]:
p12.request.method # request method

'GET'

In [9]:
p12.request.url # request url

'https://www.pagina12.com.ar/'

## Intro BeautifulSoup
---

In [10]:
from bs4 import BeautifulSoup # HTML parser

In [11]:
s = BeautifulSoup(p12.text, 'lxml') # Parser

In [12]:
type(s)

bs4.BeautifulSoup

In [13]:
print(s.prettify()[:80]) # HTML estructurado

<!DOCTYPE html>
<html class="no-js">
 <head>
  <meta charset="utf-8"/>
  <title>


In [14]:
ul_hot_sections = s.find('ul', attrs={'class': 'hot-sections'}) # Devuelve el primer elemento que haga match

In [15]:
sections = ul_hot_sections.find_all('li') # Devuelve todos los elementos que hagan match (list)

In [16]:
for section in sections:
  print(section.get_text()) # Devuelve todos los textos hijos

El país
Economía
Sociedad
Cultura y Espectáculos
El mundo
Deportes
Psicología
Contratapa


## Get info
---

In [17]:
sections[0].a.get('href') # Devuelve el valor de un atributo

'https://www.pagina12.com.ar/secciones/el-pais'

In [18]:
link_secciones = [section.a.get('href') for section in sections]
link_secciones

['https://www.pagina12.com.ar/secciones/el-pais',
 'https://www.pagina12.com.ar/secciones/economia',
 'https://www.pagina12.com.ar/secciones/sociedad',
 'https://www.pagina12.com.ar/suplementos/cultura-y-espectaculos',
 'https://www.pagina12.com.ar/secciones/el-mundo',
 'https://www.pagina12.com.ar/secciones/deportes',
 'https://www.pagina12.com.ar/secciones/psicologia',
 'https://www.pagina12.com.ar/secciones/contratapa']

In [19]:
s = requests.get(link_secciones[0])
s_section = BeautifulSoup(s.text, 'lxml')

In [20]:
featured_article = s_section.find('div', attrs={ 'class':'featured-article__container' })

In [21]:
featured_article.a.get('href')

'https://www.pagina12.com.ar/252484-los-metrodelegados-levantan-molinetes-en-el-subte'

In [22]:
article_list = s_section.find('ul', attrs={ 'class':'article-list' })

In [23]:
# Reto
def get_articles_links_from_section(sectionBS):
  links = []

  # Featured article
  links.append(sectionBS.find('div', attrs={ 'class':'featured-article__container'}).a.get('href'))

  # "Normal" articles
  for article in sectionBS.find('ul', attrs={ 'class':'article-list' }).find_all('li'):
    if article.get_text():
      links.append(article.find('h2').a.get('href'))

  return list(set(links))

lista_notas = get_articles_links_from_section(s_section)
lista_notas

['https://www.pagina12.com.ar/252480-alberto-fernandez-sobre-el-coronavirus-es-muy-importante-la-',
 'https://www.pagina12.com.ar/252265-desafios-de-la-educacion-superior',
 'https://www.pagina12.com.ar/252479-por-el-coronavirus-eduardo-duhalde-y-chiche-se-aislaron-prev',
 'https://www.pagina12.com.ar/252235-la-provincia-de-buenos-aires-como-problema',
 'https://www.pagina12.com.ar/252484-los-metrodelegados-levantan-molinetes-en-el-subte',
 'https://www.pagina12.com.ar/252421-el-gobierno-lanza-un-plan-de-pequenas-obras',
 'https://www.pagina12.com.ar/252281-el-aborto-divide-a-las-legisladoras-radicales-se-dobla-y-no-',
 'https://www.pagina12.com.ar/252310-la-justicia-investiga-un-multimillonaro-desvio-de-acero-de-l',
 'https://www.pagina12.com.ar/252222-ramos-padilla-denuncia-que-en-comodoro-py-buscan-beneficiar-',
 'https://www.pagina12.com.ar/252382-argentina-admitio-responsabilidad-y-pidio-disculpas',
 'https://www.pagina12.com.ar/252408-dibujos-urgentes-retratos-de-genocidas-y-test

## Error Handling
---

In [24]:
# status_code == 200

r = requests.get(url)

if r.status_code == 200:
  # Use the response
  print('Yeih!!')
else:
  # Show the error
  print('Oh no!')

Yeih!!


In [25]:
# Revisar si se obtuvo respuesta del servidor

url_mala = url.replace('2', '3') # https://www.pagina13.com.ar/

try:
  response = requests.get(url_mala)
  print(response)
except Exception as e:
  print('Error')
  print(e)

Error
HTTPSConnectionPool(host='www.pagina13.com.ar', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fcca51cbd50>: Failed to establish a new connection: [Errno -2] Name or service not known'))


In [26]:
# Revisar si un tag existió

try:
  text = featured_article.somehtmltag.get_text()
  print(text)
except Exception as e:
  print('Error')
  print(e)

Error
'NoneType' object has no attribute 'get_text'


## Get info
---

In [27]:
url_nota = lista_notas[0]
print(url_nota)

https://www.pagina12.com.ar/252480-alberto-fernandez-sobre-el-coronavirus-es-muy-importante-la-


In [28]:
try:
  nota = requests.get(url_nota)
  if nota.status_code == 200:
    s_nota = BeautifulSoup(nota.text, 'lxml')

    # Title
    title = s_nota.find('div', attrs={ 'class':'article-title' }).get_text()
    date = s_nota.find('span', attrs={ 'pubdate':'pubdate' }).get('datetime')

    print(title)
    print(date)
except Exception as e:
  print('Error')
  print(e)
  print('\n')

Los metrodelegados levantan molinetes en el subteReclamo por la paritaria de 2019
2020-03-12


## Media content
---

In [29]:
media = s_nota.find('div', attrs={ 'class':'article-main-media-image' })
images = media.find_all('img')

if images:
  image = images[-1]
  image_src = image.get('data-src')
  print(image_src)
else:
  print('No images found')

https://images.pagina12.com.ar/styles/focal_3_2_960x640/public/media/articles/56759/alberto-20fernandez.jpeg?itok=PFZHNiHg


In [30]:
from IPython.display import Image

In [31]:
img_req = requests.get(image_src)

if img_req.status_code == 200:
  Image(img_req.content)
else:
  print('Error')

## Final scraper
---

In [32]:
def obtener_info(s_nota):
    
    # Creamos un diccionario vacío para poblarlo con la información
    ret_dict = {}
    
    # Extraemos la fecha
    fecha = s_nota.find('span', attrs={'pubdate':'pubdate'})
    if fecha:
        ret_dict['fecha'] = fecha.get('datetime')
    else:
        ret_dict['fecha'] = None
    
    # Extraemos el título
    titulo = s_nota.find('div', attrs={'class':'article-title'})
    if titulo:
        ret_dict['titulo'] = titulo.text
    else:
        ret_dict['titulo'] = None

    # Extraemos la volanta
    volanta = s_nota.find('div', attrs={'class':'article-prefix'})
    if volanta:
        ret_dict['volanta'] = volanta.get_text()
    else:
        ret_dict['volanta'] = None
    
    # Extraemos el copete
    copete = s_nota.find('div', attrs={'class':'article-summary'})
    if copete:
        ret_dict['copete'] = copete.get_text()
    else:
        ret_dict['copete'] = None
    
    autor = s_nota.find('div', attrs={'class':'article-author'})
    if autor:
        ret_dict['autor'] = autor.a.get_text()
    else:
        ret_dict['autor'] = None
    
    # Extraemos la imagen
    media = s_nota.find('div', attrs={'class':'article-main-media-image'})
    if media:
        imagenes = media.find_all('img')
        if len(imagenes) == 0:
            print('no se encontraron imágenes')
        else:
            imagen = imagenes[-1]
            img_src = imagen.get('data-src')
            try:
                img_req = requests.get(img_src)
                if img_req.status_code == 200:
                    ret_dict['imagen'] = img_req.content
                else:
                    ret_dict['imagen'] = None
            except:
                print('No se pudo obtener la imagen')
    else:
        print('No se encontró media')
    # Extraemos el cuerpo de la nota
    cuerpo = s_nota.find('div', attrs={'class':'article-text'})
    if cuerpo:
        ret_dict['texto'] = cuerpo.get_text()
    else:
        ret_dict['texto'] = None
    
    return ret_dict

In [40]:
obtener_info(s_nota) ['titulo']

'Los metrodelegados levantan molinetes en el subteReclamo por la paritaria de 2019'

In [34]:
def scrape_nota(url):
  try:
    nota = requests.get(url)
  except Exception as e:
    print(f'Error scrapeando {url}')
    print(e)
    print('\n')
    return None

  if nota.status_code != 200:
    print(f'Error scrapeando {url}')
    print(f'Status code = {nota.status_code}')
    print('\n')
    return None
  
  s_nota = BeautifulSoup(nota.text, 'lxml')

  ret_dict = obtener_info(s_nota)
  ret_dict['url'] = url

  return ret_dict

In [39]:
scrape_nota(url_nota)['titulo']

'Los metrodelegados levantan molinetes en el subteReclamo por la paritaria de 2019'