In [0]:
import requests # Peticiones HTTP

## Requests intro
---

In [0]:
url = 'https://www.pagina12.com.ar/'
p12 = requests.get(url) # Petición por método GET

In [3]:
p12.status_code # 200 = Bien hecho

200

In [0]:
p12_html = p12.text # Texto HTML (string)

In [0]:
p12_content = p12.content # Texto HTML (bytes)

In [6]:
p12.headers # HTTP response headers

{'Date': 'Thu, 12 Mar 2020 02:32:08 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Set-Cookie': '__cfduid=de1fefc1da6a61b05a4affc1d622e33481583980328; expires=Sat, 11-Apr-20 02:32:08 GMT; path=/; domain=.pagina12.com.ar; HttpOnly; SameSite=Lax', 'Vary': 'Accept-Encoding', 'X-DNS-Prefetch-Control': 'off', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains', 'X-Download-Options': 'noopen', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Backend': 'prod_frontend_1', 'X-Backend-TTL': '180.000', 'X-Type': 'Dynamic URI', 'Age': '4', 'grace': '86400.000 none', 'ttl': '115.826', 'x-debug': '', 'X-Instance': 'cache-front-prod-varnish-76c5f88cf6-6j89c', 'x-restarts': '0', 'X-Cache': 'HIT (20)', 'CF-Cache-Status': 'DYNAMIC', 'Expect-CT': 'max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"', 'Server': 'cloudflare', 'CF-RAY': '572a1a5e183bf37d-ATL', 'Conte

In [7]:
p12.request.headers # HTTP request headers

{'User-Agent': 'python-requests/2.21.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}

In [8]:
p12.request.method # request method

'GET'

In [9]:
p12.request.url # request url

'https://www.pagina12.com.ar/'

## Intro BeautifulSoup
---

In [0]:
from bs4 import BeautifulSoup # HTML parser

In [0]:
s = BeautifulSoup(p12.text, 'lxml') # Parser

In [12]:
type(s)

bs4.BeautifulSoup

In [0]:
print(s.prettify()[:80]) # HTML estructurado

In [0]:
ul_hot_sections = s.find('ul', attrs={'class': 'hot-sections'}) # Devuelve el primer elemento que haga match

In [0]:
sections = ul_hot_sections.find_all('li') # Devuelve todos los elementos que hagan match (list)

In [16]:
for section in sections:
  print(section.get_text()) # Devuelve todos los textos hijos

El país
Economía
Sociedad
Cultura y Espectáculos
Ciencia
El mundo
Deportes
Contratapa


## Get info
---

In [17]:
sections[0].a.get('href') # Devuelve el valor de un atributo

'https://www.pagina12.com.ar/secciones/el-pais'

In [18]:
link_secciones = [section.a.get('href') for section in sections]
link_secciones

['https://www.pagina12.com.ar/secciones/el-pais',
 'https://www.pagina12.com.ar/secciones/economia',
 'https://www.pagina12.com.ar/secciones/sociedad',
 'https://www.pagina12.com.ar/suplementos/cultura-y-espectaculos',
 'https://www.pagina12.com.ar/secciones/ciencia',
 'https://www.pagina12.com.ar/secciones/el-mundo',
 'https://www.pagina12.com.ar/secciones/deportes',
 'https://www.pagina12.com.ar/secciones/contratapa']

In [0]:
s = requests.get(link_secciones[0])
s_section = BeautifulSoup(s.text, 'lxml')

In [0]:
featured_article = s_section.find('div', attrs={ 'class':'featured-article__container' })

In [21]:
featured_article.a.get('href')

'https://www.pagina12.com.ar/252422-legisladores-y-empleados-del-congreso-en-cuarentena'

In [0]:
article_list = s_section.find('ul', attrs={ 'class':'article-list' })

In [43]:
# Reto
def get_articles_links_from_section(sectionBS):
  links = []

  # Featured article
  links.append(sectionBS.find('div', attrs={ 'class':'featured-article__container'}).a.get('href'))

  # "Normal" articles
  for article in sectionBS.find('ul', attrs={ 'class':'article-list' }).find_all('li'):
    if article.get_text():
      links.append(article.find('h2').a.get('href'))

  return list(set(links))

lista_notas = get_articles_links_from_section(s_section)
lista_notas

['https://www.pagina12.com.ar/252310-la-justicia-investiga-un-multimillonaro-desvio-de-acero-de-l',
 'https://www.pagina12.com.ar/252265-desafios-de-la-educacion-superior',
 'https://www.pagina12.com.ar/252222-ramos-padilla-denuncia-que-en-comodoro-py-buscan-beneficiar-',
 'https://www.pagina12.com.ar/252421-el-gobierno-lanza-un-plan-de-pequenas-obras',
 'https://www.pagina12.com.ar/252226-alberto-fernandez-dijo-que-le-preocupan-situaciones-de-la-ju',
 'https://www.pagina12.com.ar/252235-la-provincia-de-buenos-aires-como-problema',
 'https://www.pagina12.com.ar/252251-alberto-fernandez-volvio-a-dar-clases-en-la-facultad-de-dere',
 'https://www.pagina12.com.ar/252422-legisladores-y-empleados-del-congreso-en-cuarentena',
 'https://www.pagina12.com.ar/252399-el-senado-sesionara-para-convertir-en-ley-las-limitaciones-a',
 'https://www.pagina12.com.ar/252408-dibujos-urgentes-retratos-de-genocidas-y-testigos-de-juicios',
 'https://www.pagina12.com.ar/252406-le-impusieron-una-multa-a-larreta-

## Error Handling
---

In [29]:
# status_code == 200

r = requests.get(url)

if r.status_code == 200:
  # Use the response
  print('Yeih!!')
else:
  # Show the error
  print('Oh no!')

Yeih!!


In [37]:
# Revisar si se obtuvo respuesta del servidor

url_mala = url.replace('2', '3') # https://www.pagina13.com.ar/

try:
  response = requests.get(url_mala)
  print(response)
except Exception as e:
  print('Error')
  print(e)

Error
HTTPSConnectionPool(host='www.pagina13.com.ar', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f60afbe7400>: Failed to establish a new connection: [Errno -2] Name or service not known',))


In [41]:
# Revisar si un tag existió

try:
  text = featured_article.somehtmltag.get_text()
  print(text)
except Exception as e:
  print('Error')
  print(e)

Error
'NoneType' object has no attribute 'get_text'


In [46]:
url_nota = lista_notas[0]
print(url_nota)

https://www.pagina12.com.ar/252310-la-justicia-investiga-un-multimillonaro-desvio-de-acero-de-l


In [56]:
try:
  nota = requests.get(url_nota)
  if nota.status_code == 200:
    s_nota = BeautifulSoup(nota.text, 'lxml')

    # Title
    title = s_nota.find('div', attrs={ 'class':'article-title' }).get_text()
    date = s_nota.find('span', attrs={ 'pubdate':'pubdate' }).get('datetime')

    print(title)
    print(date)
except Exception as e:
  print('Error')
  print(e)
  print('\n')

La acreditación universitaria como desafío y oportunidad 
2020-03-11
