In [122]:
# <section>[0]
# This identifies something by STRUCTURE. That is, it presumes the target to be <section>[0], i.e. THE FIRST section, according to
# a structure usually seen in the HTML tree. Finding that structure at position [0] is taken as a sort of sanity check.
# Nevertheless, if this structure is seen AT POSITION [1] *AND* <section>[0] has only one article, that is taken as a sign
# that MAYBE a major headline has taken over position [0]. If this structure is seen at position [1], *BUT* <section>[0] has more than
# [NOTE: this sentence was left unfinished in the original notes -- presumably "one article"; the handling of that case is not documented].
# This structure seen at any position other than [0] or [1] is taken as a probable scraping error and reported as a warning.
import requests
import bs4 as bs
import lxml
httpaddr = "https://www.clarin.com"
# Seconds to wait for the server before giving up. requests applies no timeout
# unless one is passed, so without it a stalled connection would hang the cell forever.
REQUEST_TIMEOUT_S = 30

requests_ret = requests.get(httpaddr, timeout=REQUEST_TIMEOUT_S)
print(requests_ret, end = ' || ')
soup = bs.BeautifulSoup(requests_ret.text, 'lxml')

# Notes:
# Everything that is semantically a header is tagged with <h2>. For all practical
# scraping purposes, every heading level other than <h2> is deemed irrelevant.
articles = soup.find_all("article")
sections = soup.find_all("section")
headers = soup.find_all("header")
h1s = soup.find_all("h1")
h2s = soup.find_all("h2")
h3s = soup.find_all("h3")
h4s = soup.find_all("h4")
h5s = soup.find_all("h5")
h6s = soup.find_all("h6")

# One-line census of the page structure, in the format
# "articles : N / sections : N / header : N / h1s : N / ... / h6s : N".
tag_census = (
    ("articles", articles), ("sections", sections), ("header", headers),
    ("h1s", h1s), ("h2s", h2s), ("h3s", h3s),
    ("h4s", h4s), ("h5s", h5s), ("h6s", h6s),
)
print(' / '.join("{} : {}".format(label, len(found)) for label, found in tag_census))

# only <h2>s hold relevance
# the only <h1> holds an SVG graphic

# Drop <script> and <style> subtrees from the tree; this happens after the census above,
# so the counts describe the page as fetched.
for discard_tag in ("script", "style"):
    for t in soup.find_all(discard_tag):
        t.decompose()


<Response [200]> || articles : 108 / sections : 19 / header : 16 / h1s : 1 / h2s : 141 / h3s : 0 / h4s : 0 / h5s : 0 / h6s : 0


In [123]:
# Rough Extraction -- for all articles -- all sections:
#
# Builds `articles_scraps`, a list with one dict per <article> in `articles` (same order).
# Keys written per article (most are optional and only present when found):
#   'article-number'     : index of the article in `articles`
#   'h2'                 : stripped text of the single <h2>, or a {'0': ..., '1': ...} dict
#                          when there are zero (-> {}) or several <h2>s
#   <class name>         : for each name in `classes_to_scrap`, stripped text of the single
#                          matching element, or a {'0': ..., '1': ...} dict when several match
#   'ul-cluster'         : indices of all articles under this article's grandparent node
#   'belongs-to-section' : indices (into `sections`) of every <section> containing the article
#   'last-moment'        : True when some ancestor carries the CSS class 'last-moment'
#   'last-moment-time'   : text of an attribute-less <span> that is a direct child of the article
#   'a-href'             : href of every <a> inside the article
#   'inferred-classes'   : category guesses ('from-slug', 'from-header-tag-section-<k>')
import pprint

classes_to_scrap = ('volanta', 'author', 'author-rel', 'summary', 'section', 'section-rel', 'href')


def _texts_by_position(tags):
    """Return {'0': text, '1': text, ...} with the stripped text of each tag, keyed by position."""
    return {str(position): tag.text.strip() for position, tag in enumerate(tags)}


# Identity-based lookups. Beautiful Soup's `==` treats two tags as equal when they represent
# the same markup, so `list.index()` / `in` can resolve to a *different* but identical-looking
# node (and are linear scans on top of that). Keying on id() pins the exact node.
article_index_by_id = {id(tag): index for index, tag in enumerate(articles)}
section_index_by_id = {id(tag): index for index, tag in enumerate(sections)}

articles_scraps = []

for i, article in enumerate(articles):
    article_scrap_dict = {}

    # number article:
    article_scrap_dict['article-number'] = i

    # Get h2, contemplating absence or multiple
    article_h2s = article.find_all('h2')
    if len(article_h2s) == 1:
        article_scrap_dict['h2'] = article_h2s[0].text.strip()
    else:
        # TODO: log it as anomalous WARNING (no <h2> at all, or more than one)
        # An article without any <h2> ends up with an empty dict here.
        # FIXME (from the original author): this should be a list!
        article_scrap_dict['h2'] = _texts_by_position(article_h2s)

    for c in classes_to_scrap:
        article_ents = article.find_all(class_ = c)
        if not article_ents:
            continue   # if it doesn't exist, go next class
        if len(article_ents) == 1:
            article_scrap_dict[c] = article_ents[0].text.strip()
        else:
            # TODO: log it as anomalous WARNING
            # The container must be created before it is filled; the previous bare
            # `article_scrap_dict[c]` lookup raised KeyError on the first multi-match.
            article_scrap_dict[c] = _texts_by_position(article_ents)

    # Cluster: every article hanging from this article's grandparent node.
    cluster_root = article.parent.parent if article.parent is not None else None
    if cluster_root is None:
        article_scrap_dict['ul-cluster'] = []
    else:
        article_scrap_dict['ul-cluster'] = [
            article_index_by_id[id(a)]
            for a in cluster_root.find_all('article')
            if id(a) in article_index_by_id
        ]

    # Sections containing this article: walk up the ancestors instead of scanning
    # every section's descendants. Sorted to keep ascending section order.
    article_sections = sorted(
        section_index_by_id[id(p)] for p in article.parents if id(p) in section_index_by_id
    )
    article_scrap_dict['belongs-to-section'] = article_sections

    # check if article is 'last-moment' news
    # check by higher node classes
    classed_ancestors = [p for p in article.parents if 'class' in getattr(p, 'attrs', {})]
    if any('last-moment' in p.attrs['class'] for p in classed_ancestors):
        article_scrap_dict['last-moment'] = True
    # The time span does not depend on which ancestor is being looked at, so it is read
    # once instead of once per ancestor. It is still recorded whenever ANY ancestor has a
    # class attribute, exactly as before.
    # NOTE(review): the key name suggests this was meant to apply only to 'last-moment'
    # articles -- confirm before tightening the condition.
    if classed_ancestors:
        for ch in article.children:
            if ch.name == 'span' and not ch.attrs:
                article_scrap_dict['last-moment-time'] = ch.text.strip()

    # harvest anchors
    article_anchors = article.find_all('a')

    if article_anchors:
        if len(article_anchors) > 1: pass # TODO: Log warning
        # Anchors without an href attribute are skipped rather than raising KeyError.
        article_scrap_dict['a-href'] = [str(a.attrs['href']) for a in article_anchors if 'href' in a.attrs]
    else:
        pass # TODO: log it as anomalous WARNING

    # infer categories:
    article_scrap_dict['inferred-classes'] = {}
    # from harvested anchors: the first path segment of the only link, e.g. '/politica/...' -> 'politica'
    if len(article_anchors) == 1 and 'href' in article_anchors[0].attrs:
        slug_parts = str(article_anchors[0].attrs['href']).split('/')
        if len(slug_parts) > 1:   # an href without any '/' has no segment to take
            article_scrap_dict['inferred-classes']['from-slug'] = slug_parts[1]
    # from nearest previous sibling <header> from containing sections -- even if more than one
    for sec in article_sections:
        nearest_header = sections[sec].find_previous_sibling('header')
        if nearest_header is not None:     # get category from nearest preceding <header> text content
            article_scrap_dict['inferred-classes']['from-header-tag-section-' + str(sec)] = nearest_header.text.strip()

    articles_scraps.append(article_scrap_dict)

for i, scrap in enumerate(articles_scraps):
    print(i)
    pprint.pprint(scrap)
    print('-' * 80)


0
{'a-href': ['/politica/juicio-cristina-kirchner-vivo-fiscal-aseguro-nestor-cristina-instalaron-matrices-extraordinarias-corrupcion-_0_89kgnjl6fi.html'],
 'article-number': 0,
 'author': 'Lucía Salinas',
 'belongs-to-section': [0],
 'h2': 'Alegato del fiscal: “Néstor y Cristina instalaron una de las matrices '
       'más extraordinarias de corrupción”',
 'inferred-classes': {'from-slug': 'politica'},
 'summary': 'La ex presidenta está conectada desde su despacho en el Senado a '
            'la primera jornada de los alegatos. Está acusada de corrupción '
            'por los contratos que favorecieron a Lázaro Báez.',
 'ul-cluster': [0, 1, 2, 3],
 'volanta': 'Juicio por la obra pública'}
--------------------------------------------------------------------------------
1
{'a-href': ['/politica/juicio-cristina-kirchner-declaracion-alberto-fernandez-fiscal-uso-alegato-obra-publica_0_XhBsSunhlx.html'],
 'article-number': 1,
 'belongs-to-section': [0],
 'h2': 'La declaración de Alberto Fe

In [124]:
# Dump the raw HTML of the article at index 11 to inspect its markup by eye.
# In the captured run it consists of an <h2> plus a bare, attribute-less <span> holding a time.
print(articles[11])

<article class="flex-change">
<h2>El dramático relato de Ian Escobar, el defensor que se desvaneció en Lanús-Aldosivi: "Me desperté cuando me hacían la tomografía"</h2>
<span>12:16</span>
</article>
