In [114]:
import requests, bs4 as bs, lxml, copy
from pprint import pprint

# Download the Infobae front page and index the structural tags that the
# rest of the notebook works from.
httpaddr = "https://www.infobae.com"
# requests applies no timeout by default; without one an unresponsive server
# would block this cell forever. A requests Timeout exception is raised instead.
requests_ret = requests.get(httpaddr, timeout = 30)
print(requests_ret, end = ' || ')
soup = bs.BeautifulSoup(requests_ret.text, 'lxml')
# Detach <script>/<style> elements from the tree before any tag lookup is done.
for discard_tag in ("script", "style"):
    for t in soup.find_all(discard_tag):
        t.extract()
tags_tuple = ('article', 'section', 'header') + tuple(f'h{i}' for i in range(1, 7))
# tag name -> every element of that name found in the page
tags = {tag_str : soup.find_all(tag_str) for tag_str in tags_tuple}
# One-line tally, e.g. "articles : 0 / sections : 0 / ... / h6s : 0"
print(' / '.join(f"{tag}s : {len(found)}" for tag, found in tags.items()))

# Template record for a single article: every field starts out as None.
# dict.fromkeys keeps the keys in the order they are listed here.
datamodel_dict = dict.fromkeys((
    'UKEY',
    'JOB',
    'TITLE',
    'TITLE_WORD_COUNT',
    'ARTICLE',              # New field - article number among articles
    'CLUSTER',              # New field - cluster number to which this article belongs
    'CLUSTER_INDEX',        # New field - article number inside its cluster
    'CLUSTER_SIZE',         # New field - size of the cluster to which this article belongs
    'CLUSTER_UNIQUE',       # New field - does this article belong to a single cluster?
    'AUTHOR',
    'SUMMARY',
    'VOLANTA',
    'CATEGORY',
    'SUBCATEGORY',          # New field
    'SLUG',
    'SLUG_INTERNAL',
    'Origen',
    'FechaFiltro',
    'FechaCreacion',
    'FechaModificacion',
))

<Response [200]> || articles : 0 / sections : 0 / headers : 0 / h1s : 0 / h2s : 103 / h3s : 15 / h4s : 0 / h5s : 0 / h6s : 0


In [115]:
import datetime
# Partition the page's <h2> tags, pivoting on whether they sit inside a link.
anchored_articles = {}      # <h2> tag -> its own copy of the datamodel template
non_anchored_articles = {}  # <h2> tag -> itself, for <h2>s with no <a> ancestor

for i, h2 in enumerate(tags['h2']):
    inside_anchor = any(ancestor.name.lower() == 'a' for ancestor in h2.parents)
    if inside_anchor:
        # Shallow copy, so each article gets a record of its own to fill in.
        anchored_articles[h2] = copy.copy(datamodel_dict)
    else:
        non_anchored_articles[h2] = h2
        print(f'Non-anchored <h2> found: [{i}]')

def _sanitize_author(raw_author):
    """Ad hoc clean-up of a byline candidate.

    Args:
        raw_author: text of the candidate element, or None when there is none.

    Returns:
        The cleaned author string, or None when the candidate is missing,
        is not a string, or ends up all uppercase (this includes the empty
        string), which is treated as "not an author".
    """
    if not raw_author or not isinstance(raw_author, str):
        return None
    # Keep only what precedes the first ',' (a clarification) and then the
    # first '-' (an accompanying VIDEO/AUDIO marker).
    candidate = raw_author.split(',', 1)[0].split('-', 1)[0].strip()
    # A leading 'Por' denotes authorship; drop it and assume the rest is the author.
    if candidate.startswith('Por '):
        candidate = candidate[3:].strip()
    # All-uppercase text is almost certainly not an author.
    if candidate.upper() == candidate:
        return None
    return candidate


def _classify_slug(slug):
    """Derive (SLUG_INTERNAL, CATEGORY, SUBCATEGORY) from an href value.

    Returns:
        (None, None, None)    when there is no slug or its form is not recognised.
        (False, False, False) for absolute http(s) URLs and protocol-relative
                              ('//host/...') URLs.
        (True, category, subcategory) for site-relative paths: category is the
                              first path segment; subcategory is the second
                              segment only when that segment exists and starts
                              with an ASCII letter, otherwise None.
    """
    if not slug:
        return None, None, None
    # The whole scheme prefix has to be compared; a fixed 4-character slice
    # can never equal 'http://' or 'https://'.
    if slug.lower().startswith(('http://', 'https://')) or slug.startswith('//'):
        return False, False, False
    if slug[0] != '/':
        return None, None, None
    segments = slug.split('/')     # '/a/b' -> ['', 'a', 'b']; segments[1] always exists here
    subcategory = None
    # Short paths such as '/' or '/category/' have no usable second segment.
    if len(segments) > 2 and segments[2]:
        first_char = segments[2][0]
        if 'a' <= first_char <= 'z' or 'A' <= first_char <= 'Z':
            subcategory = segments[2]
    return True, segments[1], subcategory


# Fill in the datamodel record of every anchored <h2>.
for h2, record in anchored_articles.items():
    # Routine data
    record['UKEY'] = 1234567890
    record['JOB'] = 'Infobae'
    record['VOLANTA'] = "N/A -- INFOBAE HAS NO VOLANTAS"
    record['Origen'] = httpaddr
    record['FechaFiltro'] = datetime.datetime.now()
    record['FechaCreacion'] = datetime.datetime.now()
    record['FechaModificacion'] = datetime.datetime.now()

    # ARTICLE : order of this <h2> tag within all <h2> tags
    ARTICLE = str(tags['h2'].index(h2))
    record['ARTICLE'] = ARTICLE

    # TITLE extraction
    TITLE = h2.get_text().strip()
    record['TITLE'] = TITLE

    # Walk upwards to the nearest enclosing <a>; every key of anchored_articles
    # was admitted because it has one.
    anchor_cursor = h2
    while anchor_cursor.name.lower() != 'a':
        anchor_cursor = anchor_cursor.parent

    # AUTHOR extraction: first element with class 'overlay_ctn' inside the anchor
    author_tag = anchor_cursor.find(class_ = 'overlay_ctn')
    AUTHOR = _sanitize_author(author_tag.get_text() if author_tag is not None else None)
    record['AUTHOR'] = AUTHOR

    # SUMMARY extraction: first element with class 'cst_deck' inside the anchor
    summary_tag = anchor_cursor.find(class_ = 'cst_deck')
    SUMMARY = summary_tag.get_text() if summary_tag is not None else None
    record['SUMMARY'] = SUMMARY

    # SLUG extraction: read the href from the nearest <a> ancestor rather than
    # from the direct parent only, so an <h2> nested deeper still gets its link.
    SLUG = anchor_cursor.get('href')
    record['SLUG'] = SLUG

    # CATEGORY & SUBCATEGORY extraction
    record['SLUG_INTERNAL'], record['CATEGORY'], record['SUBCATEGORY'] = _classify_slug(SLUG)

    # split() with no argument collapses whitespace runs, so repeated spaces or
    # newlines do not inflate the count and an empty title counts as 0 words.
    record['TITLE_WORD_COUNT'] = len(TITLE.split())

# Summary of processed data
print('Summary Report')
print('\t<h2>s :', len(tags['h2']))
print('\tvalidated <h2>s :', len(anchored_articles))
print('=' * 80)

# Articles for which an author was extracted, labelled by their position among all <h2>s
for h2, record in anchored_articles.items():
    if record['AUTHOR']:
        print(f"<h2>[{tags['h2'].index(h2)}] :", record['AUTHOR'])
print('=' * 100)

# Full dump: every populated field of each record, then the <h2>'s siblings
for i, (h2, record) in enumerate(anchored_articles.items()):
    label = f"{i:3d}"
    print(label)
    print('-' * len(label))
    for field, value in record.items():
        if value is not None:   # False is a real value here and is still printed
            print(f'\t{field}:: {value}')
    print(f'{" " * 40}{"-" * 40}')

    for heading, siblings in (('PREV SIBLINGS:', h2.previous_siblings),
                              ('NEXT SIBLINGS:', h2.next_siblings)):
        print(heading)
        for sibling in siblings:
            # Siblings can be text nodes as well as tags; text nodes may not
            # provide .attrs (or .name / .get_text), so fall back instead of raising.
            sibling_text = sibling.get_text() if hasattr(sibling, 'get_text') else str(sibling)
            print('\t', getattr(sibling, 'name', None), getattr(sibling, 'attrs', None), sibling_text)

    print('=' * 80)

Non-anchored <h2> found: [62]
Non-anchored <h2> found: [63]
Non-anchored <h2> found: [64]
Non-anchored <h2> found: [100]
Non-anchored <h2> found: [101]
Non-anchored <h2> found: [102]
Summary Report
	<h2>s : 103
	validated <h2>s : 97
<h2>[6] : Ximena Casas
<h2>[7] : Nunzia Locatelli y Cintia Suárez
<h2>[8] : Silvana Boschi
<h2>[13] : Federico Cristofanelli
<h2>[17] : María Laura Balonga
<h2>[18] : Gisele Sousa Días
<h2>[19] : Pablo Wende
<h2>[20] : Diego Zorrero
<h2>[21] : The Economist
<h2>[22] : Yalilé Loaiza
<h2>[23] : Federico Fahsbender
<h2>[26] : Soledad Blardone
<h2>[27] : Román Lejtman
<h2>[28] : Mariano Boettner
<h2>[29] : Patricia Blanco y Martín Angulo
<h2>[30] : Alberto Amato
<h2>[31] : Adrián Pignatelli
<h2>[32] : Facundo Chaves
<h2>[34] : Fernanda Jara
<h2>[35] : Joaquín Mugica Díaz
<h2>[37] : Juan Bautista Tata Yofre
<h2>[38] : Laura Rocha
<h2>[42] : Fernando Meaños
<h2>[45] : David Cayón
<h2>[46] : Gabriela Cicero
<h2>[47] : Maximiliano Fernández
<h2>[48] : Luciano Luter