In [1]:
import requests, bs4 as bs, lxml, sys
sys.path.append('..'); import utils.BSutils

httpaddr = "https://www.lanacion.com.ar/"
FILEDUMP = 'dump/Lanacion_dump.txt'

requests_ret = requests.get(httpaddr)

print(requests_ret, end = ' || ')
soup = bs.BeautifulSoup(requests_ret.text, 'lxml')
for discard_tag in ("script", "style"): # discard <script> and <style> tags
    for t in soup.find_all(discard_tag): t.extract()
tags_tuple = ('article', 'section', 'header', 'footer', 'aside') + tuple(('h' + str(i) for i in range(1, 7)))
tags = {tag_str : soup.find_all(tag_str) for tag_str in tags_tuple}
for i, tag in enumerate(tags):
    print(f"{tag}s : {len(tags[tag])}", sep = '', end = '\n' if i == len(tags) - 1 else ' / ')

<Response [200]> || articles : 106 / sections : 274 / headers : 2 / footers : 1 / asides : 5 / h1s : 1 / h2s : 106 / h3s : 30 / h4s : 1 / h5s : 0 / h6s : 0


In [2]:
# These routines outputs a report on a given tag plus all their descendants accompanied with their respective attributes
import sys
def fmt_attrs(attrs_dict, width = 80, excl_attrs = []):
    key_width = max((len(k) for k in attrs_dict)) if attrs_dict else 0
    head_width = len("{'") + key_width + len("' : '")
    chunk_len = width - head_width
    strs_list = []

    for i, key in enumerate(attrs_dict):
        if key in excl_attrs:
            continue
        head_str = ("{'" if i == 0 else " '") + ("%-" + str(key_width) + "s") % (key, ) + "' : '"
        val_chunks = tuple((str(attrs_dict[key])[pos : pos + chunk_len] for pos in range(0, len(str(attrs_dict[key])), chunk_len)))
        
        for (j, chunk) in enumerate(val_chunks):
            strs_list.append(
                head_str + chunk if j == 0 else
                ' ' * head_width + chunk
            )
        if strs_list:
            strs_list.append(strs_list.pop() + "'")    

    if strs_list: 
        strs_list.append(strs_list.pop() + "'")

    return strs_list

def fmt_str(text, width):
    if hasattr(text, '__str__'):
        return (text[i : i + width] for i in range(0, len(text), width))
    else:
        return None

def tree_dump(tag, pfx, gettext_tags = [], excl_tags = [], excl_attrs = [], file = sys.stdout):
    stack = []
    stack.extend(reversed([ch for ch in tag.children if isinstance(ch, bs.element.Tag)]))

    attrs_report = fmt_attrs(tag.attrs, 135, excl_attrs) if fmt_attrs(tag.attrs, 135, excl_attrs) != [] else ['{}']
    for i, chunk in enumerate(attrs_report):
        print(pfx + (tag.name if i == 0 else ' ' * len(tag.name)), chunk, file = file)
    if tag.name in gettext_tags and (text := tag.get_text().strip()):
            text_lines = fmt_str(text, 100)
            print(pfx + len(tag.name) * ':' + '>-------', file = file)
            for text_line in text_lines:
                print(pfx + len(tag.name) * ':' + ' ' + text_line, file = file)
            print(pfx + len(tag.name) * ':' + '>-------', file = file)
    
    while stack:
        top = stack.pop()
    
        if top.name in excl_tags:
            continue
        else:
            tree_dump(top, pfx + '  |', gettext_tags = gettext_tags, excl_tags = excl_tags, excl_attrs = excl_attrs, file = file)

In [3]:
# Clusterize articles:
clusters_by_article = {}
clusters_list = []
data_pos_by_article = {}
for i, article in enumerate(soup.find_all('article')):
    # Tuck away cluster associated with each <article>, accessible by <article>
    clusters_by_article[article] = article.parent
    while len(clusters_by_article[article].find_all('article')) == 1:
        clusters_by_article[article] = clusters_by_article[article].parent  # save cluster, accessible by article
    # Tuck away clusters as they appear
    if clusters_by_article[article] not in clusters_list:
        clusters_list.append(clusters_by_article[article])  # save clusters, as they appear in order
    # Compute data-pos for each <article>
    data_pos_by_article[article] = article.attrs.get('data-pos')
# Report <article>'s per cluster
print('Number of clusters:', len(clusters_list))

idx_fld_width = len(str(len(clusters_list)))
val_fld_width = max(len(str(len(cluster.find_all('article')))) for cluster in clusters_list)
art_fld_width = len(tuple(soup.find_all('article')))

for i, cluster in enumerate(clusters_list):
    print(('Cluster[%' + str(idx_fld_width) + 'd] : %' + str(val_fld_width) + 'd') % (i, len(cluster.find_all('article'))), \
        '(%4s' % cluster.name,
        end = ')\n' if (i + 1) % 4 == 0 else ') | ')
print('\n', '=' * 80)

# Check for cluster nesting:
print("Check for cluster nesting\n", len("Check for cluster nesting") * '-', sep = '')
for i, cluster_i in enumerate(clusters_list):
    for j, cluster_j in enumerate(clusters_list):
        if cluster_j is cluster_i:
            continue
        if cluster_j in cluster_i.find_all(True):
            print(f'Cluster [{j}] is contained by cluster[{i}]')
    print()




Number of clusters: 31
Cluster[ 0] :  10 ( div) | Cluster[ 1] :   5 (  ul) | Cluster[ 2] :   4 ( div) | Cluster[ 3] :  40 ( div)
Cluster[ 4] :   4 ( div) | Cluster[ 5] :   2 ( div) | Cluster[ 6] :   5 ( div) | Cluster[ 7] :   8 ( div)
Cluster[ 8] :   3 ( div) | Cluster[ 9] :   5 ( div) | Cluster[10] :   3 ( div) | Cluster[11] :   5 ( div)
Cluster[12] :   4 ( div) | Cluster[13] :   6 ( div) | Cluster[14] :   2 ( div) | Cluster[15] :   2 ( div)
Cluster[16] :   3 ( div) | Cluster[17] :   7 ( div) | Cluster[18] : 106 (main) | Cluster[19] :   2 ( div)
Cluster[20] :   3 ( div) | Cluster[21] :   3 ( div) | Cluster[22] :   3 ( div) | Cluster[23] :   4 ( div)
Cluster[24] :   4 ( div) | Cluster[25] :   4 ( div) | Cluster[26] :   4 ( div) | Cluster[27] :   3 ( div)
Cluster[28] :   2 ( div) | Cluster[29] :   2 ( div) | Cluster[30] :   4 ( div) | 
Check for cluster nesting
-------------------------
Cluster [1] is contained by cluster[0]



Cluster [4] is contained by cluster[3]
Cluster [5] is conta

In [4]:
html_dump = False; report_dump = True; exploration_dump = False

hs_to_report_on_list = ['h' + str(i) for i in range(1, 7)]
tags_to_report_on_list = hs_to_report_on_list + ['section', 'article']
tags_to_report_on_list_max_wdth = max(len(t) for t in tags_to_report_on_list)

final_report = {}

with open(FILEDUMP, 'w') as f:
    # General Report:
    print("GENERAL REPORT:\n" + '=' * len("GENERAL REPORT:") + '\n', file = f)
    
    print("\tQUANTITIES BY TAG:\n" + '\t' + '-' * len("QUANTITIES BY TAG:"), file = f)
    print("\t", end = '', file = f)
    for tag_to_report in tags_to_report_on_list:
        print(('%-' + str(tags_to_report_on_list_max_wdth) + 's') % tag_to_report + ' = ', len(tuple(soup.find_all(tag_to_report))), sep = '', end = ' | ', file = f)
    print(file = f)
    print('\n' + ':' * 160, file = f)
    
    excl_attrs = ['srcset', 'data-id', 'data-notaid', 'data-source', 'role', 'alt', 'width', 'height', 'loading', 'fetchpriority', 'decoding', 'src']
    gettext_tags = ['h' + str(i) for i in range(1, 7)] + ['a'] + ['span'] + ['strong'] + ['time'] + ['section'] + ['p']
    
    article_reports = {}
    for i, article in enumerate(tags['article']):
        article_report = {}
        article_report['UKEY'] = '### PLACEHOLDER'
        article_report['JOB'] = 'LanacionJob'
        
        article_report['ARTICLE'] = i
        article_report['TITLE'] = article.find(class_ = 'com-title').get_text()
        article_report['TITLE_WORD_COUNT'] = len(tuple(article_report['TITLE'].split(' ')))

        article_report['LANACION_data_pos'] = data_pos_by_article[article]
        article_report['LANACION_data_pos_cluster'] = article_report['LANACION_data_pos'][:-2]
        article_report['LANACION_data_pos_cluster_member'] = article_report['LANACION_data_pos'][-2:]
        article_report['LANACION_hrefs_list'] = list(hrefs_set := set(href_tag.get('href') for href_tag in article.find_all(href = True)))
        
        # article_report['CLUSTER'] = clusters_list.index(clusters_by_article[article])
        # article_report['CLUSTER_INDEX'] = tuple(clusters_by_article[article].find_all('article')).index(article)
        # article_report['CLUSTER_SIZE'] = len(tuple(clusters_by_article[article].find_all('article')))
        # article_report['CLUSTER_UNIQUE'] = tuple((article in cluster.find_all(True)) for cluster in clusters_list).count(True)
        
        author_text = author_tags[0].get_text() if (author_tags := list(article.find_all('strong'))) else None
        article_report['AUTHOR'] = author_text
        article_report['SUMMARY'] = t[0].get_text() if (t := tuple(article.find_all(class_ = 'com-subhead'))) else None
        article_report['VOLANTA'] = lead_tag.get_text() if (lead_tag := article.find(class_ = 'com-lead')) else None
        

        if hrefs_set:
            internal_hrefs_set = {href for href in hrefs_set if href[0] == '/'}
            external_hrefs_set = hrefs_set - internal_hrefs_set
            
            if internal_hrefs_set:
                article_report['SLUG'] = (slug := list(internal_hrefs_set)[0])
                article_report['SLUG_INTERNAL'] = True
                
                category_list = slug.split('/')
                if len(category_list) >= 3:
                    article_report['CATEGORY'] = category_list[1]
                    if len(category_list) > 3:
                        article_report['SUBCATEGORY'] = category_list[2]
                    else:
                        article_report['SUBCATEGORY'] = None
                else:
                    article_report['CATEGORY'] = None
                    article_report['SUBCATEGORY'] = None
                
            elif external_hrefs_set:
                article_report['SLUG'] = list(external_hrefs_set)[0]
                article_report['SLUG_INTERNAL'] = False
        
        else:
            article_report['SLUG'] = None
            article_report['SLUG_INTERNAL'] = None
            article_report['CATEGORY'] = None
            article_report['SUBCATEGORY'] = None
        
        article_report['Origen'] = None
        article_report['FechaFiltro'] = None
        article_report['FechaCreacion'] = None
        article_report['FechaModificacion'] = None

        article_reports[article] = article_report
    
    for i, article in enumerate(tags['article']):
        print('\n' + '=' * 160, file = f)
        print(f'{i:03d}>\n', file = f)
        tree_dump(article, '', gettext_tags = gettext_tags, excl_tags = [], excl_attrs = excl_attrs, file = f)
        if html_dump:
            print('\n' + ' ' * 40 + '-' * 40 + '\n', file = f)
            print(article.prettify(), file = f)
            print('\n' + ' ' * 40 + '-' * 40 + '\n', file = f)
        if report_dump:
            print('\n' + ' ' * 40 + '-' * 40 + '\n', file = f)
            # Report goes here:
            key_field_width = max(len(k) for k, v in article_reports[article].items() if v)
            for key, val in article_reports[article].items():
                if val != None:
                    print((('\t%' + str(key_field_width) + 's') % key) + ' :', val, file = f)
            # tree_dump(clusters_dict[article], '', gettext_tags = ['h' + str(i) for i in range(1, 7)], excl_tags = ['a'], excl_attrs = excl_attrs, file = f)
            if exploration_dump:
                article_titles = tuple(article.find_all(class_ = 'com-title'))
                article_links = tuple(article.find_all(class_ = 'com-link'))
                article_subhead = tuple(article.find_all(class_ = 'com-subhead'))
                article_lead = tuple(article.find_all(class_ = 'com-lead'))
                article_hour = tuple(article.find_all(class_ = 'com-hour'))
                if len(article_titles):
                    print('# com-title :', len(article_titles), file = f)
                    print('com-title :', (article_titles), file = f)
                if len(article_links):
                    print('# com-link :', len(article_links), file = f)
                    print('com-link :', (article_links), file = f)
                if len(article_subhead):
                    print('# com-subhead :', len(article_subhead), file = f)
                    print('com-subhead :', (article_subhead), file = f)
                if len(article_lead):
                    print('# com-lead :', len(article_lead), file = f)
                    print('com-lead :', (article_lead), file = f)
                if len(article_hour):
                    print('# com-hour :', len(article_hour), file = f)
                    print('com-hour :', (article_hour), file = f)
            print('\n' + ' ' * 40 + '-' * 40 + '\n', file = f)
        
    print(('\n' + ':' * 160), '\n:::: FINAL REPORT ::::', '\n' + ':' * 160, sep = '', file = f)

    print(':::: SLUGS ::::', file = f)
    for i, j in ((val['ARTICLE'], val['LANACION_hrefs_list']) for (key, val) in article_reports.items()):
        print(f'article [{str(i):3s}] ({len(j)}):: {j} ', file = f)
    
    print(':::: AUTHORS ::::', file = f)
    for i, j in ((val['ARTICLE'], val['AUTHOR']) for (key, val) in article_reports.items()):
        print(f'article [{str(i):3s}] : AUTHOR :: {j} ', file = f)

    print(':::: AUTHORS INSPECTION ::::', file = f)
    c = 0
    for i, article in enumerate(tags['article']):
        author_presumptive_tags_list = [tag.get_text() for tag in article.find_all('strong')]
        if author_presumptive_tags_list:
            c += 1
            print(f'<article>[{i}] :', author_presumptive_tags_list[0], f':: {c}', file = f)


FileNotFoundError: [Errno 2] No such file or directory: 'dump/Lanacion_dump.txt'