In [41]:
import requests, bs4 as bs, lxml, sys
sys.path.append('..'); import utils.BSutils

httpaddr = "http://la100.cienradios.com/"
FILEDUMP = 'dump/La100_dump.txt'
REQUEST_TIMEOUT_S = 30  # seconds; requests.get() has no default timeout and could otherwise block forever

# Fetch the page. The response object is printed (not raised on) so the HTTP
# status is visible at the start of the cell's output line.
requests_ret = requests.get(httpaddr, timeout = REQUEST_TIMEOUT_S)

print(requests_ret, end = ' || ')
soup = bs.BeautifulSoup(requests_ret.text, 'lxml')

# Discard <script> and <style> tags so they do not show up in later text/tag dumps.
for discard_tag in ("script", "style"):
    for t in soup.find_all(discard_tag):
        t.extract()

# Tags to count: HTML5 sectioning elements plus the headings h1..h6.
tags_tuple = ('article', 'section', 'header', 'footer', 'aside') + tuple('h' + str(i) for i in range(1, 7))
# Maps each tag name to the result of soup.find_all() for that name.
tags = {tag_str : soup.find_all(tag_str) for tag_str in tags_tuple}

# One summary line: "<tag>s : <count>" entries separated by ' / '.
print(' / '.join(f"{tag}s : {len(found)}" for tag, found in tags.items()))


<Response [200]> || articles : 55 / sections : 13 / headers : 1 / footers : 1 / asides : 0 / h1s : 1 / h2s : 66 / h3s : 35 / h4s : 0 / h5s : 0 / h6s : 0


In [42]:
import sys
# These routines output a report on a given tag and all of its descendants, each accompanied by its attributes
def fmt_attrs(attrs_dict, width = 80, excl_attrs = ()):
    """Render an attribute mapping as dict-literal-looking text lines.

    The result looks like::

        {'key1' : 'value1'
         'key2' : 'a long value that is wrapped onto
                   continuation lines'}

    Keys are left-justified to the longest displayed key, values are
    converted with ``str()`` and split into fixed-size chunks so that no
    line exceeds ``width`` characters (excluding the closing quote/brace).

    Args:
        attrs_dict: Mapping of attribute name -> value. May be empty or None.
        width: Target line width; the space left for a value chunk is
            ``width`` minus the width of the ``{'key' : '`` prefix.
        excl_attrs: Container of attribute names to leave out of the report.

    Returns:
        A list of strings, one per output line. An empty list when there is
        no attribute left to display.
    """
    if not attrs_dict:
        return []

    # Filter first, so that the opening brace and the key column width are
    # derived from the attributes that are actually displayed.
    kept = [(str(name), str(value)) for name, value in attrs_dict.items() if name not in excl_attrs]
    if not kept:
        return []

    key_width = max(len(name) for name, _ in kept)
    head_width = len("{'") + key_width + len("' : '")
    # Always leave room for at least one character per line; a zero or
    # negative chunk length would break the range() used for slicing.
    chunk_len = max(1, width - head_width)

    lines = []
    for idx, (name, value) in enumerate(kept):
        head = ("{'" if idx == 0 else " '") + name.ljust(key_width) + "' : '"
        # An empty value still needs one (empty) chunk so its key is shown.
        chunks = [value[pos : pos + chunk_len] for pos in range(0, len(value), chunk_len)] or ['']

        lines.append(head + chunks[0])
        lines.extend(' ' * head_width + chunk for chunk in chunks[1:])
        lines[-1] += "'"  # closing quote of this value

    lines[-1] += "}"  # closing brace of the whole mapping
    return lines

def fmt_str(text, width):
    """Split ``text`` into consecutive chunks of at most ``width`` characters.

    Args:
        text: The text to split. Non-string values are converted with
            ``str()``; ``None`` means "no text".
        width: Maximum chunk length; must be at least 1.

    Returns:
        A generator yielding the chunks in order (nothing for an empty
        string), or ``None`` when ``text`` is ``None``.

    Raises:
        ValueError: If ``width`` is smaller than 1.
    """
    # Every object has __str__, so testing for it cannot tell "no text"
    # apart from real input; None is the only value treated as missing.
    if text is None:
        return None
    if width < 1:
        raise ValueError("width must be at least 1")
    if not isinstance(text, str):
        text = str(text)
    return (text[i : i + width] for i in range(0, len(text), width))

def tree_dump(tag, pfx, gettext_tags = (), excl_tags = (), excl_attrs = (), file = None,
              attrs_width = 135, text_width = 100):
    """Print a report of ``tag`` and all of its descendant tags, depth first.

    For every visited tag the report shows its name followed by its
    attributes (formatted by ``fmt_attrs``). If the tag's name is listed in
    ``gettext_tags`` and its stripped text is non-empty, the text is printed
    underneath, wrapped by ``fmt_str`` and framed by ``>-------`` rules.
    Children are reported in document order, each level prefixed with an
    extra ``'  |'``.

    Args:
        tag: Root of the subtree to report on; it is read through
            ``.name``, ``.attrs``, ``.children`` and ``.get_text()``.
        pfx: Prefix string printed before every line of the root tag.
        gettext_tags: Tag names whose text content is included in the report.
        excl_tags: Tag names to skip together with their whole subtree.
            Only descendants are checked; the root ``tag`` is always reported.
        excl_attrs: Attribute names to omit (forwarded to ``fmt_attrs``).
        file: Destination passed to ``print``; ``None`` means whatever
            ``sys.stdout`` is at call time.
        attrs_width: Line width given to ``fmt_attrs``.
        text_width: Chunk width given to ``fmt_str`` for text content.
    """
    # Explicit stack instead of recursion, so deeply nested markup cannot hit
    # the interpreter's recursion limit. Entries are (tag, line prefix).
    pending = [(tag, pfx)]

    while pending:
        node, node_pfx = pending.pop()

        # Attribute lines: the tag name on the first line, blank padding after.
        attrs_report = fmt_attrs(node.attrs, attrs_width, excl_attrs) or ['{}']
        for i, chunk in enumerate(attrs_report):
            print(node_pfx + (node.name if i == 0 else ' ' * len(node.name)), chunk, file = file)

        # Optional text content, framed by rules aligned under the tag name.
        if node.name in gettext_tags and (text := node.get_text().strip()):
            gutter = node_pfx + len(node.name) * ':'
            print(gutter + '>-------', file = file)
            for text_line in fmt_str(text, text_width):
                print(gutter + ' ' + text_line, file = file)
            print(gutter + '>-------', file = file)

        # Only element children are reported (text nodes etc. are ignored).
        # Pushed in reverse so that the first child is popped, and therefore
        # printed, first.
        children = [
            ch for ch in node.children
            if isinstance(ch, bs.element.Tag) and ch.name not in excl_tags
        ]
        pending.extend((ch, node_pfx + '  |') for ch in reversed(children))

In [43]:
html_dump = False    # when True, each article's prettified HTML is written after its tree dump
report_dump = True   # when True, a per-article report placeholder is written after its tree dump
hs_to_report_on_list = ['h' + str(i) for i in range(1, 7)]
tags_to_report_on_list = hs_to_report_on_list + ['section', 'article']
tags_to_report_on_list_max_wdth = max(len(t) for t in tags_to_report_on_list)

# Divider written around the optional per-article sections.
section_rule = '\n' + ' ' * 40 + '-' * 40 + '\n'

# Explicit UTF-8: the scraped text contains non-ASCII characters, and the
# platform's default encoding may be unable to represent them.
with open(FILEDUMP, 'w', encoding = 'utf-8') as f:
    # General Report:
    print("GENERAL REPORT:\n" + '=' * len("GENERAL REPORT:") + '\n', file = f)

    print("\tQUANTITIES BY TAG:\n" + '\t' + '-' * len("QUANTITIES BY TAG:"), file = f)
    print("\t", end = '', file = f)
    # One "<tag> = <count> | " entry per tag, names padded to a common width.
    for tag_to_report in tags_to_report_on_list:
        print(('%-' + str(tags_to_report_on_list_max_wdth) + 's') % tag_to_report + ' = ', len(soup.find_all(tag_to_report)), sep = '', end = ' | ', file = f)
    print(file = f)
    print('\n' + ':' * 160, file = f)

    # Attributes left out of the tree dump, and tags whose text is included in it.
    excl_attrs = ['srcset', 'data-id', 'data-notaid', 'data-source', 'role', 'alt', 'width', 'height', 'loading', 'fetchpriority', 'decoding', 'src']
    gettext_tags = hs_to_report_on_list + ['a', 'span', 'strong', 'time', 'p']

    # Per-article tree dump, followed by the optional HTML / report sections.
    for i, article in enumerate(tags['article']):
        print(f'{i:03d}>\n', file = f)
        tree_dump(article, '', gettext_tags = gettext_tags, excl_tags = [], excl_attrs = excl_attrs, file = f)
        if html_dump:
            print(section_rule, file = f)
            print(article.prettify(), file = f)
            print(section_rule, file = f)
        if report_dump:
            print(section_rule, file = f)
            # Report goes here:
            print('\t\t\t### Report goes here ###', file = f)
            print(section_rule, file = f)
        print('\n' + '=' * 140, file = f)

    # Text of every <h2> found in each article (articles without <h2> are skipped).
    for i, article in enumerate(tags['article']):
        if (h2s_list := list(article.find_all('h2'))):
            print(f'<article>[{i}]', *('\t' + h2.get_text() for h2 in h2s_list), file = f)

    print(':'*80, 'href\'s\n', ':' * 160, file = f)
    # Every href value found in each article, preceded by how many there are.
    for i, article in enumerate(tags['article']):
        if (hrefs_list := list(article.find_all(href = True))):
            print(f'<article>[{i}][{len(hrefs_list)}]', *('\t' + href_tag.attrs['href'] for href_tag in hrefs_list), file = f)

