In [111]:
import requests, bs4 as bs, lxml, sys
sys.path.append('..'); import utils.BSutils

httpaddr = "https://radiomitre.cienradios.com/"
FILEDUMP = 'dump/Radiomitre_dump.txt'
HTTP_TIMEOUT_S = 30  # seconds; without a timeout requests.get() may block forever on a stalled server

# Fetch the landing page. The response object itself is printed so that its
# status code appears in the cell output.
requests_ret = requests.get(httpaddr, timeout = HTTP_TIMEOUT_S)
print("Response Status:", requests_ret)

# Parse the page and remove every <script> and <style> element from the tree.
soup = bs.BeautifulSoup(requests_ret.text, 'lxml')
for discard_tag in ("script", "style"):
    for t in soup.find_all(discard_tag):
        t.extract()

# Index the structural tags and the six heading levels (h1..h6) by tag name.
tags_tuple = ('article', 'section', 'header', 'footer', 'aside') + tuple('h' + str(i) for i in range(1, 7))
tags = {tag_str : soup.find_all(tag_str) for tag_str in tags_tuple}

# One-line tally per group: the line is broken after 'aside' so that the
# heading counts land on their own line.
for tag, found in tags.items():
    print(f"{tag}s : {len(found)}", end = '\n' if tag == 'aside' else ' / ')

Response Status: <Response [200]>
articles : 88 / sections : 20 / headers : 1 / footers : 1 / asides : 2
h1s : 1 / h2s : 92 / h3s : 45 / h4s : 0 / h5s : 0 / h6s : 0 / 

In [112]:
import sys
# These routines output a report on a given tag plus all of its descendants, each accompanied by its respective attributes.
def fmt_attrs(attrs_dict, width = 80, excl_attrs = None):
    """Render an attribute mapping as a list of dict-like report lines.

    Each attribute that is not excluded starts a new line of the form
    ``{'key' : 'value'`` (first line) or `` 'key' : 'value'`` (following
    lines). Keys are left-justified to the longest displayed key, and values
    longer than the available room are wrapped onto continuation lines that
    are indented to the value column. The last line is closed with ``}``.

    Parameters:
        attrs_dict: mapping of attribute name -> value. Values are converted
            with ``str()``, so list values are shown in their list repr.
        width: target line width; the room left for the value is ``width``
            minus the header width, but never less than one character.
        excl_attrs: attribute names to leave out of the report
            (``None`` or empty excludes nothing).

    Returns:
        A list of strings, one per output line; an empty list when there is
        nothing to show.
    """
    excluded = tuple(excl_attrs) if excl_attrs else ()
    if not attrs_dict:
        return []

    # Only displayed attributes take part in the layout, so a long excluded
    # key cannot widen the key column.
    shown = [(str(key), str(value)) for key, value in attrs_dict.items() if key not in excluded]
    if not shown:
        return []

    key_width = max(len(key) for key, _ in shown)
    head_width = len("{'") + key_width + len("' : '")
    # Guard against a non-positive step, which would make range() raise or
    # silently produce no chunks at all.
    chunk_len = max(1, width - head_width)

    lines = []
    for idx, (key, value) in enumerate(shown):
        # The brace opens on the first *displayed* attribute.
        head = ("{'" if idx == 0 else " '") + key.ljust(key_width) + "' : '"
        # An empty value still gets its own (empty) chunk so the attribute
        # is reported and its closing quote lands on its own line.
        chunks = [value[pos : pos + chunk_len] for pos in range(0, len(value), chunk_len)] or ['']
        chunks[-1] += "'"
        lines.append(head + chunks[0])
        lines.extend(' ' * head_width + chunk for chunk in chunks[1:])

    lines[-1] += '}'
    return lines

def fmt_str(text, width):
    """Split ``text`` into consecutive slices of at most ``width`` items.

    Parameters:
        text: the string (or other sized, sliceable sequence) to split.
        width: maximum length of each slice; expected to be positive
            (``0`` makes ``range()`` raise ``ValueError``).

    Returns:
        A generator yielding the slices in order, or ``None`` when ``text``
        has no length (e.g. ``None`` or a number) and so cannot be split.
    """
    # Every object has __str__, so testing for it never rejected anything;
    # what the slicing below really needs is a length.
    try:
        total = len(text)
    except TypeError:
        return None
    return (text[start : start + width] for start in range(0, total, width))

def tree_dump(tag, pfx, gettext_tags = (), excl_tags = (), excl_attrs = (), file = None):
    """Print an indented report of ``tag`` and all of its descendant tags.

    For each tag the report shows the tag name followed by its attributes as
    formatted by ``fmt_attrs`` (``{}`` when there is nothing to show). When
    the tag's name is listed in ``gettext_tags`` and the tag has non-blank
    text, that text is printed below, wrapped by ``fmt_str`` and framed by
    ``>-------`` marker lines. Child tags are then reported recursively, in
    document order, one nesting level deeper.

    Parameters:
        tag: the tag to report on (a ``bs.element.Tag``).
        pfx: prefix put in front of every line printed for this tag; each
            nesting level appends ``'  |'`` to it.
        gettext_tags: names of the tags whose text should be printed.
        excl_tags: names of child tags to skip, together with their subtree.
        excl_attrs: attribute names to leave out of the attribute report.
        file: output stream handed to ``print``; ``None`` means the current
            ``sys.stdout`` at call time.
    """
    # Format the attributes once; an empty report is shown as an empty dict.
    attrs_report = fmt_attrs(tag.attrs, 135, excl_attrs) or ['{}']
    name_pad = ' ' * len(tag.name)
    for i, chunk in enumerate(attrs_report):
        # Only the first line carries the tag name; the rest are padded so
        # that the attribute column stays aligned.
        print(pfx + (tag.name if i == 0 else name_pad), chunk, file = file)

    if tag.name in gettext_tags and (text := tag.get_text().strip()):
        gutter = pfx + ':' * len(tag.name)
        print(gutter + '>-------', file = file)
        for text_line in fmt_str(text, 100):
            print(gutter + ' ' + text_line, file = file)
        print(gutter + '>-------', file = file)

    for child in tag.children:
        # Non-tag children (e.g. text nodes) and excluded tags are skipped.
        if not isinstance(child, bs.element.Tag) or child.name in excl_tags:
            continue
        tree_dump(child, pfx + '  |', gettext_tags = gettext_tags, excl_tags = excl_tags, excl_attrs = excl_attrs, file = file)

In [113]:
html_dump = False    # when True, each article's prettified HTML follows its tree dump
report_dump = True   # when True, the per-article report section is written
hs_to_report_on_list = ['h' + str(i) for i in range(1, 7)]
tags_to_report_on_list = hs_to_report_on_list + ['section', 'article']
tags_to_report_on_list_max_wdth = max(len(t) for t in tags_to_report_on_list)

# The scraped page contains non-ASCII text, so the dump is written as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open(FILEDUMP, 'w', encoding = 'utf-8') as f:
    # General Report:
    print("GENERAL REPORT:\n" + '=' * len("GENERAL REPORT:") + '\n', file = f)

    print("\tQUANTITIES BY TAG:\n" + '\t' + '-' * len("QUANTITIES BY TAG:"), file = f)
    print("\t", end = '', file = f)
    name_fmt = '%-' + str(tags_to_report_on_list_max_wdth) + 's'
    for tag_to_report in tags_to_report_on_list:
        print(name_fmt % tag_to_report + ' = ', len(soup.find_all(tag_to_report)), sep = '', end = ' | ', file = f)
    print(file = f)
    print('\n' + ':' * 160, file = f)

    # Attributes hidden from the tree dump, and the tags whose text is shown in it.
    excl_attrs = ['srcset', 'data-id', 'data-notaid', 'data-source', 'role', 'alt', 'width', 'height', 'loading', 'fetchpriority', 'decoding', 'src']
    gettext_tags = ['h' + str(i) for i in range(1, 7)] + ['a'] + ['span'] + ['strong'] + ['time'] + ['p']

    divider = ' ' * 40 + '-' * 40

    # One entry per <article>: a numbered header, its tag tree, then the optional dumps.
    for i, article in enumerate(tags['article']):
        print(f'{i:03d}>\n', file = f)
        tree_dump(article, '', gettext_tags = gettext_tags, excl_tags = [], excl_attrs = excl_attrs, file = f)
        if html_dump:
            print('\n' + divider + '\n', file = f)
            print(article.prettify(), file = f)
            print('\n' + divider + '\n', file = f)
        if report_dump:
            print('\n' + divider, file = f)
            # TODO: the actual report is still a placeholder; for now the
            # prettified article HTML is written in its place.
            print('\t\t\t### Report goes here ###\n', file = f)
            print(article.prettify(), file = f)
            print('\n' + divider + '\n', file = f)
        print('\n' + '=' * 160, file = f)

    # NOTE(review): the banner below says <h3> but the loop lists the <h2>
    # texts of each article -- confirm which of the two is intended.
    print(':'*80, '<h3>\n', ':' * 160, file = f)
    for i, article in enumerate(tags['article']):
        if (h2s_list := list(article.find_all('h2'))):
            print(f'<article>[{i}]', *('\t' + h2.get_text() for h2 in h2s_list), file = f)

    # Every href found inside each article, with the number of links per article.
    print(':'*80, 'href\'s\n', ':' * 160, file = f)
    for i, article in enumerate(tags['article']):
        if (hrefs_list := list(article.find_all(href = True))):
            print(f'<article>[{i}][{len(hrefs_list)}]', *('\t' + href_tag.attrs['href'] for href_tag in hrefs_list), file = f)


In [137]:
# Collect the <article> tags found under each <section>, printing four
# sections per output line.
# NOTE(review): find_all() searches all descendants, so an article inside
# nested sections would be collected once per enclosing section -- confirm
# whether the page nests sections.
articles_in_sections = []
for i, tag in enumerate(tags['section']):
    articles_in_section = list(tag.find_all('article'))
    articles_in_sections.extend(articles_in_section)

    print(f'For section[{i:2}] contains {len(articles_in_section) or "no":2} articles',
          end = '\n' if (i + 1) % 4 == 0 else '   ')

total_articles_in_sections = len(articles_in_sections)
print(f'... for a total of {total_articles_in_sections} articles contained within sections')
articles = list(tags['article'])
print(f'Total articles in page {len(articles)}')

# Tags compare equal when their markup is the same, so `in`, list.remove()
# and list.index() cannot tell apart two distinct articles that look alike.
# Membership and page position are therefore resolved by object identity.
in_section_ids = {id(article) for article in articles_in_sections}
articles_not_in_sections = [article for article in articles if id(article) not in in_section_ids]
page_index_by_id = {id(article) : position for position, article in enumerate(articles)}
print(f'Articles not in any section {len(articles_not_in_sections)}')

# Headings (h1..h6) found in each article that sits outside every section.
heading_names = ['h' + str(level) for level in range(1, 7)]
for i, article in enumerate(articles_not_in_sections):
    print(f'For [{i}]-th article_not_in_sections, {page_index_by_id[id(article)]:2}-th in page')
    for j, h in enumerate(article.find_all(heading_names)):
        print(f'\t{h.name} [{j}]-th header : {h.get_text()}')

# Ancestor chain of each of those articles, from the article up to the root.
for article in articles_not_in_sections:
    print(f'[{page_index_by_id[id(article)]}] ', end = '')
    cursor = article
    while cursor is not None:
        print(cursor.name, '->', sep = '', end = '')
        cursor = cursor.parent
    print()

# TODO: list the sections that contain no articles.
# Text of every article that lives inside an <aside>.
for aside in soup.find_all('aside'):
    if aside_with_articles := list(aside.find_all('article')):
        for article in aside_with_articles:
            print(article.get_text())

For section[ 0] contains  4 articles   For section[ 1] contains  7 articles   For section[ 2] contains no articles   For section[ 3] contains  6 articles
For section[ 4] contains no articles   For section[ 5] contains  8 articles   For section[ 6] contains  4 articles   For section[ 7] contains  5 articles
For section[ 8] contains  5 articles   For section[ 9] contains  4 articles   For section[10] contains  4 articles   For section[11] contains no articles
For section[12] contains  5 articles   For section[13] contains  6 articles   For section[14] contains no articles   For section[15] contains  3 articles
For section[16] contains  3 articles   For section[17] contains  3 articles   For section[18] contains no articles   For section[19] contains 12 articles
... for a total of 79 articles contained within sections
Total articles in page 88
Articles not in any section 9
For [0]-th article_not_in_sections, 11-th in page
	h2 [0]-th header : Mundial de Qatar 2022: cómo quedaron los cruces

In [157]:
for m in dir(bs):
    
    

CData : ['PREFIX', 'SUFFIX', '__add__', '__class__', '__contains__', '__copy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_all_strings', '_find_all', '_find_one', '_is_xml', '_lastRecursiveChild', '_last_descendant', 'append', 'capitalize', 'casefold', 'center', 'count', 'decomposed', 'default', 'encode', 'endswith', 'expandtabs', 'extend', 'extract', 'fetchNextSiblings', 'fetchParents', 'fetchPrevious', 'fetchPreviousSiblings', 'find', 'findAllNext', 'findAllPrevious', 'findNext', 'findNextSibling', 'findNextSiblings', 'findParent', 'findParents', 'findPrevious', 'findPreviousSibling'