In [141]:
import datetime
import httplib2
import collections
import functools
from lxml import etree as et
from typing import Callable, Set, Generator, Iterable

# BOE dashboard exploration

### Functionality to be provided

1. Fetch the week's diary
2. Analysis of the diary
3. Create dependency tree
4. Search for key-concept summarization of the diary

## Fetching from the BOE API

In [9]:
# Publication date whose BOE summary is explored in this notebook.
DATE = datetime.datetime(year=2020, month=4, day=20)

In [10]:
# Pair holding the two values of an HTTP request result: headers and content.
Response = collections.namedtuple(typename='Response', field_names='headers content')

In [11]:
def summary_url_for_date(date: datetime.datetime)->str:
    '''Build the URL for fetching the BOE summary of the given date.

    The identifier appended to the URL has the form "<pub>-<I>-<YYYYMMDD>".
    Refer to https://www.boe.es/datosabiertos/documentos/SumariosBOE_v_1_0.pdf
    '''

    # Fixed identifier components, followed by the date as YYYYMMDD.
    pub, I = 'BOE', 'S'
    date_string = format(date, '%Y%m%d')

    return f'https://boe.es/diario_boe/xml.php?id={pub}-{I}-{date_string}'

In [12]:
def fetch_page(url: str, max_tries=3)->Response:
    '''Try to fetch a given url up to max_tries times.

    GET requests are issued through an httplib2 client configured with the
    ".cache" directory, until a response whose 'status' header is '200'
    arrives or max_tries attempts have been made.

    Returns the last Response received. It is not guaranteed to be a 200
    response: when every attempt fails, the last failed response is returned,
    so callers should inspect response.headers.

    Raises ValueError when max_tries is lower than 1, because no request
    would be made and there would be no response to return.
    '''

    # Without this guard the loop body never runs and `response` is unbound.
    if max_tries < 1:
        raise ValueError(f'max_tries must be at least 1, got {max_tries}')

    h = httplib2.Http(".cache")

    for _ in range(max_tries):
        response = Response(*h.request(url, "GET"))
        if response.headers.get('status') == '200':
            break

    return response

In [13]:
# Download the summary published on DATE; `response` is parsed further below.
response = fetch_page(summary_url_for_date(DATE))

## Processing of the diary

```
sumario/
  meta/
   |pub
   |fecha
   |fechaAnt
   |fechaSig
  sumario/
   |diario/
   |  sumario_nbo/
   |   |urlPdf
   |  seccion[1, 2, 2A, 2B, 3, 4, 5, 5A, 5B, 5C, T]/
   |   |departamento[1..*]/
   |   |  epigrafe[1..*]/
   |   |    epigrafeType/ item[1..*]
   |   |  item[1..*]/
   |   |    itemType/ titulo, urlPdf, urlHtm, urlXml, suplemento
        
```

In [15]:
def print_reponse_content(response: Response)->None:
    '''Print the content of a response, decoded as UTF-8 text.'''

    decoded_content = response.content.decode('utf-8')
    print(decoded_content)

In [16]:
def tree_from_response(response: Response)->et._Element:
    '''Parse the XML document held in response.content into an
    lxml.etree._Element.'''

    xml_document = response.content

    return et.fromstring(xml_document)

In [17]:
def use_tree_for_search(tree)->Callable:
    '''Bind an lxml tree and return a function taking an xpath string.

    The returned function evaluates the xpath over the bound tree and returns
    whatever tree.xpath returns, so several queries can share one tree.
    '''

    namespaces = {'xmlns': "http://www.w3.org/1999/xhtml"}

    def search(xpath):
        return tree.xpath(xpath, namespaces=namespaces)

    return search

def use_xpath_for_search(xpath)->Callable:
    '''Bind an xpath string and return a function taking an lxml tree.

    The returned function evaluates the bound xpath over the tree it receives
    and returns whatever tree.xpath returns, so one query can be reused over
    several trees.
    '''

    namespaces = {'xmlns': "http://www.w3.org/1999/xhtml"}

    def search(tree):
        return tree.xpath(xpath, namespaces=namespaces)

    return search

In [18]:
# Parse the downloaded summary; `tree` is the root element queried below.
tree = tree_from_response(response)

In [176]:
class BOExpath:
    '''XPath expressions used to navigate a BOE summary document.

    Absolute expressions are evaluated from the document root; relative ones
    ('./...', './/...') from the kind of node named in the comment above them.
    '''

    # Accessible from the diary's root
    publication_type = '/sumario/meta/pub'
    publication_date = '/sumario/meta/fecha'
    prev_publication_date = '/sumario/meta/fechaAnt'
    next_publication_date = '/sumario/meta/fechaSig'

    sumary = '/sumario/diario/sumario_nbo'
    # Correctly spelled alias; `sumary` is kept for backward compatibility.
    summary = sumary
    section = '/sumario/diario/seccion'

    # Accessible for a section
    department = './departamento'

    # Accessible for a department
    epigraf = './epigrafe'
    # Alias matching the "epigraph" spelling used by BOEattributes;
    # `epigraf` is kept for backward compatibility.
    epigraph = epigraf
    # Descendant search: also matches items nested deeper than direct children.
    items = './/item'

    # Accessible for an itemType node
    item_title = './titulo'
    item_pdf_url = './urlPdf'
    item_htm_url = './urlHtm'
    item_xml_url = './urlXml'
    
class BOEattributes:
    '''Names of the XML attributes read from the nodes of a BOE summary.'''

    # Diary and summary level
    diary_nbo = 'nbo'
    summary_nbo_id = 'id'

    # Section level
    section_number = 'num'
    section_name = 'nombre'

    # Department and epigraph level (both use the same attribute name)
    department_name = 'nombre'
    epigraph_name = 'nombre'

    # Item level
    item_id = 'id'
    item_control = 'control'

    # Size attributes; presumably carried by the urlPdf node -- TODO confirm
    pdf_url_sz_bytes = 'szBytes'
    pdf_url_sz_kbytes = 'szKBytes'

In [136]:
def get_sections(tree) -> Generator:
    '''Return a generator of (section_number, section_node) tuples, one per
    section found in the summary tree.'''

    section_nodes = use_tree_for_search(tree)(BOExpath.section)

    return ((node.get(BOEattributes.section_number), node)
            for node in section_nodes)

In [139]:
def get_departments_per_section(sections) -> Generator:
    '''Expand (section_number, section_node) tuples into a generator of
    (section_number, department_name, department_node) tuples, one per
    department found in each section.'''

    return (
        (number, dept.get(BOEattributes.department_name), dept)
        for (number, section_node) in sections
        for dept in use_tree_for_search(section_node)(BOExpath.department)
    )

In [177]:
def get_items_per_department(departments) -> Generator:
    '''Expand (section_number, department_name, department_node) tuples into
    a generator of (section_number, department_name, item_node) tuples, one
    per item found under each department.'''

    return (
        (number, dept_name, item_node)
        for (number, dept_name, dept_node) in departments
        for item_node in use_tree_for_search(dept_node)(BOExpath.items)
    )

In [180]:
def get_item_details(section_number:str, department_name:str, node)->dict:
    '''Collect the details of one summary item into a flat dictionary.

    Parameters:
        section_number: value stored under the 'section' key.
        department_name: value stored under the 'department' key.
        node: lxml element of the item.

    Returns a dict with the keys 'id', 'epigraph', 'section', 'department',
    'title', 'pdf_url', 'xml_url' and 'htm_url'. 'epigraph' is '' when the
    direct parent of the item is not an epigraph node. A title or url whose
    child node is absent is reported as None instead of raising IndexError.
    '''

    search_details = use_tree_for_search(node)

    def first_text(xpath):
        # Text of the first match, or None when the item has no such child.
        matches = search_details(xpath)
        return matches[0].text if matches else None

    title = first_text(BOExpath.item_title)
    pdf_url = first_text(BOExpath.item_pdf_url)
    xml_url = first_text(BOExpath.item_xml_url)
    htm_url = first_text(BOExpath.item_htm_url)

    # The epigraph, when present, is the direct parent of the item. A node
    # without parent (e.g. a detached element) simply has no epigraph.
    parent = node.getparent()
    is_epigraph = parent is not None and parent.tag.lower() == 'epigrafe'
    epigraph = parent.get(BOEattributes.epigraph_name) if is_epigraph else ''

    details = {
        'id': node.get(BOEattributes.item_id),
        'epigraph': epigraph,
        'section': section_number,
        'department': department_name,
        'title': title,
        'pdf_url': pdf_url,
        'xml_url': xml_url,
        'htm_url': htm_url,
    }

    return details

In [149]:
def get_details_per_item(items) -> Generator:
    '''Return a generator that calls get_item_details with each tuple of
    items unpacked as its positional arguments.'''

    return (get_item_details(*item_args) for item_args in items)

In [142]:
def pipe(initial_value, *args:Callable):
    '''Thread a value through the given functions, left to right.

    The first function receives initial_value and each following one receives
    the result of the previous call; the result of the last call is returned.
    When no functions are given, initial_value is returned unchanged.
    '''

    result = initial_value
    for func in args:
        result = func(result)

    return result

In [183]:
# Run the whole extraction pipeline over the summary tree. Each stage returns
# a lazy generator, so list() is what actually drives the traversal.
items = list(pipe(tree,
        get_sections,
        get_departments_per_section,
        get_items_per_department,
        get_details_per_item))
# Display the first extracted item as a sample.
items[0]

{'id': 'BOE-A-2020-4526',
 'epigraph': 'Productos petrolíferos. Precios',
 'section': '1',
 'department': 'MINISTERIO PARA LA TRANSICIÓN ECOLÓGICA Y EL RETO DEMOGRÁFICO',
 'title': 'Resolución de 9 de abril de 2020, de la Dirección General de Política Energética y Minas, por la que se publican los nuevos precios de venta, antes de impuestos, de los gases licuados del petróleo por canalización.',
 'pdf_url': '/boe/dias/2020/04/20/pdfs/BOE-A-2020-4526.pdf',
 'xml_url': '/diario_boe/xml.php?id=BOE-A-2020-4526',
 'htm_url': '/diario_boe/txt.php?id=BOE-A-2020-4526'}