In [1]:
import collections
import datetime
import functools
import os
from typing import Callable, Set, Generator, Iterable

import httplib2
import pymysql
from lxml import etree as et

In [72]:
def pipe(initial_value: any, *args:Callable) -> any:
    '''Thread a value through a sequence of single-argument callables.

    The first callable receives ``initial_value``; each following one
    receives the result of the previous call. The last result is returned.
    With no callables, ``initial_value`` is returned unchanged.
    '''

    result = initial_value
    for func in args:
        result = func(result)

    return result

In [2]:
# Connection settings for the MySQL database. Each value can be overridden
# through an environment variable so real credentials stay out of the
# notebook; the fallbacks are the local-development values.
db_host = os.environ.get('BOE_DB_HOST', 'localhost')
db_database = os.environ.get('BOE_DB_DATABASE', 'boe')
db_user = os.environ.get('BOE_DB_USER', 'root')
db_password = os.environ.get('BOE_DB_PASSWORD', 'pass')

# BOE dashboard exploration

### Functionality to be provided

1. Fetch the week's diary
2. Analysis of the diary
3. Create dependency tree
4. Search for key-concept summarization of the diary

## Fetching from the BOE API

In [5]:
# Lightweight pair for an HTTP result: the headers first, the raw body second.
Response = collections.namedtuple('Response', ['headers', 'content'])

def summary_url_for_date(date: datetime.datetime)->str:
    '''Build the URL of the BOE summary that corresponds to a date.

    The summary identifier has the form ``BOE-S-YYYYMMDD``.
    Refer to https://www.boe.es/datosabiertos/documentos/SumariosBOE_v_1_0.pdf
    '''

    summary_id = '-'.join(('BOE', 'S', date.strftime('%Y%m%d')))

    return 'https://boe.es/diario_boe/xml.php?id=' + summary_id

def fetch_page(url: str, max_tries=3)->Response:
    '''Fetch a url with GET, retrying up to max_tries times.

    Args:
        url: Address to request.
        max_tries: Maximum number of attempts; must be at least 1.

    Returns:
        The Response (headers, content) of the first attempt whose
        'status' header is '200', or of the last attempt when none
        succeeded. The caller is responsible for checking the status.

    Raises:
        ValueError: If max_tries is lower than 1 (no request could be
            made, so there would be nothing to return).
    '''

    if max_tries < 1:
        raise ValueError(f'max_tries must be at least 1, got {max_tries}')

    # ".cache" is handed to httplib2 as its cache location.
    h = httplib2.Http(".cache")

    response = None
    for _ in range(max_tries):
        response = Response(*h.request(url, "GET"))
        if response.headers.get('status') == '200':
            break

    return response

## Processing of the diary

```
sumario/
  meta/
   |pub
   |fecha
   |fechaAnt
   |fechaSig
  sumario/
   |diario/
   |  sumario_nbo/
   |   |urlPdf
   |  seccion[1, 2, 2A, 2B, 3, 4, 5, 5A, 5B, 5C, T]/
   |   |departamento[1..*]/
   |   |  epigrafe[1..*]/
   |   |    epigrafeType/ item[1..*]
   |   |  item[1..*]/
   |   |    itemType/ titulo, urlPdf, urlHtm, urlXml, suplemento
        
```

In [9]:
def print_reponse_content(response: Response)->None:
    '''Print the body of a response, decoded as UTF-8 text.'''

    body = response.content.decode('utf-8')
    print(body)
    
def tree_from_response(response: Response)->et._Element:
    '''Parse the XML body of a response into an lxml.etree._Element.'''

    root = et.fromstring(response.content)

    return root

def use_tree_for_search(tree)->Callable:
    '''Bind a tree and return a function xpath:str->List that
    evaluates the given xpath expression against that tree.'''

    namespaces = {'xmlns': "http://www.w3.org/1999/xhtml"}

    def search(xpath):
        return tree.xpath(xpath, namespaces=namespaces)

    return search

def use_xpath_for_search(xpath)->Callable:
    '''Bind an xpath expression and return a function tree->List that
    evaluates that expression against whichever tree it is given.'''

    namespaces = {'xmlns': "http://www.w3.org/1999/xhtml"}

    def search(tree):
        return tree.xpath(xpath, namespaces=namespaces)

    return search

In [12]:
class BOExpath:
    '''XPath expressions used to navigate a BOE summary document.

    Absolute expressions start at the document root; relative ones
    ('./', './/') are meant to be evaluated from the kind of node named
    in the comment above them.
    '''

    # Accessible from the diary's root
    publication_type = '/sumario/meta/pub'
    publication_date = '/sumario/meta/fecha'
    prev_publication_date = '/sumario/meta/fechaAnt'
    next_publication_date = '/sumario/meta/fechaSig'

    sumary = '/sumario/diario/sumario_nbo'
    # Correctly spelled alias; `sumary` is kept so existing references keep working.
    summary = sumary
    section = '/sumario/diario/seccion'

    # Accessible for a section
    department = './departamento'

    # Accessible for a department
    epigraf = './epigrafe'
    # Alias using the same spelling as BOEattributes.epigraph_name; `epigraf` is kept.
    epigraph = epigraf
    # Descendant search: matches items at any depth below the department.
    items = './/item'

    # Accessible for an itemType node
    item_title = './titulo'
    item_pdf_url = './urlPdf'
    item_htm_url = './urlHtm'
    item_xml_url = './urlXml'
    
class BOEattributes:
    '''Names of the XML attributes read from nodes of a BOE summary.

    Each constant is prefixed with the kind of node it is intended for
    (grouping below follows that naming).
    '''

    # Diary and summary
    diary_nbo = 'nbo'
    summary_nbo_id = 'id'

    # Section
    section_number = 'num'
    section_name = 'nombre'

    # Department and epigraph
    department_name = 'nombre'
    epigraph_name = 'nombre'

    # Item
    item_id = 'id'
    item_control = 'control'

    # PDF url
    pdf_url_sz_bytes = 'szBytes'
    pdf_url_sz_kbytes = 'szKBytes'

In [13]:
def get_sections(tree) -> Generator:
    '''Return a generator of (section number, section node) pairs,
    one for every section found in the summary tree.'''

    section_nodes = use_tree_for_search(tree)(BOExpath.section)

    return ((node.get(BOEattributes.section_number), node) for node in section_nodes)

def get_departments_per_section(sections) -> Generator:
    '''Expand (section number, section node) pairs into a generator of
    (section number, department name, department node) triples, one
    per department found under each section.'''

    return (
        (number, dept.get(BOEattributes.department_name), dept)
        for number, section_node in sections
        for dept in use_tree_for_search(section_node)(BOExpath.department)
    )

def get_items_per_department(departments) -> Generator:
    '''Expand (section number, department name, department node) triples
    into a generator of (section number, department name, item node)
    triples, one per item found below each department.'''

    return (
        (number, dept_name, item_node)
        for number, dept_name, dept_node in departments
        for item_node in use_tree_for_search(dept_node)(BOExpath.items)
    )

def get_item_details(section_number:str, department_name:str, node)->dict:
    '''Collect the details of a single item node into a dict.

    Args:
        section_number: Number of the section the item belongs to.
        department_name: Name of the department the item belongs to.
        node: The item's lxml element.

    Returns:
        A dict with the keys 'id', 'epigraph', 'section', 'department',
        'title', 'pdf_url', 'xml_url' and 'htm_url'. A text value is None
        when the corresponding child element is missing. 'epigraph' is ''
        when the item's parent is not an epigraph node.
    '''
    search_details = use_tree_for_search(node)

    def first_text(xpath):
        # A missing child yields None instead of raising IndexError.
        matches = search_details(xpath)
        return matches[0].text if matches else None

    parent = node.getparent()
    # getparent() gives None for a root element, so guard before reading the tag.
    is_epigraph = parent is not None and parent.tag.lower() == 'epigrafe'
    epigraph = parent.get(BOEattributes.epigraph_name) if is_epigraph else ''

    return {
        'id': node.get(BOEattributes.item_id),
        'epigraph': epigraph,
        'section': section_number,
        'department': department_name,
        'title': first_text(BOExpath.item_title),
        'pdf_url': first_text(BOExpath.item_pdf_url),
        'xml_url': first_text(BOExpath.item_xml_url),
        'htm_url': first_text(BOExpath.item_htm_url),
    }

def get_details_per_item(items) -> Generator:
    '''Return a generator that turns each (section number, department
    name, item node) triple into the dict built by get_item_details.'''

    return (get_item_details(*triple) for triple in items)

## Inserting into the db

In [40]:
# INSERT statement with one driver-side placeholder per column; the values
# are supplied separately so the driver escapes them.
_BOE_DIARY_ENTRY_SQL = """
    INSERT INTO boe_diary_entry
    (id, date, title, section, department, epigraph, pdf_url, xml_url, htm_url)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
"""


def _boe_diary_entry_params(item, date):
    # Values for one row, ordered to match the placeholders of _BOE_DIARY_ENTRY_SQL.
    return (item.get('id'), date.isoformat(), item.get('title'),
            item.get('section'), item.get('department'), item.get('epigraph'),
            item.get('pdf_url'), item.get('xml_url'), item.get('htm_url'))


def boe_diary_entry_query(item, date=None):
    '''Render the INSERT statement for one item as a complete SQL string.

    Kept for callers that need the statement as text. Every value is
    escaped before being quoted, so apostrophes in scraped text no longer
    break (or inject into) the statement. insert_items does not use this
    function: it passes the values to the driver as parameters instead.

    Args:
        item: Mapping with the keys produced by get_item_details.
        date: Publication date (anything with isoformat()). When omitted,
            the module-level DATE is used.

    Returns:
        The SQL statement as a string; None values are rendered as NULL.
    '''
    entry_date = DATE if date is None else date

    def render(value):
        if value is None:
            return 'NULL'
        return "'" + pymysql.converters.escape_string(str(value)) + "'"

    params = _boe_diary_entry_params(item, entry_date)
    return _BOE_DIARY_ENTRY_SQL % tuple(render(value) for value in params)


def insert_items(db_host, db_user, db_password, db_database, items, date=None):
    '''Insert every item into the boe_diary_entry table in one transaction.

    Args:
        db_host, db_user, db_password, db_database: Connection settings.
        items: Iterable of mappings with the keys produced by get_item_details.
        date: Publication date stored with every row. When omitted, the
            module-level DATE is used.

    All rows are committed together. On any error the transaction is
    rolled back and the error re-raised. The connection is always closed.
    '''
    entry_date = DATE if date is None else date

    connection = pymysql.connect(host=db_host,
                                 user=db_user,
                                 password=db_password,
                                 db=db_database)

    try:
        with connection.cursor() as cursor:
            for entry in items:
                cursor.execute(_BOE_DIARY_ENTRY_SQL,
                               _boe_diary_entry_params(entry, entry_date))
        # Commit explicitly: what `with connection:` does on exit differs
        # between PyMySQL releases.
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        connection.close()

## Execution

In [21]:
# Publication date of the summary to process. This module-level name is also
# read by boe_diary_entry_query when building the INSERT statements.
DATE = datetime.datetime(2020, 4, 25)
# NOTE(review): fetch_page returns the last response even when its status is
# not '200'; the status is not checked here before parsing.
response = fetch_page(summary_url_for_date(DATE))
tree = tree_from_response(response)
# Each stage returns a generator; list() drains the chain so the parsed items
# can be reused by the cells below.
items = list(pipe(tree,
        get_sections,
        get_departments_per_section,
        get_items_per_department,
        get_details_per_item))
insert_items(db_host, db_user, db_password, db_database, items)

# Display the first parsed entry as a quick sanity check.
items[0]

In [71]:
# Titles longer than 600 characters, as a tuple of (length, title) pairs.
pipe(items,
    functools.partial(map, lambda entry: entry.get('title')),
    functools.partial(map, lambda title: (len(title), title)),
    functools.partial(filter, lambda pair: pair[0] > 600),
    tuple)

((694,
  'Resolución de 30 de octubre de 2019, del Instituto Nacional de Investigación y Tecnología Agraria y Alimentaria O.A. M.P., por la que se publica el Convenio con la Universidad Politécnica de Madrid, el Centro de Investigaciones Energéticas, Medioambientales y Tecnológicas, O.A., M.P., y la Universidad Autónoma de Madrid, para regular las condiciones para la distribución presupuestaria de los fondos concedidos para la realización del Programa de actividades de I+D "Tecnología destinada a la sostenibilidad de los sistemas agrícolas", convocatoria de ayudas para la realización de programas de actividades de I + D entre grupos de investigación de la Comunidad de Madrid en Tecnologías 2018.'),
 (730,
  'Anuncio de licitación de: Subdirección de Compras de la Sociedad Estatal Correos y Telégrafos S.A. Objeto: Adquisición por la Sociedad Estatal Correos y Telégrafos, S.A.S.M.E. (Correos), de 4.750.000 mascarillas autofiltrantes para el Grupo Correos, constituido por las siguientes s