In [1]:
import collections
import datetime
import functools
import os
from typing import Callable, Set, Generator, Iterable

import httplib2
import pymysql
from lxml import etree as et

import boe
import helpers

In [2]:
# Connection settings for the MySQL database that stores the extracted entries.
# Every value can be overridden through an environment variable so that real
# credentials never need to be written into (or committed with) the notebook;
# the fallbacks are the original local-development values.
db_host = os.environ.get('BOE_DB_HOST', 'localhost')
db_database = os.environ.get('BOE_DB_DATABASE', 'boe')
db_user = os.environ.get('BOE_DB_USER', 'root')
db_password = os.environ.get('BOE_DB_PASSWORD', 'pass')


def use_db_settings(func, *args):
    """Call ``func`` with the configured connection settings prepended.

    ``func`` receives ``db_host, db_user, db_password, db_database`` (in that
    order) followed by ``*args``; whatever it returns is returned unchanged.
    The settings are read when the call happens, so re-running the
    configuration cell is picked up by later calls.
    """
    return func(db_host, db_user, db_password, db_database, *args)

# BOE summary entry extraction

## Processing of the diary

```
sumario/
  meta/
   |pub
   |fecha
   |fechaAnt
   |fechaSig
  sumario/
   |diario/
   |  sumario_nbo/
   |   |urlPdf
   |  seccion[1, 2, 2A, 2B, 3, 4, 5, 5A, 5B, 5C, T]/
   |   |departamento[1..*]/
   |   |  epigrafe[1..*]/
   |   |    epigrafeType/ item[1..*]
   |   |  item[1..*]/
   |   |    itemType/ titulo, urlPdf, urlHtm, urlXml, suplemento
        
```

In [3]:
def get_sections(tree) -> Generator:
    """Pair every section node found in ``tree`` with its section number.

    The section nodes are looked up as soon as the function is called; the
    ``(section_number, section_node)`` tuples themselves are produced lazily
    while the returned generator is consumed.
    """
    section_nodes = helpers.use_tree_for_search(tree)(boe.SummaryXpath.section)

    return (
        (section_node.get(boe.SummaryAttribute.section_number), section_node)
        for section_node in section_nodes
    )

def get_departments_per_section(sections) -> Generator:
    """Expand ``(section_number, section_node)`` pairs into one tuple per department.

    Yields ``(section_number, department_name, department_node)`` lazily, in
    the order the sections are supplied and the departments are found.
    """
    def find_departments(section_node):
        # Department lookup is scoped to a single section node.
        return helpers.use_tree_for_search(section_node)(boe.SummaryXpath.department)

    return (
        (number, department_node.get(boe.SummaryAttribute.department_name), department_node)
        for number, section_node in sections
        for department_node in find_departments(section_node)
    )

def get_items_per_department(departments) -> Generator:
    """Expand ``(section_number, department_name, department_node)`` tuples into one tuple per item.

    Yields ``(section_number, department_name, item_node)`` lazily; the
    department node itself is dropped once its items have been looked up.
    """
    def find_items(department_node):
        # Item lookup is scoped to a single department node.
        return helpers.use_tree_for_search(department_node)(boe.SummaryXpath.items)

    return (
        (number, name, item_node)
        for number, name, department_node in departments
        for item_node in find_items(department_node)
    )

def get_item_details(section_number:str, department_name:str, node) -> dict:
    """Flatten one summary item node into a plain dictionary.

    Args:
        section_number: number of the section the item belongs to.
        department_name: name of the department the item belongs to.
        node: the item node; it must provide ``get`` and ``getparent`` and be
            searchable through ``helpers.use_tree_for_search``.

    Returns:
        A dict with the keys ``id``, ``epigraph``, ``section``, ``department``,
        ``title``, ``pdf_url``, ``xml_url`` and ``htm_url``. ``epigraph`` is the
        parent's epigraph-name attribute when the item sits directly inside an
        ``epigrafe`` element, and ``''`` otherwise (including when the item has
        no parent at all).

    Raises:
        IndexError: if the title or any of the three URL lookups finds nothing.
    """
    search_details = helpers.use_tree_for_search(node)
    # Only the first match of each lookup is used.
    title_node = search_details(boe.SummaryXpath.item_title)[0]
    pdf_url_node = search_details(boe.SummaryXpath.item_pdf_url)[0]
    xml_url_node = search_details(boe.SummaryXpath.item_xml_url)[0]
    htm_url_node = search_details(boe.SummaryXpath.item_htm_url)[0]

    # Items hang either directly from a department or from an epigraph inside
    # it; only the latter carries an epigraph name.
    parent = node.getparent()
    is_epigraph = parent is not None and parent.tag.lower() == 'epigrafe'
    epigraph = parent.get(boe.SummaryAttribute.epigraph_name) if is_epigraph else ''

    return {
        'id': node.get(boe.SummaryAttribute.item_id),
        'epigraph': epigraph,
        'section': section_number,
        'department': department_name,
        'title': title_node.text,
        'pdf_url': pdf_url_node.text,
        'xml_url': xml_url_node.text,
        'htm_url': htm_url_node.text,
    }

def get_details_per_item(items) -> Generator:
    """Lazily map each item tuple to its details via ``get_item_details``.

    Every element of ``items`` is unpacked positionally into
    ``get_item_details``, so it is expected to be a
    ``(section_number, department_name, node)`` tuple.
    """
    return (get_item_details(*item_args) for item_args in items)

## Inserting into the database

In [4]:
# Parameterised INSERT: the driver binds and escapes every value, so quotes in
# titles cannot break the statement and no value is ever spliced into the SQL.
BOE_DIARY_ENTRY_INSERT = """
    INSERT INTO boe_diary_entry
    (id, date, title, section, department, epigraph, pdf_url, xml_url, htm_url)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
"""


def boe_diary_entry_params(item, date=None):
    """Build the parameter tuple for ``BOE_DIARY_ENTRY_INSERT`` from one item.

    Args:
        item: mapping with the keys produced by ``get_item_details``; missing
            keys yield ``None``, which the driver stores as SQL NULL.
        date: publication date (anything with ``isoformat()``). Defaults to
            the notebook-level ``DATE``.

    Returns:
        A 9-tuple in the column order of ``BOE_DIARY_ENTRY_INSERT``.
    """
    if date is None:
        # Backward-compatible fallback to the global set in the execution cell.
        date = DATE
    return (
        item.get('id'), date.isoformat(), item.get('title'),
        item.get('section'), item.get('department'), item.get('epigraph'),
        item.get('pdf_url'), item.get('xml_url'), item.get('htm_url'),
    )


def boe_diary_entry_query(item):
    """Render the INSERT for ``item`` as literal SQL text (legacy helper).

    Kept only so existing callers keep working; ``insert_items`` no longer
    uses it. Values are interpolated verbatim, so a quote character in any
    field yields invalid SQL and ``None`` becomes the text ``'None'``.
    Do not execute the result; use ``BOE_DIARY_ENTRY_INSERT`` together with
    ``boe_diary_entry_params`` instead. Reads the notebook-level ``DATE``.
    """
    return f"""
        INSERT INTO boe_diary_entry
        (id, date, title, section, department, epigraph, pdf_url, xml_url, htm_url)
        VALUES ('{item.get('id')}', '{DATE.isoformat()}', '{item.get('title')}', 
                '{item.get('section')}', '{item.get('department')}', '{item.get('epigraph')}',
                '{item.get('pdf_url')}', '{item.get('xml_url')}', '{item.get('htm_url')}');
    """


def insert_items(db_host, db_user, db_password, db_database, items, date=None):
    """Insert all ``items`` into ``boe_diary_entry`` in a single transaction.

    Args:
        db_host, db_user, db_password, db_database: MySQL connection settings.
        items: iterable of item-detail mappings (may be a one-shot generator).
        date: publication date stored with every row; defaults to the
            notebook-level ``DATE``.

    The transaction is committed explicitly and the connection is always
    closed, so the behaviour does not depend on what the connection's own
    context manager does in the installed PyMySQL version. If anything fails
    before the commit, the connection is closed without committing and the
    exception propagates.
    """
    # Materialise first: an empty batch needs no statement, and a generator
    # can only be consumed once.
    rows = [boe_diary_entry_params(entry, date) for entry in items]

    connection = pymysql.connect(host=db_host,
                                 user=db_user,
                                 password=db_password,
                                 database=db_database)
    try:
        if rows:
            with connection.cursor() as cursor:
                cursor.executemany(BOE_DIARY_ENTRY_INSERT, rows)
        connection.commit()
    finally:
        connection.close()

## Execution

In [5]:
# Publication date whose summary is processed by this run.
DATE = datetime.datetime(year=2020, month=4, day=28)

# Fetch the summary published for DATE and build a tree from the response.
summary_url = boe.summary_url_for_date(DATE)
response = helpers.fetch_page(summary_url)
tree = helpers.tree_from_response(response)

# Extraction stages, listed in the order they are handed to helpers.pipe;
# the result is materialised so it can be stored and inspected afterwards.
extraction_stages = (
    get_sections,
    get_departments_per_section,
    get_items_per_department,
    get_details_per_item,
)
items = list(helpers.pipe(tree, *extraction_stages))

# Store every extracted entry using the configured connection settings.
use_db_settings(insert_items, items)

items[0]

{'id': 'BOE-A-2020-4689',
 'epigraph': 'Estado de alarma. Servicios esenciales',
 'section': '1',
 'department': 'MINISTERIO DE SANIDAD',
 'title': 'Resolución de 24 de abril de 2020, de la Secretaría General de Sanidad, por la que se modifica el Anexo de la Orden SND/310/2020, de 31 de marzo, por la que se establecen como servicios esenciales determinados centros, servicios y establecimientos sanitarios.',
 'pdf_url': '/boe/dias/2020/04/28/pdfs/BOE-A-2020-4689.pdf',
 'xml_url': '/diario_boe/xml.php?id=BOE-A-2020-4689',
 'htm_url': '/diario_boe/txt.php?id=BOE-A-2020-4689'}

In [6]:
# Lists (length, title) pairs for every title longer than the threshold;
# an empty tuple means no title exceeds it.
TITLE_LENGTH_THRESHOLD = 600

helpers.pipe(
    items,
    functools.partial(map, lambda entry: entry.get('title')),
    functools.partial(map, lambda title: (len(title), title)),
    functools.partial(filter, lambda sized_title: sized_title[0] > TITLE_LENGTH_THRESHOLD),
    tuple,
)

()