In [1]:
import copy as copy
import os
from datetime import datetime, timedelta

import pdfquery
import requests
from lxml import html

In [2]:
# Reference report: the known values listed in `attrs` are searched for in
# this document to find where each field sits on the page.
template = pdfquery.PDFQuery("./template.pdf")
template.load()

# Example report used further down to try out the extraction.
pdf = pdfquery.PDFQuery("./example.pdf")
pdf.load()

In [3]:
# Obter bounding boxes a partir do pdf 'template' #
def enlargeBbox(bbox, pad_x=20, pad_y=2):
    """Grow a bounding box and format it as an ``"x0,y0,x1,y1"`` string.

    The result is the form interpolated into the ``:in_bbox('...')``
    selectors built elsewhere in this notebook.

    Args:
        bbox: Four numbers ``(x0, y0, x1, y1)``.
        pad_x: Amount subtracted from ``x0`` and added to ``x1`` (default 20).
        pad_y: Amount subtracted from ``y0`` and added to ``y1`` (default 2).

    Returns:
        The enlarged box as a comma-separated string ``"x0,y0,x1,y1"``.

    Raises:
        ValueError: If ``bbox`` does not contain exactly four values.
    """
    x0, y0, x1, y1 = bbox
    enlarged = (x0 - pad_x, y0 - pad_y, x1 + pad_x, y1 + pad_y)
    return ",".join(str(coord) for coord in enlarged)

# Values as they appear in the 'template' pdf.
# Each one is looked up in that document to obtain the bounding box of the
# corresponding field. Insertion order is kept as-is.
attrs = {
    # per region: total, then the "<region>_o" counterpart
    "norte": "16834", "norte_o": "803",
    "centro": "3789", "centro_o": "244",
    "LVT": "12473", "LVT_o": "387",
    "alentejo": "263", "alentejo_o": "1",
    "algarve": "380", "algarve_o": "15",
    "açores": "140", "açores_o": "15",
    "madeira": "90", "madeira_o": "0",

    # national figures
    "suspeitos": "334923",
    "confirmados": "33969",
    "nao_confirmados": "299318",
    "aguardam": "1636",
    "recuperados": "20526",
    "obitos": "1465",
    "contactos": "28088",

    # special cases, 4th page
    "UCI": "58",
    "internados": "421",
}

# attribute name -> "x0,y0,x1,y1" string; starts empty
bboxes = {}

# starts empty; NOTE(review): appears unused at module level in the visible cells
res = {}

def select_match(objs,query):
    """Return the first element of ``objs`` whose stripped text equals ``query``.

    Args:
        objs: Iterable of objects exposing a ``text`` attribute, which may
            be ``None``.
        query: Value to look for; compared as ``str(query)``.

    Returns:
        The first matching object, in iteration order.

    Raises:
        IndexError: If no element matches. The exception type is the one the
            previous ``matches[0]`` lookup produced; it now carries a message
            naming the value that was not found.
    """
    wanted = str(query)
    for obj in objs:
        text = obj.text
        # Elements without text can never match; skip them instead of failing.
        if text is not None and text.strip() == wanted:
            return obj
    raise IndexError(f"no element with text {wanted!r} found")

for attr, val in attrs.items():
    # Find the template element whose text is exactly this value, then store
    # its enlarged bounding box under the attribute's name.
    selector = f"*:contains('{val}')"
    matchObj = select_match(template.pq(selector), val)
    bboxes[attr] = bbox = enlargeBbox(matchObj.layout.bbox)

In [4]:
# Obter dados dado um pdf #

def get_values(pdf):
    """Extract every attribute listed in ``bboxes`` from one report.

    Args:
        pdf: Object exposing ``extract(query)``; in this notebook a
            ``pdfquery.PDFQuery``.

    Returns:
        The mapping returned by ``pdf.extract``. For every key with at least
        one match, the value is replaced by ``int`` of the stripped text of
        the first match after sorting the matches by ``len``. Keys without
        matches keep whatever ``extract`` returned for them.
    """
    last_page_attrs = ("UCI", "internados")

    # Page index 0: one bounding-box query per attribute, except the two
    # that are read from page index 3.
    extract_query = [('with_parent', 'LTPage[page_index="0"]')]
    extract_query += [
        (name, f":in_bbox('{box}')")
        for name, box in bboxes.items()
        if name not in last_page_attrs
    ]

    # Page index 3: the remaining two attributes.
    extract_query.append(('with_parent', 'LTPage[page_index="3"]'))
    extract_query += [
        (name, f":in_bbox('{bboxes[name]}')") for name in last_page_attrs
    ]

    # Run the query and reduce each non-empty match list to a single integer.
    found = pdf.extract(extract_query)
    for name, candidates in found.items():
        candidates.sort(key=len)
        if len(candidates):
            found[name] = int(candidates[0].text.strip())
    return found

In [5]:
# Output for the example pdf: shows the values extracted by get_values()
display(get_values(pdf))

{'norte': 16789,
 'norte_o': 795,
 'centro': 3753,
 'centro_o': 240,
 'LVT': 11493,
 'LVT_o': 370,
 'alentejo': 260,
 'alentejo_o': 1,
 'algarve': 372,
 'algarve_o': 15,
 'açores': 137,
 'açores_o': 15,
 'madeira': 91,
 'madeira_o': 0,
 'suspeitos': 328873,
 'confirmados': 32895,
 'nao_confirmados': 294112,
 'aguardam': 1866,
 'recuperados': 19869,
 'obitos': 1436,
 'contactos': 28064,
 'UCI': 58,
 'internados': 432}

In [6]:
# Fetch the Ministry of Health situation-report page.
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() keeps an HTTP error page from being parsed as if it
# were the report index.
page = requests.get('https://covid19.min-saude.pt/relatorio-de-situacao/', timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

# Read the clock once so both date strings come from the same instant
# (two separate reads could straddle midnight).
now = datetime.now()
todayStr = now.strftime('%d/%m/%Y')
yesterdayStr = (now - timedelta(1)).strftime('%d/%m/%Y')

In [7]:
tree = html.fromstring(page.content)
today_link = tree.xpath(f"//a[contains(text(), '{todayStr}')]")
all_links = tree.xpath("//*[@class='single_content']/ul/li/a")

# No link mentions today's date: label the update with yesterday's date
if(not len(today_link)):
    print("Getting yesterday's stats")
    todayStr = yesterdayStr

# Download the most recent pdfs (the first N_REPORTS links on the page)
N_REPORTS = 6
# The target directory may not exist on a fresh checkout.
os.makedirs("./latest", exist_ok=True)
pdf_links = [x.attrib['href'] for x in all_links[:N_REPORTS]]
for (i,link) in enumerate(pdf_links):
    print("downloading ",link)
    # The context manager releases the streamed connection; the status check
    # runs before the file is opened so an error page is never saved as a pdf.
    with requests.get(link, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(f"./latest/{i}.pdf", 'wb') as fd:
            for chunk in r.iter_content(2048):
                fd.write(chunk)

# Extract data from the pdfs.
# Only the files downloaded above are parsed, so a page listing fewer links
# cannot silently mix in stale files left over from an earlier run.
print("Parsing pdf's...")
pdfs = [pdfquery.PDFQuery(f"./latest/{x}.pdf") for x in range(len(pdf_links))]
print("Getting values...")
vals = [get_values(x) for x in pdfs]

downloading  https://covid19.min-saude.pt/wp-content/uploads/2020/06/96_DGS_boletim_20200606.pdf
downloading  https://covid19.min-saude.pt/wp-content/uploads/2020/06/95_DGS_boletim_20200605V2.pdf
downloading  https://covid19.min-saude.pt/wp-content/uploads/2020/06/94_DGS_boletim_20200604.pdf
downloading  https://covid19.min-saude.pt/wp-content/uploads/2020/06/93_DGS_boletim_20200603.pdf
downloading  https://covid19.min-saude.pt/wp-content/uploads/2020/06/92_DGS_boletim_20200602.pdf
downloading  https://covid19.min-saude.pt/wp-content/uploads/2020/06/91_DGS_boletim_20200601.pdf
Parsing pdf's...
Getting values...


In [8]:
# Group the values of the last days into one list per attribute.
# Lists follow the order of `vals`; the keys come from the first report.
_attrs = {k: [] for k in vals[0]}

for file_vals in vals:
    for attr, value in file_vals.items():
        _attrs[attr].append(value)

display(_attrs)

{'norte': [16855, 16834, 16819, 16804, 16789, 16760],
 'norte_o': [804, 803, 801, 796, 795, 791],
 'centro': [3799, 3789, 3770, 3765, 3753, 3747],
 'centro_o': [244, 244, 240, 240, 240, 239],
 'LVT': [12818, 12473, 12137, 11828, 11493, 11335],
 'LVT_o': [395, 387, 383, 380, 370, 363],
 'alentejo': [266, 263, 262, 260, 260, 259],
 'alentejo_o': [1, 1, 1, 1, 1, 1],
 'algarve': [382, 380, 376, 376, 372, 372],
 'algarve_o': [15, 15, 15, 15, 15, 15],
 'açores': [141, 140, 138, 138, 137, 137],
 'açores_o': [15, 15, 15, 15, 15, 15],
 'madeira': [90, 90, 90, 90, 91, 90],
 'madeira_o': [0, 0, 0, 0, 0, 0],
 'suspeitos': [337333, 334923, 333106, 331094, 328873, 326278],
 'confirmados': [34351, 33969, 33592, 33261, 32895, 32700],
 'nao_confirmados': [301169, 299318, 297773, 295889, 294112, 291858],
 'aguardam': [1813, 1636, 1741, 1944, 1866, 1720],
 'recuperados': [20807, 20526, 20323, 20079, 19869, 19552],
 'obitos': [1474, 1465, 1455, 1447, 1436, 1424],
 'contactos': [29013, 28088, 28685, 28093,

In [9]:
# Compute active cases per report: confirmed minus recovered minus deaths.
# zip() walks however many reports were collected instead of assuming
# exactly 6 entries per list.
_attrs['ativos'] = [
    confirmed - recovered - deaths
    for confirmed, recovered, deaths in zip(
        _attrs['confirmados'], _attrs['recuperados'], _attrs['obitos']
    )
]
print(_attrs['ativos'])

[12070, 11978, 11814, 11735, 11590, 11724]


In [19]:
#pretty print dos valores
def diff_str(val,end=""):
    """Format ``val`` with an explicit sign and an optional suffix.

    Args:
        val: Number to format.
        end: Suffix appended to the result. When it is ``"%"`` the value is
            rounded to 2 decimal places before formatting.

    Returns:
        ``val`` as text, prefixed with ``"+"`` when ``val >= 0`` (negative
        values keep their own ``"-"``), followed by ``end``.
    """
    shown = round(val, 2) if end == "%" else val
    # The sign is decided on the original value, not the rounded one.
    sign = "+" if val >= 0 else ""
    return sign + str(shown) + end

def latest(k):
    """Return the first (index 0) value of series ``k`` in ``_attrs`` as text."""
    newest = _attrs[k][0]
    return str(newest)
def var(k):
    """Return the signed difference between entries 0 and 1 of series ``k``."""
    newest, previous = _attrs[k][0], _attrs[k][1]
    return diff_str(newest - previous)
def var_d(k,days=1):
    """Percentage change of series ``k`` between entry 0 and entry ``days``.

    Args:
        k: Key into ``_attrs``.
        days: Index of the entry used as the baseline (default 1).

    Returns:
        A signed percentage string built by ``diff_str``, or ``"--"`` when
        the baseline entry is 0 (the ratio is undefined).
    """
    baseline = _attrs[k][0+days]
    if baseline == 0:
        return "--"
    change = _attrs[k][0] - baseline
    return diff_str(change*100/baseline, end="%")


def row_str(attr):
    """Build one pipe-delimited table row for ``attr``.

    Cells, in order: latest value, absolute variation, then the percentage
    change against entries 1, 3 and 5 of the series.
    """
    cells = [
        latest(attr),
        var(attr),
        var_d(attr,1),
        var_d(attr,3),
        var_d(attr,5),
    ]
    return "|" + "|".join(cells) + "|"
    

def aumento():
    """New confirmed cases as a percentage of the current active cases.

    Returns:
        A signed percentage string such as ``"+3.16%"``, or ``"--"`` when
        there are no active cases (the same placeholder ``var_d`` returns
        for a zero denominator).
    """
    confirmados = _attrs['confirmados']
    # Work on the numbers directly instead of parsing the formatted strings
    # produced by var()/latest() back into integers.
    novos = confirmados[0] - confirmados[1]
    ativos = _attrs['ativos'][0]
    if ativos == 0:
        # Avoid ZeroDivisionError; the ratio is undefined without active cases.
        return "--"
    diff_p = novos*100/ativos
    return diff_str(diff_p,end="%")

# Markdown text of the daily update. Every {...} placeholder is evaluated
# when this statement runs, so `_attrs` (including 'ativos') and `todayStr`
# must already be populated. row_str() supplies the value cells of each table
# row; the region tables prepend the region name to that row.
template_txt = f"""

# ATUALIZAÇÃO DIÁRIA - {todayStr}

---

|👥 Totais|Variação|📈 1 dia|📈 3 dias|📈 5 dias|
:--|:--|:--|:--|:--|
{row_str('confirmados')}
|**✔️ Recuperados**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
{row_str('recuperados')}
|**☠️ Óbitos**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
{row_str('obitos')}
|**🏥 Internados**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
{row_str('internados')}
|🛌 **UCI**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
{row_str('UCI')}
|😷 **Ativos**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
{row_str('ativos')}

---

|📊 **Aumento de Novos Casos face a Casos Ativos:**|
:--|
| {aumento()}|

---

## Por região

**Casos Confirmados**

|Região|👥 Totais|Variação|📈 1 dia|📈 3 dias|📈 5 dias|
:--|:--|:--|:--|:--|:--|
|**Norte**{row_str('norte')}
|**Centro**{row_str('centro')}
|**LVT**{row_str('LVT')}
|**Alentejo**{row_str('alentejo')}
|**Algarve**{row_str('algarve')}
|**Açores**{row_str('açores')}
|**Madeira**{row_str('madeira')}

**Óbitos**

|Região|**☠️ Óbitos**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
:--|:--|:--|:--|:--|:--|
|**Norte**{row_str('norte_o')}
|**Centro**{row_str('centro_o')}
|**LVT**{row_str('LVT_o')}
|**Alentejo**{row_str('alentejo_o')}
|**Algarve**{row_str('algarve_o')}
|**Açores**{row_str('açores_o')}
|**Madeira**{row_str('madeira_o')}

---

**Dados obtidos automaticamente do site da DGS**

[Código fonte disponível aqui](https://github.com/binarybreakpoint/dgs-script)
"""


# Print the rendered text so it can be copied from the cell output.
print(template_txt)



# ATUALIZAÇÃO DIÁRIA - 06/06/2020

---

|👥 Totais|Variação|📈 1 dia|📈 3 dias|📈 5 dias|
:--|:--|:--|:--|:--|
|34351|+382|+1.12%|+3.28%|+5.05%|
|**✔️ Recuperados**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
|20807|+281|+1.37%|+3.63%|+6.42%|
|**☠️ Óbitos**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
|1474|+9|+0.61%|+1.87%|+3.51%|
|**🏥 Internados**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
|414|-7|-1.66%|-3.27%|-12.1%|
|🛌 **UCI**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
|57|-1|-1.72%|+1.79%|-10.94%|
|😷 **Ativos**|**Variação**|**📈 1 dia**|**📈 3 dias**|**📈 5 dias**|
|12070|+92|+0.77%|+2.85%|+2.95%|

---

|📊 **Aumento de Novos Casos face a Casos Ativos:**|
:--|
| +3.16%|

---

## Por região

**Casos Confirmados**

|Região|👥 Totais|Variação|📈 1 dia|📈 3 dias|📈 5 dias|
:--|:--|:--|:--|:--|:--|
|**Norte**|16855|+21|+0.12%|+0.3%|+0.57%|
|**Centro**|3799|+10|+0.26%|+0.9%|+1.39%|
|**LVT**|12818|+345|+2.77%|+8.37%|+13.08%|
|**Alentejo**|266|+3|+1.14%|+2.31%|+2.7