# Covid-19 statistics for Cyprus

In [2094]:
import numpy as np
import pandas as pd
import plotly.express as px
from datetime import datetime, timedelta
import urllib.parse
import requests

### Get data

In [699]:
#url_rapid_tests = "https://www.data.gov.cy/sites/default/files/CY%20Covid19%20-%20AntigenTests%20Data%20-%20New_124.csv"
#df_r = pd.read_csv(url_rapid_tests)

url_daily_stats = "https://www.data.gov.cy/sites/default/files/CY%20Covid19%20Open%20Data%20-%20Extended%20-%20new_242.csv"
df_s = pd.read_csv(url_daily_stats)

url_vaccination =  "https://www.data.gov.cy/sites/default/files/CY%20Vaccination%20Data%20by%20Target%20Group_18.csv"
df_v1 = pd.read_csv(url_vaccination)



In [700]:
df_v1.tail(45)

Unnamed: 0,YearWeekISO,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,SecondDose,DoseAdditional1,District,TargetGroup,Vaccine,Population
1757,2021-W50,,0,0.0,0,0,0,ALL,LTCF,AZ,888005
1758,2021-W51,888005.0,0,0.0,323,357,5937,ALL,ALL,MOD,888005
1759,2021-W51,888005.0,0,0.0,1075,2886,18296,ALL,ALL,COM,888005
1760,2021-W51,888005.0,0,0.0,2,4,0,ALL,ALL,AZ,888005
1761,2021-W51,888005.0,0,0.0,52,545,0,ALL,ALL,JANSS,888005
1762,2021-W51,46307.0,0,0.0,250,365,0,ALL,Age10_14,COM,888005
1763,2021-W51,46307.0,0,0.0,0,0,0,ALL,Age10_14,AZ,888005
1764,2021-W51,46307.0,0,0.0,0,0,0,ALL,Age10_14,JANSS,888005
1765,2021-W51,46307.0,0,0.0,32,2,0,ALL,Age10_14,MOD,888005
1766,2021-W51,28169.0,0,0.0,0,0,0,ALL,Age15_17,AZ,888005


In [702]:
df_s.tail(20)

Unnamed: 0,date,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Incubated Cases,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,total cases,total deaths,total PCR tests,total RA tests,total tests,Notes
649,18/12/2021,508,2.0,177,59,21,21,9501,66343.0,75844.0,144713,619,2481918,15653848.0,18135766.0,
650,19/12/2021,448,1.0,175,60,24,22,9415,50413.0,59828.0,145161,620,2491333,15704261.0,18195594.0,
651,20/12/2021,835,3.0,179,60,24,23,11586,94897.0,106483.0,145996,623,2502919,15799158.0,18302077.0,
652,21/12/2021,806,0.0,176,59,28,27,10815,76716.0,87531.0,146802,623,2513734,15875874.0,18389608.0,
653,22/12/2021,883,1.0,172,64,28,26,10295,93308.0,103603.0,147685,624,2524029,15969182.0,18493211.0,
654,23/12/2021,978,1.0,162,66,29,25,12269,80964.0,93233.0,148663,625,2536298,16050146.0,18586444.0,
655,24/12/2021,917,2.0,168,62,28,22,10963,112532.0,123495.0,149580,627,2547261,16162678.0,18709939.0,
656,25/12/2021,268,1.0,157,63,28,24,7073,9158.0,16231.0,149848,628,2554334,16171836.0,18726170.0,
657,26/12/2021,912,0.0,162,74,31,26,4535,49180.0,53715.0,150760,628,2558869,16221016.0,18779885.0,
658,27/12/2021,1925,2.0,161,78,31,27,12917,108210.0,121127.0,152685,630,2571786,16329226.0,18901012.0,


### Cases

In [954]:
df_s1 = df_s.copy()

df_s1["date"] = pd.to_datetime(df_s1["date"], format= "%d/%m/%Y") 
df_s1 = df_s1.replace(":", np.nan)

In [1091]:
px.scatter(
    df_s1,
    x="date",
    y="daily new cases",
    labels={"date":"Date", "daily new cases":"Cases"},
    width=900,
    height=600,
    title='Daily new cases')\
    .update_traces(mode='lines+markers')

### 7-day moving average

In [956]:
# 7-days moving average
df_s1["cases_7days"] = df_s1["daily new cases"].rolling(window=7, center=False).mean() 

px.scatter(
    df_s1,
    x="date",
    y="cases_7days",
    labels={"date":"Date", "cases_7days":"Cases"},
    width=600,
    height=400,
    title='Cases (7-days moving average)')\
    .update_traces(mode='lines', 
                   line = dict(color='firebrick', width=5))

#    .update_traces(mode='lines+markers', 
#                   line = dict(color='firebrick', width=5),
#                   marker=dict(size=0,color="firebrick"))

### Deaths

In [957]:
# Fill NaN with zeros 
df_s1["daily deaths"] = df_s1["daily deaths"].fillna(0) 

In [958]:
# 7-days moving average
df_s1["deaths_7days"] = df_s1["daily deaths"].rolling(window=7, center=False).mean() 

px.scatter(
    df_s1,
    x="date",
    y="deaths_7days",
    labels={"date":"Date", "deaths_7days":"Deaths"},
    width=600,
    height=400,
    title='Deaths (7-days moving average)')\
    .update_traces(mode='lines', 
                   line = dict(color='firebrick', width=3))

# Vaccinations per age

Download vaccination data

In [1856]:
# Download and save raw data

url_vaccination =  "https://www.data.gov.cy/sites/default/files/CY%20Vaccination%20Data%20by%20Target%20Group_18.csv"
df_v1 = pd.read_csv(url_vaccination)

last_week = df_v1["YearWeekISO"].iloc[-1]
df_v1.to_csv(f"./data/vaccination_dataset_until_{last_week}.csv")

In [1859]:
# Load local data
df_v1 = pd.read_csv(glob.glob("./data/vaccination_dataset_until_*.csv")[0], index_col=[0])

In [1863]:
# Actual Plot

fig = px.bar(
    df_grp,
    x="target_group",
    y="%_diff_prev_categ",
    color="vacc_category",
    color_discrete_map={
                "Boosted" : "#0C456D",
                "Fully vaccinated" : "#316F9A",
                "At least one dose" : "#79A9CB",
                "Unvaccinated" : "#D22727"},
    labels={"%_diff_prev_categ" : 'Percentage', "vacc_category" : "Category", "target_group": "Age group"},
    custom_data = ["vacc_category", "%_vaccinations"],
    width=800,
    height=600,
    title="Cyprus immunity wall"
)
#hover_data={"%_vaccinations" : ":.0f",
#           "%_diff_prev_categ" : False,
#           "n_stack_bar": False},



fig.update_traces(
    hovertemplate="<b>%{customdata[0]}</b> %{customdata[1]:.0f}%<extra></extra>",
    #width=[0.6]*len(target_groups)
)

fig.update_layout(
    font=dict(size=15, color="#2F2E31"),
    xaxis = dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4, 5, 6, 7, 8],
        ticktext=['<b>All<br>ages</b>'] + list(df_grp["target_group"].unique()[1:]),
        range=[-1,9]
    ),
    legend={'traceorder':'reversed'},
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    hoverlabel_font=dict(color="white"),
    bargap=0.25
)
fig.show()

In [426]:
# Hovering tricks 

# https://plotly.com/python/hover-text-and-formatting/#hover-labels
# https://stackoverflow.com/questions/59057881/python-plotly-how-to-customize-hover-template-on-with-what-information-to-show 

## Create database with urls for the daily reports (pdf)

In [1113]:
# Get all urls for files in the PIO website with corona announcements (PDF)

from bs4 import BeautifulSoup
import requests

def find_files():
    url = "https://www.pio.gov.cy/coronavirus/categories/press#30"
    soup = BeautifulSoup(requests.get(url).text)

    hrefs = []

    for a in soup.find_all('a'):
        hrefs.append(a['href'])

    return hrefs

list_of_links = find_files()

## show what you've found:
#for link in list_of_links:
#    print(link)

#28
#29
#30
#31
#32
#33
#34
#35
#36
#37
https://www.pio.gov.cy/coronavirus/uploads/08012022--Δελτίο Τύπου - Ανακοίνωση Υπουργείου Υγείας για αύξηση σημείων δειγματοληψίας.pdf
https://www.pio.gov.cy/coronavirus/uploads/Σημεία δειγματοληψίας για διενέργεια rapid test μέσω του προγράμματος του Υπουργείου Υγείας – 10 Ιανουαρίου 2022.pdf
https://www.pio.gov.cy/coronavirus/uploads/06012021_diefkriniseisneametraEL.pdf
https://www.pio.gov.cy/coronavirus/uploads/06012022--Δελτίο Τύπου - Ανακοίνωση Υπουργού Υγείας Μιχάλη Χατζηπαντέλα για το κόστος και την αξιοπιστία των εξετάσεων PCR και Rapid test.pdf
https://www.pio.gov.cy/coronavirus/uploads/05012022--Δελτίο Τύπου - Φαρμακεία που θα είναι ανοιχτά για διενέργεια rapid test στις 6 Ιανουαρίου.pdf
https://www.pio.gov.cy/coronavirus/uploads/05012022--Δήλωση του Υπουργού Υγείας για τις αποφάσεις του Υπουργικού Συμβουλίου.pdf
https://www.pio.gov.cy/coronavirus/uploads/04012022--Ανακοίνωση - Αναθεωρημένες ώρες λειτουργίας Εμβολιαστικού Κέντρου Αμμοχώ

In [1775]:
# Isolate only those urls corresponding to daily reports 

substrings_in_path = [
    "Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19",
    "Ανακοίνωση για επιβεβαίωση κρουσμάτων κορωνοϊού",
    "anakoinosikrousmata",
]
links_daily_reports = [link for link in list_of_links \
                       if any(substring in link for substring in substrings_in_path)]                       

for link in links_daily_reports:
    print(link)

https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 9 Ιανουαρίου 2022.pdf
https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 8 Ιανουαρίου 2022.pdf
https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 7 Ιανουαρίου 2022.pdf
https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 6 Ιανουαρίου 2022.pdf
https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 5 Ιανουαρίου 2022.pdf
https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 4 Ιανουαρίου 2022.pdf
https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 

In [None]:
map_month = {
    "Ιανουαρίου" : 1,
    "Φεβρουαρίου" : 2,
    "Μαρτίου" : 3,
    "Απριλίου" : 4,
    "Μαΐου" : 5,
    "Ιουνίου" : 6,
    "Ιουλίου" : 7,
    "Αυγούστου" : 8,
    "Σεπτεμβρίου" : 9,
    "Οκτωβρίου" : 10,
    "Νοεμβρίου" : 11,
    "Δεκεμβρίου" : 12,
}

def month_key_in_string(date_string):
    months_list = [month if month in date_string else None for month in map_month.keys()] # None except on matching month
    month_key = list(filter(None, months_list)) # remove None
    return month_key[0]

def replace_month_with_number(date_string):
    month_key = month_key_in_string(date_string)
    new_string = date_string.replace(month_key, str(map_month[month_key]))
    return new_string

def extract_datetime(link):
    if "Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19" in link:
        date_str_gr = link.split("– ")[1].split(".")[0]
        date1 = replace_month_with_number(date_str_gr) # defined above
        dt_report = datetime.strptime(date1, "%d %m %Y")
    else:
        date1 =  link.split("uploads/")[1].split("--")[0]
        dt_report = datetime.strptime(date1, "%d%m%Y")# .date()
        
    return dt_report

def percent_encode_url(url):
    """ encode utf-8 greek charachters to percent-encoded """
    url_percent_encoded = url.split("uploads/")[0] \
        + "uploads/" \
        + urllib.parse.quote(url.split("uploads/")[1].encode('utf-8'))
    return url_percent_encoded


In [None]:
# Extract date as datetime from url
dt_reports = [extract_datetime(link) for link in links_daily_reports]

# Check if link is valid (.ok -> True if link valid, False if not)
valid_url = [requests.get(percent_encode_url(url)).ok for url in links_daily_reports]

# Get all urls as percent-encoded
links_perc_encoded = [percent_encode_url(url) for url in links_daily_reports]

In [1776]:
# Add date and url in a dataframe
db_url_reports = pd.DataFrame({
    "date" : dt_reports,
    "url" : links_daily_reports,
    "url_perc" : links_perc_encoded,
    "valid_url_perc" : valid_url
})

# Sort by date
db_url = db_url_reports.sort_values("date")

# Save as csv
db_url.to_csv("./data/database_url_reports.csv")

## Download daily reports

* Only since July 16, 2021
    * This is when vaccination status was first included

In [2379]:
df_url = pd.read_csv("./data/database_url_reports.csv")
df_url["url"].loc[398:400].values

array(['https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 4 Δεκεμβρίου 2021.pdf',
       'https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 4 Δεκεμβρίου 2021.pdf',
       'https://www.pio.gov.cy/coronavirus/uploads/Ανακοίνωση του Υπουργείου Υγείας σχετικά με νέα περιστατικά της νόσου COVID-19 – 6 Δεκεμβρίου 2021.pdf'],
      dtype=object)

In [2387]:
#df_url["date"].replace("-","_",regex=True)

In [1232]:
import time 

# Load urls
df_url = pd.read_csv("./data/database_url_reports.csv")
#df_url

# Download only those after July 16, 2022 (when vaccination status is reported)

df_url_v1 = df_url.query("date >= '2021-07-16'")

for index, row in df_url_v1.iterrows():
    date_str = row["date"]
    print (f"Downloading report (pdf) of {date_str}...")
    filename_report = row["date"].replace("-","_")
    path_report = "./data/reports/" + filename_report + ".pdf"
    #print(url_percent_encoded)

    # download and save report as pdf
    response = requests.get(row["url_perc"])
    with open(path_report, 'wb') as f:
        f.write(response.content)
    print(f"Valid url: {response.ok}")

    response.close()
    #time.sleep(1)
    
    #break
        

Downloading report (pdf) of 2021-07-16...
Valid url: True
Downloading report (pdf) of 2021-07-17...
Valid url: True
Downloading report (pdf) of 2021-07-18...
Valid url: True
Downloading report (pdf) of 2021-07-19...
Valid url: True
Downloading report (pdf) of 2021-07-20...
Valid url: True
Downloading report (pdf) of 2021-07-21...
Valid url: True
Downloading report (pdf) of 2021-07-22...
Valid url: True
Downloading report (pdf) of 2021-07-23...
Valid url: True
Downloading report (pdf) of 2021-07-24...
Valid url: True
Downloading report (pdf) of 2021-07-25...
Valid url: True
Downloading report (pdf) of 2021-07-26...
Valid url: True
Downloading report (pdf) of 2021-07-27...
Valid url: True
Downloading report (pdf) of 2021-07-28...
Valid url: True
Downloading report (pdf) of 2021-07-29...
Valid url: True
Downloading report (pdf) of 2021-07-30...
Valid url: True
Downloading report (pdf) of 2021-07-31...
Valid url: True
Downloading report (pdf) of 2021-08-01...
Valid url: True
Downloading re

Valid url: True
Downloading report (pdf) of 2021-12-04...
Valid url: True
Downloading report (pdf) of 2021-12-06...
Valid url: True
Downloading report (pdf) of 2021-12-07...
Valid url: True
Downloading report (pdf) of 2021-12-08...
Valid url: True
Downloading report (pdf) of 2021-12-09...
Valid url: True
Downloading report (pdf) of 2021-12-10...
Valid url: True
Downloading report (pdf) of 2021-12-11...
Valid url: True
Downloading report (pdf) of 2021-12-12...
Valid url: True
Downloading report (pdf) of 2021-12-13...
Valid url: True
Downloading report (pdf) of 2021-12-14...
Valid url: True
Downloading report (pdf) of 2021-12-15...
Valid url: True
Downloading report (pdf) of 2021-12-16...
Valid url: True
Downloading report (pdf) of 2021-12-17...
Valid url: True
Downloading report (pdf) of 2021-12-18...
Valid url: True
Downloading report (pdf) of 2021-12-19...
Valid url: True
Downloading report (pdf) of 2021-12-20...
Valid url: True
Downloading report (pdf) of 2021-12-21...
Valid url: Tru

## Get vaccination status from report (pdf)

In [None]:
# !pip install pdfminer
import io

from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

import re

In [1593]:
map_numbers = {
    "Ένα" : "1",
    "Δύο" : "2",
    "Τρία" : "3", 
    "Τέσσερα" : "4",
    "Πέντε" : "5",
    "Έξι" : "6",
    "Επτά" : "7",
    "Εφτά" : "7",
    "Οχτώ" : "8",
    "Εννιά" : "9"
}

def pdf_to_text(path):
    with open(path, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        outfp = io.StringIO()
        laparams = LAParams()
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
    text = outfp.getvalue()
    
    return text

def number_key_in_string(date_string):
    numbers_list = [number if number in date_string else None for number in map_numbers.keys()] # None except on matching month
    number_key = list(filter(None, numbers_list)) # remove None
    return number_key[0]

def extract_info_text(text):
    m1 = re.search('Ποσοστό (\s*.+?)%', text) # "\s*" ignores white space
    perc_unv = m1.group(1) if m1 else None

    m2 = re.search('σήμερα, (.+?):', text)
    date_pdf = m2.group(1) if m2 else None

    m3 = re.search("(\w+?) ασθενείς COVID-19 νοσηλεύονται", text) 
    hosp_num = m3.group(1) if m3 else None
    
    #m4 = re.search("ανίχνευσης αντιγόνου \(antigen rapid test\). -  (.+?) \(ποσοστό θετικότητας", text) 
    #case_num = m4.group(1) if m4 else None
 
    m4 = re.search("test\.(.+?) ποσοστό", text) 
    case_num = m4.group(1) if m4 else None
    if case_num is not None:
        case_num = case_num.split("-")[1].strip()
    
    m5 = re.search("- (.+?) άτομ", text)
    death_num = m5.group(1) if m5 else "0"
    if (death_num != "0"):
        if(len(death_num.strip()) > 1): 
            word_key = number_key_in_string(death_num)
            death_num = map_numbers[word_key]
    #print(death_num)
  
    
    info = [perc_unv, date_pdf, hosp_num, case_num, death_num]
    
    return info

In [1831]:
# Extract info from text 

dates_url = []
percs_unv = []
dates_pdf = []
hosps_num = []
cases_num = []
deaths_num = []

for report in paths_reports:
    date_url = report.split("reports/")[1].split(".pdf")[0].replace("_", "-")
    dates_url.append(date_url)
    text = pdf_to_text(report)
    text = text.replace("\n","").replace("\xa0","").replace("(","").replace(")", "")
    
    info = extract_info_text(text)
    percs_unv.append(info[0])
    dates_pdf.append(info[1])
    hosps_num.append(info[2])
    cases_num.append(info[3])
    deaths_num.append(info[4])
    
    print(f"Date:{info[1]}({date_url}): {info[2]} hosp ({info[0]}% unvacc)")
    print(f"{info[3]} new cases, {info[4]} deaths")
    
    
    if info[0] is None:
        print(text)
        break

Date:11 Σεπτεμβρίου 2021(2021-09-11): 106 hosp (83,97% unvacc)
136 new cases, 1 deaths
Date:14 Αυγούστου 2021(2021-08-14): 244 hosp (81,97% unvacc)
401 new cases, 1 deaths
Date:1 Ιανουαρίου 2022(2022-01-01): 187 hosp (85,03% unvacc)
2,332 new cases, 0 deaths
Date:9 Οκτωβρίου 2021(2021-10-09): 60 hosp ( 60% unvacc)
134 new cases, 0 deaths
Date:26 Ιουλίου 2021(2021-07-26): 270 hosp ( 85,5% unvacc)
851 new cases, 1 deaths
Date:28 Νοεμβρίου 2021(2021-11-28): 114 hosp ( 65.79% unvacc)
337 new cases,  2 deaths
Date:1 Οκτωβρίου 2021(2021-10-01): 73 hosp (61,85% unvacc)
59 new cases, 2 deaths
Date:29 Δεκεμβρίου 2021(2021-12-29): 174 hosp (82,76% unvacc)
3002 new cases,  5 deaths
Date:30 Οκτωβρίου 2021(2021-10-30): 58 hosp ( 63,8% unvacc)
155 new cases, 3 deaths
Date:29 Σεπτεμβρίου 2021(2021-09-29): 81 hosp (67,86% unvacc)
123 new cases, 0 deaths
Date:4 Αυγούστου 2021(2021-08-04): 273 hosp (89,02% unvacc)
575 new cases, 1 deaths
Date:2 Αυγούστου 2021(2021-08-02): 287 hosp (88,62% unvacc)
493 ne

Date:6 Δεκεμβρίου 2021(2021-12-06): 116 hosp ( 75% unvacc)
706 new cases,  1 deaths
Date:10 Νοεμβρίου 2021(2021-11-10): 72 hosp (65,28% unvacc)
275 new cases, 1 deaths
Date:3 Δεκεμβρίου 2021(2021-12-03): 111 hosp ( 69.37% unvacc)
594 new cases,  1 deaths
Date:2 Νοεμβρίου 2021(2021-11-02): 70 hosp ( 60% unvacc)
249 new cases, 1 deaths
Date:21 Σεπτεμβρίου 2021(2021-09-21): 94 hosp (73,41% unvacc)
117 new cases, 0 deaths
Date:26 Οκτωβρίου 2021(2021-10-26): 57 hosp (63,16% unvacc)
162 new cases, 0 deaths
Date:12 Δεκεμβρίου 2021(2021-12-12): 146 hosp ( 77.4% unvacc)
393 new cases,  2 deaths
Date:15 Αυγούστου 2021(2021-08-15): 240 hosp (82,09% unvacc)
165 new cases, 0 deaths
Date:11 Νοεμβρίου 2021(2021-11-11): 80 hosp ( 62,5% unvacc)
264 new cases, 0 deaths
Date:19 Δεκεμβρίου 2021(2021-12-19): 175 hosp ( 79,32% unvacc)
448 new cases,  1 deaths
Date:2 Δεκεμβρίου 2021(2021-12-02): 112 hosp ( 66.08% unvacc)
538 new cases, 0 deaths
Date:16 Οκτωβρίου 2021(2021-10-16): 51 hosp (66,67% unvacc)
126 

In [1832]:
# Turn percentages to float 
percs_unv1 = [float(perc.replace(",", ".")) for perc in percs_unv]
cases_num1 = [float(num.replace(",", "")) for num in cases_num]

# Put info in df
df_hosp_unv = pd.DataFrame({
    "date" : dates_url,
    "hospitalizations_dailyrep" : hosps_num,
    "perc_hosp_unvaccinated" : percs_unv1,
    "daily new cases" : cases_num1,
    "daily deaths" : deaths_num,
})

df_hosp_unv["perc_hosp_vaccinated"] = 100 - df_hosp_unv["perc_hosp_unvaccinated"].values
df_hosp_unv["date"] = pd.to_datetime(df_hosp_unv["date"], format="%Y-%m-%d")
df_hosp_unv["hospitalizations_dailyrep"] = df_hosp_unv["hospitalizations_dailyrep"].astype(float)
df_hosp_unv["daily deaths"] = df_hosp_unv["daily deaths"].astype(float)
df_hosp_unv

Unnamed: 0,date,hospitalizations_dailyrep,perc_hosp_unvaccinated,daily new cases,daily deaths,perc_hosp_vaccinated
0,2021-09-11,106.0,83.97,136.0,1.0,16.03
1,2021-08-14,244.0,81.97,401.0,1.0,18.03
2,2022-01-01,187.0,85.03,2332.0,0.0,14.97
3,2021-10-09,60.0,60.00,134.0,0.0,40.00
4,2021-07-26,270.0,85.50,851.0,1.0,14.50
...,...,...,...,...,...,...
172,2021-12-28,168.0,80.96,2241.0,0.0,19.04
173,2021-08-09,278.0,88.13,446.0,3.0,11.87
174,2021-09-14,103.0,79.62,126.0,2.0,20.38
175,2021-11-09,71.0,63.39,266.0,0.0,36.61


In [1833]:
# Add missing values
# 5.12.2022 -> the uploaded pdf for this day is the same as the one for the day before.
# Get actual data from : 
# https://www.pio.gov.cy/%CE%B1%CE%BD%CE%B1%CE%BA%CE%BF%CE%B9%CE%BD%CF%89%CE%B8%CE%AD%CE%BD%CF%84%CE%B1-%CE%AC%CF%81%CE%B8%CF%81%CE%BF.html?id=24586#flat 
df_data_2021_12_05 = pd.DataFrame({
    "date" : datetime.strptime('2021-12-05', '%Y-%m-%d'),
    "hospitalizations_dailyrep" : 119,
    "perc_hosp_unvaccinated" : 68.91, 
    "daily new cases" : 307,
    "daily deaths" : 0,
    "perc_hosp_vaccinated" : 100 - 68.91
}, index=[0])

df_hosp_unv = pd.concat([df_hosp_unv, df_data_2021_12_05], axis=0)
df_hosp_unv = df_hosp_unv.sort_values("date") # sort
df_hosp_unv

Unnamed: 0,date,hospitalizations_dailyrep,perc_hosp_unvaccinated,daily new cases,daily deaths,perc_hosp_vaccinated
58,2021-07-16,190.0,89.50,953.0,1.0,10.50
160,2021-07-17,203.0,90.20,928.0,0.0,9.80
143,2021-07-18,218.0,91.30,858.0,1.0,8.70
85,2021-07-19,228.0,91.20,1056.0,5.0,8.80
115,2021-07-20,231.0,90.00,995.0,1.0,10.00
...,...,...,...,...,...,...
171,2022-01-05,205.0,80.00,5202.0,4.0,20.00
70,2022-01-06,235.0,77.45,3777.0,0.0,22.55
137,2022-01-07,230.0,77.83,5244.0,0.0,22.17
65,2022-01-08,244.0,77.05,3959.0,4.0,22.95


In [1834]:
# Save as csv
last_date = str(df_hosp_unv["date"].iloc[-1].date()).replace("-", "_")
df_hosp_unv.to_csv(f"./data/hosp_per_vacc_until_{last_date}.csv")

## Download dataset with info for all days

In [1726]:
# Data from pio 
url_daily_since_start = "https://www.data.gov.cy/sites/default/files/CY%20Covid19%20Open%20Data%20-%20Extended%20-%20new_242.csv"
df_k = pd.read_csv(url_daily_since_start)

# Format DF 
df_k1 = df_k.copy()
df_k1 = df_k1.replace(":", np.nan)
df_k1["date"] = pd.to_datetime(df_k1["date"], format= "%d/%m/%Y") 
df_k1 = df_k1.sort_values("date")
last_date_group =  df_k1["date"].iloc[-1].date()
print(last_date_pio)

# Save dataset
df_k1.to_csv(f"./data/dataset_raw_until_{last_date_group}.csv")
df_k1


2022-01-06


Unnamed: 0,date,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Incubated Cases,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,total cases,total deaths,total PCR tests,total RA tests,total tests,Notes
0,2020-03-09,2,,,,,,,,,2,0,,,,
1,2020-03-10,1,,,,,,,,,3,0,,,,
2,2020-03-11,3,,,,,,,,,6,0,,,,
3,2020-03-12,0,,,,,,,,,6,0,,,,
4,2020-03-13,8,,,,,,,,,14,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,2022-01-02,3538,1.0,203,84,38,33,7912,83570.0,91482.0,172697,639,2642705,16889926.0,19532631.0,
665,2022-01-03,5024,2.0,201,75,36,32,13840,150518.0,164358.0,177721,641,2656545,17040444.0,19696989.0,
666,2022-01-04,5457,5.0,209,76,36,32,18576,104225.0,122801.0,183178,646,2675121,17144669.0,19819790.0,
667,2022-01-05,5202,4.0,205,70,31,28,14588,115488.0,130076.0,188380,650,2689709,17260157.0,19949866.0,


### Merge raw dataset with information (extra dates + vaccination status) from individual daily reports

In [1814]:
# Load raw dataset
df_k1 = pd.read_csv(glob.glob("./data/dataset_raw_until_*.csv")[0],  index_col=[0])
df_k1["date"] = pd.to_datetime(df_k1["date"], format= "%Y-%m-%d") 
df_k1


Unnamed: 0,date,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Incubated Cases,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,total cases,total deaths,total PCR tests,total RA tests,total tests,Notes
0,2020-03-09,2,,,,,,,,,2,0,,,,
1,2020-03-10,1,,,,,,,,,3,0,,,,
2,2020-03-11,3,,,,,,,,,6,0,,,,
3,2020-03-12,0,,,,,,,,,6,0,,,,
4,2020-03-13,8,,,,,,,,,14,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,2022-01-02,3538,1.0,203.0,84.0,38.0,33.0,7912.0,83570.0,91482.0,172697,639,2642705.0,16889926.0,19532631.0,
665,2022-01-03,5024,2.0,201.0,75.0,36.0,32.0,13840.0,150518.0,164358.0,177721,641,2656545.0,17040444.0,19696989.0,
666,2022-01-04,5457,5.0,209.0,76.0,36.0,32.0,18576.0,104225.0,122801.0,183178,646,2675121.0,17144669.0,19819790.0,
667,2022-01-05,5202,4.0,205.0,70.0,31.0,28.0,14588.0,115488.0,130076.0,188380,650,2689709.0,17260157.0,19949866.0,


In [1835]:
# Load data from individual daily reports
df_hosp_unv = pd.read_csv(glob.glob("./data/hosp_per_vacc_until_*.csv")[0],  index_col=[0])

# fix a few things of df
df_hosp_unv["date"] = pd.to_datetime(df_hosp_unv["date"], format= "%Y-%m-%d")
last_date_daily =  df_hosp_unv["date"].iloc[-1].date()
print(last_date_daily)
df_hosp_unv

2022-01-09


Unnamed: 0,date,hospitalizations_dailyrep,perc_hosp_unvaccinated,daily new cases,daily deaths,perc_hosp_vaccinated
58,2021-07-16,190.0,89.50,953.0,1.0,10.50
160,2021-07-17,203.0,90.20,928.0,0.0,9.80
143,2021-07-18,218.0,91.30,858.0,1.0,8.70
85,2021-07-19,228.0,91.20,1056.0,5.0,8.80
115,2021-07-20,231.0,90.00,995.0,1.0,10.00
...,...,...,...,...,...,...
171,2022-01-05,205.0,80.00,5202.0,4.0,20.00
70,2022-01-06,235.0,77.45,3777.0,0.0,22.55
137,2022-01-07,230.0,77.83,5244.0,0.0,22.17
65,2022-01-08,244.0,77.05,3959.0,4.0,22.95


In [1836]:
# Add cases & deaths for dates missing from group_data
if last_date_daily > last_date_group:
    columns_keep = ["date", "daily new cases", "daily deaths", "hospitalizations_dailyrep"] # columns to add now
    df_extra = df_hosp_unv[columns_keep].query("date > @last_date_group") # get missing dates
    df_extra = df_extra.rename(columns={"hospitalizations_dailyrep" : "Hospitalised Cases"})
    columns_diff = [col for col in df_k1.columns.to_list() if col not in df_extra.columns.to_list()] # get difference of columns
    for col in columns_diff:
        df_extra[col] = np.nan # fill new columns with nan
    
    # Concatenate
    df_join_tmp = pd.concat([df_k1, df_extra], axis=0)
    
df_join_tmp

Unnamed: 0,date,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Incubated Cases,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,total cases,total deaths,total PCR tests,total RA tests,total tests,Notes
0,2020-03-09,2.0,,,,,,,,,2.0,0.0,,,,
1,2020-03-10,1.0,,,,,,,,,3.0,0.0,,,,
2,2020-03-11,3.0,,,,,,,,,6.0,0.0,,,,
3,2020-03-12,0.0,,,,,,,,,6.0,0.0,,,,
4,2020-03-13,8.0,,,,,,,,,14.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-01-05,5202.0,4.0,205.0,70.0,31.0,28.0,14588.0,115488.0,130076.0,188380.0,650.0,2689709.0,17260157.0,19949866.0,
668,2022-01-06,3777.0,0.0,235.0,73.0,31.0,29.0,12452.0,64803.0,77255.0,192157.0,650.0,2702161.0,17324960.0,20027121.0,
137,2022-01-07,5244.0,0.0,230.0,,,,,,,,,,,,
65,2022-01-08,3959.0,4.0,244.0,,,,,,,,,,,,


In [1837]:
# Add vaccination info for hospitalizations 

df_vacc_info = df_hosp_unv[["date", "perc_hosp_unvaccinated", "perc_hosp_vaccinated", "hospitalizations_dailyrep"]]
df_vacc_info = df_vacc_info.rename(columns={"hospitalizations_dailyrep": "Hospitalizations from reports"})
df_boost = df_join_tmp.merge(df_vacc_info, on="date", how="left")
last_date_boost = str(df_boost["date"].iloc[-1].date()).replace("-", "_")

# Add numer of vacc and uvnacc hospitalisations
df_boost["n_hospitalized_unvaccinated"] = df_boost["Hospitalised Cases"] * df_boost["perc_hosp_unvaccinated"] / 100
df_boost["n_hospitalized_vaccinated"] = df_boost["Hospitalised Cases"] * df_boost["perc_hosp_vaccinated"] / 100
df_boost

# Save extented dataset
df_boost.to_csv(f"./data/dataset_boosted_until_{last_date_boost}.csv")

# Plots

## 1. Hospitalizations per vaccination status

In [1838]:
# Load extented dataset 
df = pd.read_csv(glob.glob("./data/dataset_boosted_until_*.csv")[0],  index_col=[0])
df["date"] = pd.to_datetime(df["date"], format= "%Y-%m-%d") 

In [1839]:
# Turn df to long format 

df_vacc = df.drop(["perc_hosp_unvaccinated", "n_hospitalized_unvaccinated", ], axis=1)\
                    .rename(columns={"perc_hosp_vaccinated" : "perc_hospitalised",
                           "n_hospitalized_vaccinated" : "n_hospitalized"})\
                    .assign(Vaccination="Vaccinated")

df_unvacc = df.drop(["perc_hosp_vaccinated", "n_hospitalized_vaccinated",], axis=1)\
                    .rename(columns={"perc_hosp_unvaccinated" : "perc_hospitalised",
                           "n_hospitalized_unvaccinated" : "n_hospitalized"})\
                    .assign(Vaccination="Unvaccinated")

df_h1 = pd.concat([df_vacc, df_unvacc])
df_h1

Unnamed: 0,date,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Incubated Cases,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,total cases,total deaths,total PCR tests,total RA tests,total tests,Notes,perc_hospitalised,Hospitalizations from reports,n_hospitalized,Vaccination
0,2020-03-09,2.0,,,,,,,,,2.0,0.0,,,,,,,,Vaccinated
1,2020-03-10,1.0,,,,,,,,,3.0,0.0,,,,,,,,Vaccinated
2,2020-03-11,3.0,,,,,,,,,6.0,0.0,,,,,,,,Vaccinated
3,2020-03-12,0.0,,,,,,,,,6.0,0.0,,,,,,,,Vaccinated
4,2020-03-13,8.0,,,,,,,,,14.0,0.0,,,,,,,,Vaccinated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-01-05,5202.0,4.0,205.0,70.0,31.0,28.0,14588.0,115488.0,130076.0,188380.0,650.0,2689709.0,17260157.0,19949866.0,,80.00,205.0,164.0000,Unvaccinated
668,2022-01-06,3777.0,0.0,235.0,73.0,31.0,29.0,12452.0,64803.0,77255.0,192157.0,650.0,2702161.0,17324960.0,20027121.0,,77.45,235.0,182.0075,Unvaccinated
669,2022-01-07,5244.0,0.0,230.0,,,,,,,,,,,,,77.83,230.0,179.0090,Unvaccinated
670,2022-01-08,3959.0,4.0,244.0,,,,,,,,,,,,,77.05,244.0,188.0020,Unvaccinated


In [1840]:
# Make plot

fig = px.area(
    df_h1, 
    x="date", 
    y="n_hospitalized", 
    color="Vaccination",
    custom_data=["Vaccination", "perc_hospitalised"],
    width=800,
    height=500,
    color_discrete_map={
        "Unvaccinated" : "#D22727",
        "Vaccinated" : "#316F9A",
    },
    labels={"n_hospitalized": "Hospitalizations", "Vaccination": "Status"},
    title="Hospitalizations by vaccination status (since July 16, 2021)"
)

fig.update_layout(
    hoverlabel_font=dict(color="#2F2E31"), #=white
    font=dict(size=15, color="#2F2E31"),
    legend={'traceorder':'reversed'},
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    yaxis=dict(
        showgrid=True, 
        gridcolor="#FCD19C",
        gridwidth=0.2,  # tried different values, same issue
    ),
    xaxis=dict(
        showgrid=False,
        range=["2021-07-16", df_h1["date"].iloc[-1].date()],
        title=""
    ),
    hovermode='x unified',
)

fig.update_traces(
    hovertemplate="<b>%{customdata[0]}</b>: %{y:.0f} ( %{customdata[1]:.1f}% )<extra></extra>"\
    #+ " <br> %{x}",
)

fig.show()




# 2. Hospitalizations by severity

* Three categories: ICU intubated, ICU no intubation,  no ICU

In [None]:
# Load extented dataset 
df = pd.read_csv(glob.glob("./data/dataset_boosted_until_*.csv")[0],  index_col=[0])
df["date"] = pd.to_datetime(df["date"], format= "%Y-%m-%d") 

In [1842]:
# Add non-icu cases
df1 = df.copy()
df1["no_icu"] = df1["Hospitalised Cases"] - df1["Incubated Cases"]
df1["icu_nointub"] = df1["Cases In ICUs"] - df1["Incubated Cases"]
df1

# Create long-form df
df_icu_intub = df1.drop(["no_icu", "icu_nointub"], axis=1)\
                    .rename(columns={"Incubated Cases" : "Patients"})\
                    .assign(Category="ICU, intubated")

df_icu_no_intub = df1.drop(["no_icu", "Incubated Cases"], axis=1)\
                    .rename(columns={"icu_nointub" : "Patients"})\
                    .assign(Category="ICU, not intubated")

df_no_icu = df1.drop(["icu_nointub", "Incubated Cases"], axis=1)\
                    .rename(columns={"no_icu" : "Patients"})\
                    .assign(Category="not ICU")

df_icu_long = pd.concat([df_icu_intub, 
                         df_icu_no_intub,
                         df_no_icu])
df_icu_long

Unnamed: 0,date,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Patients,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,...,total PCR tests,total RA tests,total tests,Notes,perc_hosp_unvaccinated,perc_hosp_vaccinated,Hospitalizations from reports,n_hospitalized_unvaccinated,n_hospitalized_vaccinated,Category
0,2020-03-09,2.0,,,,,,,,,...,,,,,,,,,,"ICU, intubated"
1,2020-03-10,1.0,,,,,,,,,...,,,,,,,,,,"ICU, intubated"
2,2020-03-11,3.0,,,,,,,,,...,,,,,,,,,,"ICU, intubated"
3,2020-03-12,0.0,,,,,,,,,...,,,,,,,,,,"ICU, intubated"
4,2020-03-13,8.0,,,,,,,,,...,,,,,,,,,,"ICU, intubated"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-01-05,5202.0,4.0,205.0,70.0,31.0,177.0,14588.0,115488.0,130076.0,...,2689709.0,17260157.0,19949866.0,,80.00,20.00,205.0,164.0000,41.0000,not ICU
668,2022-01-06,3777.0,0.0,235.0,73.0,31.0,206.0,12452.0,64803.0,77255.0,...,2702161.0,17324960.0,20027121.0,,77.45,22.55,235.0,182.0075,52.9925,not ICU
669,2022-01-07,5244.0,0.0,230.0,,,,,,,...,,,,,77.83,22.17,230.0,179.0090,50.9910,not ICU
670,2022-01-08,3959.0,4.0,244.0,,,,,,,...,,,,,77.05,22.95,244.0,188.0020,55.9980,not ICU


In [1864]:
# Plot using graph objects (allows more flexibility)

import plotly.graph_objects as go

categories = ["ICU, intubated", "ICU, not intubated", "not ICU"]
color_map={
    "ICU, intubated" : "#8E007D",
    "ICU, not intubated" : "#C800AF",
    "not ICU" : "#008564",
}
fig = go.Figure()
for category in categories:
    df_tmp = df_icu_long.query("Category == @category")
    fig.add_trace(go.Scatter(x=df_tmp["date"], 
                             y=df_tmp["Patients"], 
                             fill='tonexty',
                             name=category,
                             stackgroup='one', # to stack them
                             line=dict(color=color_map[category])
                            )
                 )
    
fig.add_trace(go.Scatter(
                x=df_icu_long["date"], 
                y=df_icu_long["Severe Cases"],
                mode='lines',
                name='Severe',
                line=dict(color="#FFBD8A", width=3)) # , dash="dash", "dot", FFBD8A, "#FFA45F", #75E1FF"
             )


fig.update_layout(
    title=dict(text="Hospitalizations by severity"),
    font=dict(size=15, color="#2F2E31"),
    legend={'traceorder':'reversed'},
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    yaxis=dict(
        title=dict(text="Hospitalizations"),
        showgrid=True, 
        gridcolor="#FCD19C",
        gridwidth=0.2,  # tried different values, same issue
    ),
    xaxis=dict(
        showgrid=False,
        #range=["2021-07-16","2022-01-06"],
        title=""
    ),
    hovermode='x unified',
    hoverlabel_font=dict(color="#2F2E31")

)

fig.show()

In [1865]:
# Plot using plotly express
fig = px.area(
    df_icu_long, 
    x="date", 
    y="Patients", 
    color="Category",
    custom_data = ["Category"],
    width=950,
    height=500,
    color_discrete_map={
        "ICU, intubated" : "#8E007D",
        "ICU, not intubated" : "#C800AF",
        "not ICU" : "#008564",
    },
    labels={"Patients": "Hospitalizations"},
    
    title="Hospitalizations by severity"
)

fig.update_layout(
    font=dict(size=15, color="#2F2E31"),
    legend={'traceorder':'reversed'},
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    yaxis=dict(
        showgrid=True, 
        gridcolor="#FCD19C",
        gridwidth=0.2,  # tried different values, same issue
    ),
    xaxis=dict(
        showgrid=False,
        #range=["2021-07-16","2022-01-06"],
        title=""
    ),
    hovermode='x unified',
    hoverlabel_font=dict(color="#2F2E31")

)

fig.update_traces(
    hovertemplate="%{customdata[0]}: %{y:.0f} <extra></extra>"#+
    #"<br> %{x}",
    #width=[0.6]*len(target_groups)
)



fig.show()




# 3. Cases +  Hospitalizations + Deaths

In [1883]:
# Load extented dataset 
df = pd.read_csv(glob.glob("./data/dataset_boosted_until_*.csv")[0],  index_col=[0])
df["date"] = pd.to_datetime(df["date"], format= "%Y-%m-%d") 

In [1885]:
# Get df in long-format
df2 = df.copy()
df2["cases_movavg"] = df2["daily new cases"].rolling(window=7, center=False).mean() 
df2 = df2[["date", "daily new cases", "daily deaths", "Hospitalised Cases", "cases_movavg"]]
df2 = df2.fillna(0)
df2 = df2.rename(columns={
    "daily new cases":"cases",
    "daily deaths" : "deaths",
    "Hospitalised Cases": "hospitalizations"
})

df2

Unnamed: 0,date,cases,deaths,hospitalizations,cases_movavg
0,2020-03-09,2.0,0.0,0.0,0.000000
1,2020-03-10,1.0,0.0,0.0,0.000000
2,2020-03-11,3.0,0.0,0.0,0.000000
3,2020-03-12,0.0,0.0,0.0,0.000000
4,2020-03-13,8.0,0.0,0.0,0.000000
...,...,...,...,...,...
667,2022-01-05,5202.0,4.0,205.0,4350.285714
668,2022-01-06,3777.0,0.0,235.0,4339.714286
669,2022-01-07,5244.0,0.0,230.0,4367.714286
670,2022-01-08,3959.0,4.0,244.0,4600.142857


In [1886]:
# Create long-form df
df_cases = df2.drop(["deaths", "hospitalizations", "cases_movavg"], axis=1)\
                    .rename(columns={"cases" : "number"})\
                    .assign(Category="Cases")

df_cases_movavg = df2.drop(["deaths", "hospitalizations", "cases"], axis=1)\
                    .rename(columns={"cases_movavg" : "number"})\
                    .assign(Category="Cases, 7-day average")

df_deaths = df2.drop(["cases", "hospitalizations", "cases_movavg"], axis=1)\
                    .rename(columns={"deaths" : "number"})\
                    .assign(Category="Deaths")

df_hospitalisations = df2.drop(["cases", "deaths", "cases_movavg"], axis=1)\
                    .rename(columns={"hospitalizations" : "number"})\
                    .assign(Category="Hospitalisations")

df2_long = pd.concat([df_cases,
                      df_cases_movavg,
                      df_deaths,
                      df_hospitalisations])
df2_long

Unnamed: 0,date,number,Category
0,2020-03-09,2.0,Cases
1,2020-03-10,1.0,Cases
2,2020-03-11,3.0,Cases
3,2020-03-12,0.0,Cases
4,2020-03-13,8.0,Cases
...,...,...,...
667,2022-01-05,205.0,Hospitalisations
668,2022-01-06,235.0,Hospitalisations
669,2022-01-07,230.0,Hospitalisations
670,2022-01-08,244.0,Hospitalisations


In [1967]:
fig = px.line(
    df2_long, 
    x='date', 
    y='number', 
    color='Category',
    custom_data = ["Category"],
    width=900,
    height=600,
    color_discrete_map={
        "Cases": "#C5C5C5",
        "Cases, 7-day average": "#1379FD",
        "Hospitalisations" : "#FB202B",
        "Deaths" : "#FFAE00"# "#48D7AC",#"#FFB220",
    },
    labels={"number": "Number", "date" : "", "Category" : ""},
    title="Cases, Hospitalisations, Deaths"
)

fig.update_layout(
    font=dict(size=15, color="#2F2E31"),
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    yaxis=dict(
        showgrid=True, 
        gridcolor="#FCD19C",
        gridwidth=0.2,  # tried different values, same issue
    ),
    xaxis=dict(
        showgrid=False,
        #range=["2021-07-16","2022-01-06"],
        title=""
    ),
    hovermode='x unified',
)

fig.update_traces(
    hovertemplate="%{customdata[0]}: %{y:.0f} <extra></extra>",
    line=dict(width=4)
)

fig.show()

# Plot also deaths alone 

In [1970]:
# Create death + death_moving_average

df2_death = df2_long.query("Category=='Deaths'")
df2_death_avg = df2_death.copy()
df2_death_avg["number"] = df2_death_avg["number"].rolling(window=14, center=False).mean() 
df2_death_avg["Category"] = "Deaths, 14-day average"

df_death_long = pd.concat([df2_death,df2_death_avg])
df_death_long

Unnamed: 0,date,number,Category
0,2020-03-09,0.000000,Deaths
1,2020-03-10,0.000000,Deaths
2,2020-03-11,0.000000,Deaths
3,2020-03-12,0.000000,Deaths
4,2020-03-13,0.000000,Deaths
...,...,...,...
667,2022-01-05,1.857143,"Deaths, 14-day average"
668,2022-01-06,1.785714,"Deaths, 14-day average"
669,2022-01-07,1.642857,"Deaths, 14-day average"
670,2022-01-08,1.857143,"Deaths, 14-day average"


In [2334]:
fig = px.line(
    df_death_long, 
    x='date', 
    y='number', 
    color='Category',
    custom_data = ["Category"],
    width=900,
    height=600,
    color_discrete_map={
        "Deaths" : "#B9C4C5",
        "Deaths, 14-day average":"#FFAE00" # "#FFAE00"# "#FFB220"
    },
    labels={"number": "Number", "date" : "", "Category" : ""},
    title="Deaths"
)

fig.update_layout(
    font=dict(size=15, color="#2F2E31"),
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    yaxis=dict(
        showgrid=True, 
        gridcolor="#FCD19C",
        gridwidth=0.2,  # tried different values, same issue
    ),
    xaxis=dict(
        showgrid=False,
        #range=["2021-07-16","2022-01-06"],
        title=""
    ),
    hovermode='x unified',
)


fig.update_traces( hovertemplate="%{customdata[0]}: %{y:.0f}<extra></extra>")

fig.update_traces(
    line=dict(width=7),
    selector=dict(name="Deaths, 14-day average")
)

fig.show()

# Hospitalizations per vaccination status *after using correct denominator*

* Find % of vaccinated in group of 18+ for each week

* From other df get % of vaccinated and not for each day (since 16 July) -> transform this to weekly cases


In [2013]:
# Load extented dataset 
df = pd.read_csv(glob.glob("./data/dataset_boosted_until_*.csv")[0],  index_col=[0])
df["date"] = pd.to_datetime(df["date"], format= "%Y-%m-%d") 

In [2033]:
df.sort_values("date").tail(10)

Unnamed: 0,date,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Incubated Cases,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,...,total deaths,total PCR tests,total RA tests,total tests,Notes,perc_hosp_unvaccinated,perc_hosp_vaccinated,Hospitalizations from reports,n_hospitalized_unvaccinated,n_hospitalized_vaccinated
662,2021-12-31,5048.0,2.0,180.0,79.0,34.0,28.0,14971.0,154505.0,169476.0,...,638.0,2626268.0,16775904.0,19402172.0,,85.48,14.52,180.0,153.864,26.136
663,2022-01-01,2332.0,0.0,187.0,82.0,36.0,32.0,8525.0,30452.0,38977.0,...,638.0,2634793.0,16806356.0,19441149.0,,85.03,14.97,187.0,159.0061,27.9939
664,2022-01-02,3538.0,1.0,203.0,84.0,38.0,33.0,7912.0,83570.0,91482.0,...,639.0,2642705.0,16889926.0,19532631.0,,83.26,16.74,203.0,169.0178,33.9822
665,2022-01-03,5024.0,2.0,201.0,75.0,36.0,32.0,13840.0,150518.0,164358.0,...,641.0,2656545.0,17040444.0,19696989.0,,81.1,18.9,201.0,163.011,37.989
666,2022-01-04,5457.0,5.0,209.0,76.0,36.0,32.0,18576.0,104225.0,122801.0,...,646.0,2675121.0,17144669.0,19819790.0,,78.0,22.0,209.0,163.02,45.98
667,2022-01-05,5202.0,4.0,205.0,70.0,31.0,28.0,14588.0,115488.0,130076.0,...,650.0,2689709.0,17260157.0,19949866.0,,80.0,20.0,205.0,164.0,41.0
668,2022-01-06,3777.0,0.0,235.0,73.0,31.0,29.0,12452.0,64803.0,77255.0,...,650.0,2702161.0,17324960.0,20027121.0,,77.45,22.55,235.0,182.0075,52.9925
669,2022-01-07,5244.0,0.0,230.0,,,,,,,...,,,,,,77.83,22.17,230.0,179.009,50.991
670,2022-01-08,3959.0,4.0,244.0,,,,,,,...,,,,,,77.05,22.95,244.0,188.002,55.998
671,2022-01-09,3012.0,4.0,251.0,,,,,,,...,,,,,,75.3,24.7,251.0,189.003,61.997


In [2014]:
# Inspect numbers
df5 = df.copy()
df5[["Hospitalised Cases", 
    "n_hospitalized_unvaccinated", 
    "perc_hosp_unvaccinated", 
    "n_hospitalized_vaccinated", 
    "perc_hosp_vaccinated"]]\
    .iloc[-5:]\
    .sum()\
    .divide(5)

Hospitalised Cases             233.0000
n_hospitalized_unvaccinated    180.4043
perc_hosp_unvaccinated          77.5260
n_hospitalized_vaccinated       52.5957
perc_hosp_vaccinated            22.4740
dtype: float64

In [2142]:
# Add weeks to df in ascending

def ascending_weeks_from_one(week_list):
    "return weeks number as increasing and not resetted by year changes"
    
    ascending_weeks = []
    new_week = week_list[0]
    
    for i, w in enumerate(week_list):  
        if (w != new_week) & (w != week_list[i-1]):
            new_week +=1
        ascending_weeks.append(new_week)
    ascending_weeks = [wk - ascending_weeks[0] + 1 for wk in ascending_weeks]
    
    return ascending_weeks

# Add weeks ("%V" ISO week number, with Monday as first day of week)
df5['week'] = df5['date'].dt.strftime('%V') 

# Get weeks in ascending format
week_tmp = df5['week'].astype(int).values
df5['week'] = ascending_weeks_from_one(week_tmp) 

# Add week day name
df5["weekday"] = df5["date"].dt.day_name()

# Get first Thursday in list (day 0 is Monday)
first_mid_week = df5.query("weekday == 'Thursday'")\
                    .loc[:, "date"]\
                    .iloc[0]

# For each week add middle-of-week's date
df5["midweek"] = df5["week"].apply(lambda x: first_mid_week + timedelta(7 * (x - 1) ))


In [2143]:
df5.dtypes

date                             datetime64[ns]
daily new cases                         float64
daily deaths                            float64
Hospitalised Cases                      float64
Severe Cases                            float64
Cases In ICUs                           float64
Incubated Cases                         float64
PCR_daily tests performed               float64
RA_daily tests performed                float64
total_daily tests performed             float64
total cases                             float64
total deaths                            float64
total PCR tests                         float64
total RA tests                          float64
total tests                             float64
Notes                                    object
perc_hosp_unvaccinated                  float64
perc_hosp_vaccinated                    float64
Hospitalizations from reports           float64
n_hospitalized_unvaccinated             float64
n_hospitalized_vaccinated               

In [2216]:
# Group_by - midweek
dfhosp = df5.groupby("midweek")\
            .mean()

dfhosp = dfhosp.reset_index()\
        .query("midweek >= '2021-07-16'")
dfhosp["midweek"].iloc[0]

Timestamp('2021-07-22 00:00:00')

### Now get vaccinations per week

In [2268]:
# Load local data
df_v = pd.read_csv(glob.glob("./data/vaccination_dataset_until_*.csv")[0], index_col=[0])

In [2269]:
# Add date corresponding to middle (Thursday) of each week

def date_in_mid_of_week(year_week):
    """returns date of middle of this week"""
    
    weekdates = []
    for day in range(7):
        week_date = datetime.datetime.strptime(year_week + '-{}'.format(day), "%Y-W%W-%w")
        weekdates.append(week_date)
    date_mid_week = [dt for dt in weekdates if dt.weekday()==3] # Thursday
    
    return date_mid_week[0]

dfv1 = df_v.copy()
dfv1["midweek"] = dfv1["YearWeekISO"].apply(lambda x: date_in_mid_of_week(x))

In [2271]:
# Get vacc for all data for target group ALL
dfv2 = dfv1.copy()

age_groups_18plus = ['Age18_24', 'Age25_49', 'Age50_59', 'Age60_69', 'Age70_79', 'Age80+']
age_groups = ['ALL', 'Age10_14', 'Age15_17', 'Age18_24', 'Age25_49', 
              'Age50_59', 'Age60_69', 'Age70_79', 'Age80+', "Age18+"]

df18plus = dfv2.query("TargetGroup in @age_groups_18plus")\
                .groupby(["midweek", "Vaccine"])\
                .sum()\
                .assign(TargetGroup="Age18+")\
                .reset_index()
df18plus["Population"] = dfv2["Population"].iloc[0]

dfv3 = pd.concat([dfv2, df18plus])
dfv3 = dfv3.query("TargetGroup in @age_groups")
dfv3["CompletingVaccinationScheme"] = dfv3.apply(lambda x: x["FirstDose"] if x["Vaccine"]=="JANSS"\
                                                     else x["SecondDose"]
                                                 , axis = 1)

dfv3["AtLeastOneDose"] = dfv3["FirstDose"]

# OPEN QUESTION: if "DoseAdditional1" in Janssen refers to 3rd dose or if it's an another term for second 
# for now I am working assuming the second case.
dfv3["Boosted"] = dfv3.apply(lambda x: x["SecondDose"] + x["DoseAdditional1"] if x["Vaccine"]=="JANSS"\
                                                     else x["DoseAdditional1"]
                                                 , axis=1)
dfv3

Unnamed: 0,YearWeekISO,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,SecondDose,DoseAdditional1,District,TargetGroup,Vaccine,Population,midweek,CompletingVaccinationScheme,AtLeastOneDose,Boosted
0,2020-W52,888005.0,9750,,369,0,0,ALL,ALL,COM,888005,2020-12-31,0,369,0
1,2020-W52,84850.0,0,,5,0,0,ALL,Age18_24,COM,888005,2020-12-31,0,5,0
2,2020-W52,330295.0,0,,66,0,0,ALL,Age25_49,COM,888005,2020-12-31,0,66,0
3,2020-W52,107634.0,0,,17,0,0,ALL,Age50_59,COM,888005,2020-12-31,0,17,0
4,2020-W52,94714.0,0,,35,0,0,ALL,Age60_69,COM,888005,2020-12-31,0,35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,,717452.0,0,0.0,390,373,4978,,Age18+,MOD,888005,2021-12-16,373,390,4978
172,,717452.0,0,0.0,2,4,0,,Age18+,AZ,888005,2021-12-23,4,2,0
173,,717452.0,0,0.0,1075,2886,18296,,Age18+,COM,888005,2021-12-23,2886,1075,18296
174,,717452.0,0,0.0,52,545,0,,Age18+,JANSS,888005,2021-12-23,52,52,545


Unnamed: 0,midweek,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,SecondDose,DoseAdditional1,Population,CompletingVaccinationScheme,AtLeastOneDose,Boosted
0,2020-12-31,717452.0,0,0.0,369,0,0,888005,0,369,0
1,2021-01-07,717452.0,0,0.0,5716,1,0,888005,1,5716,0
2,2021-01-14,717452.0,0,0.0,4423,324,0,888005,324,4423,0
3,2021-01-21,717452.0,0,0.0,4887,2371,0,888005,2371,4887,0
4,2021-01-28,717452.0,0,0.0,4433,2848,0,888005,2848,4433,0
5,2021-02-04,717452.0,0,0.0,3438,4330,0,888005,4330,3438,0
6,2021-02-11,717452.0,0,0.0,6644,4605,0,888005,4605,6644,0
7,2021-02-18,717452.0,0,0.0,9731,4399,0,888005,4399,9731,0
8,2021-02-25,717452.0,0,0.0,16285,3663,0,888005,3663,16285,0
9,2021-03-04,717452.0,0,0.0,14821,5990,0,888005,5990,14821,0


In [2319]:
# Select a target group & Groupby Date and sum 
df_ta = dfv3.copy()
df_ta = dfv3.query("TargetGroup == 'Age18+'")\
            .groupby("midweek")\
            .sum()\
            .reset_index()

df_ta["Population"] = dfv3["Population"].iloc[0]
df_ta["Denominator"] = dfv3.query("TargetGroup == 'Age18+'")["Denominator"].iloc[0]

# Add cumulative sum and percentage of target group population
df_ta["ComplVacc_cumsum"] = df_ta["CompletingVaccinationScheme"].cumsum()
df_ta["ComplVacc_percent"] = df_ta.apply(lambda x: x["ComplVacc_cumsum"]/x["Denominator"]*100, axis=1)

df_ta["AtLeastOne_cumsum"] = df_ta["AtLeastOneDose"].cumsum()
df_ta["AtLeastOne_percent"] = df_ta.apply(lambda x: x["AtLeastOne_cumsum"]/x["Denominator"]*100, axis=1)

df_ta["Boosted_cumsum"] = df_ta["Boosted"].cumsum()
df_ta["Boosted_percent"] = df_ta.apply(lambda x: x["Boosted_cumsum"]/x["Denominator"]*100, axis=1)

# Merge with hospitalisations df

In [2320]:
dfhosp

Unnamed: 0,midweek,daily new cases,daily deaths,Hospitalised Cases,Severe Cases,Cases In ICUs,Incubated Cases,PCR_daily tests performed,RA_daily tests performed,total_daily tests performed,...,total deaths,total PCR tests,total RA tests,total tests,perc_hosp_unvaccinated,perc_hosp_vaccinated,Hospitalizations from reports,n_hospitalized_unvaccinated,n_hospitalized_vaccinated,week
71,2021-07-22,931.714286,2.571429,250.714286,64.142857,27.714286,26.857143,7549.428571,67108.428571,74657.857143,...,402.428571,1592596.0,7942430.0,9535027.0,89.8,10.2,250.714286,225.006571,25.707714,72
72,2021-07-29,643.571429,3.0,285.571429,82.142857,39.714286,37.285714,7150.571429,59647.285714,66797.857143,...,421.857143,1643900.0,8398280.0,10042180.0,88.797143,11.202857,285.571429,253.667129,31.9043,73
73,2021-08-05,537.0,3.0,286.428571,94.142857,50.142857,44.857143,7730.428571,42145.571429,49876.0,...,442.428571,1696487.0,8733093.0,10429580.0,89.594286,10.405714,286.428571,256.636057,29.792514,74
74,2021-08-12,412.857143,2.714286,260.571429,90.714286,52.285714,48.571429,6603.142857,41310.0,47913.142857,...,463.857143,1745546.0,9040271.0,10785820.0,84.481429,15.518571,260.571429,220.365786,40.205643,75
75,2021-08-19,351.571429,4.142857,212.285714,82.142857,48.142857,44.0,6538.285714,40620.142857,47158.428571,...,486.0,1790636.0,9320639.0,11111270.0,82.035714,17.964286,212.285714,174.029914,38.2558,76
76,2021-08-26,277.714286,2.571429,172.428571,68.571429,35.714286,30.571429,5996.0,37096.714286,43092.714286,...,508.285714,1835726.0,9593313.0,11429040.0,84.308571,15.691429,172.428571,145.436014,26.992557,77
77,2021-09-02,252.714286,2.285714,137.285714,48.428571,22.857143,18.285714,5825.571429,43730.142857,49555.714286,...,525.428571,1876842.0,9874756.0,11751600.0,81.582857,18.417143,137.285714,112.007343,25.278371,78
78,2021-09-09,166.142857,1.857143,123.571429,40.0,19.571429,17.142857,5490.714286,51098.142857,56588.857143,...,540.714286,1916121.0,10208790.0,12124910.0,81.941429,18.058571,123.571429,101.149757,22.421671,79
79,2021-09-16,146.571429,1.428571,99.714286,40.857143,18.714286,17.571429,5154.714286,50659.142857,55813.857143,...,552.571429,1952012.0,10571140.0,12523150.0,77.237143,22.762857,99.714286,77.112129,22.602157,80
80,2021-09-23,115.714286,0.714286,84.285714,35.428571,16.428571,14.142857,5493.857143,48718.0,54211.857143,...,557.571429,1990920.0,10915790.0,12906710.0,75.627143,24.372857,84.285714,63.679429,20.606286,81


In [2322]:
# Merge hospitalization  & vaccination dataframes

colums_to_merge = ['midweek', "Denominator",
       'CompletingVaccinationScheme', 'AtLeastOneDose', 'Boosted',
       'ComplVacc_cumsum', 'ComplVacc_percent', 'AtLeastOne_cumsum',
       'AtLeastOne_percent', 'Boosted_cumsum', 'Boosted_percent']

dfvh = dfhosp.merge(df_ta[colums_to_merge], on="midweek", how="left")


In [2328]:
# Add hospitalizations per 100_000 vaccinated and 100_000 unvaccinated

dfvh["Hosp_in_100_000_vac"] = dfvh["n_hospitalized_vaccinated"] * 100_000 / dfvh["ComplVacc_cumsum"]
dfvh["Hosp_in_100_000_unvac"] = dfvh["n_hospitalized_unvaccinated"] \
                                    * 100_000\
                                    / (dfvh["Denominator"] - dfvh["ComplVacc_cumsum"])

dfvh["ratio_hosp_unvacc_vacc"] = dfvh["Hosp_in_100_000_unvac"] / dfvh["Hosp_in_100_000_vac"]

In [2329]:
# Turn df to long format 

df_vacc = dfvh.drop(["Hosp_in_100_000_unvac"], axis=1)\
                    .rename(columns={"Hosp_in_100_000_vac" : "n_hospitalized"})\
                    .assign(Vaccination="Vaccinated")

df_unvacc = dfvh.drop(["Hosp_in_100_000_vac"], axis=1)\
                    .rename(columns={"Hosp_in_100_000_unvac" : "n_hospitalized"})\
                    .assign(Vaccination="Unvaccinated")

dfvhl = pd.concat([df_vacc, df_unvacc])


In [2378]:
# Make plot

fig = px.area(
    dfvhl.query("n_hospitalized == n_hospitalized"), #get rid of nans
    x="midweek", 
    y="n_hospitalized", 
    color="Vaccination",
    custom_data=["Vaccination", "ratio_hosp_unvacc_vacc"],
    width=800,
    height=500,
    color_discrete_map={
        "Unvaccinated" : "#D22727",
        "Vaccinated" : "#316F9A",
    },
    labels={"n_hospitalized": "Hospitalizations per 100 000 ", "Vaccination": ""},
    title="Hospitalizations by vaccination status per 100 000 people (since July 16, 2021)"
)

fig.update_layout(
    hoverlabel_font=dict(color="#2F2E31"), #=white
    font=dict(size=15, color="#2F2E31"),
    legend={'traceorder':'reversed'},
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    yaxis=dict(
        showgrid=True, 
        gridcolor="#FCD19C",
        gridwidth=0.2,  # tried different values, same issue
    ),
    xaxis=dict(
        showgrid=False,
        range=["2021-07-16", df_h1["date"].iloc[-1].date()],
        title=""
    ),
    hovermode='x unified',
)

fig.update_traces(
    hovertemplate="<b>%{customdata[1]:.1f} times</b> more likely to <br>be hospitalized if unvacc." + \
    "<br><b>%{customdata[0]}</b>: %{y:.0f} <extra></extra>",
    selector=dict(name="Unvaccinated")
)

fig.update_traces(
    hovertemplate="<b>%{customdata[0]}</b>: %{y:.0f} <extra></extra>",
    selector=dict(name="Vaccinated")
)


fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=0.98,
    xanchor="right",
    x=0.9
))



fig.show()




In [2355]:
## Make plot

# get rid of nans & sort
df_tmp = dfvhl.query("n_hospitalized == n_hospitalized")\
            .sort_values(by='midweek')

fig = px.line(
    df_tmp, 
    x="midweek", 
    y="ratio_hosp_unvacc_vacc", 
    width=800,
    height=500,
    labels={"ratio_hosp_unvacc_vacc" : "Ratio unvaccinated : vaccinated"},
    title="How many times more likely is to be hospitalized if unvaccinated"
)

fig.update_layout(
    font=dict(size=15, color="#2F2E31"),
    plot_bgcolor="#FFEBD9",
    paper_bgcolor="#FFEBD9",
    yaxis=dict(
        showgrid=True, 
        gridcolor="#FCD19C",
        gridwidth=0.2,  # tried different values, same issue
    ),
    xaxis=dict(
        showgrid=False,
        title=""
    ),
)


fig.update_traces(
    hovertemplate="%{y:.0f} times more likely<extra></extra>",
    line=dict(width=4, color="#F53E94"),
    #selector=dict(name="Deaths, 14-day average")
)

fig.show()