# Switching to the API

In [615]:
import requests as req
import pandas as pd
import plotly.express as px
import datetime as dt
import datetime
import plotly
import requests
import json

In [616]:
# Let pandas print up to 500 rows before truncating displayed frames.
pd.set_option('display.max_rows', 500)

In [617]:
# Reload the dataset saved by an earlier run and discard the unnamed first
# column (the index that was written along with the data).
df_historic = pd.read_csv('data_incl_historical_values.csv')
df_historic = df_historic.drop(columns=['Unnamed: 0'])

# Both date columns are parsed day-first.
df_historic['Added_date_dtm'] = pd.to_datetime(df_historic['Added_date_dtm'], dayfirst=True)
df_historic['Exposure_date_dtm'] = pd.to_datetime(df_historic['Exposure_date_dtm'], dayfirst=True)

# Tier is kept as text, the reporting delay as a float number of days.
df_historic['tier'] = df_historic['tier'].astype('str')
df_historic['date_delay'] = df_historic['date_delay'].astype('float')

In [618]:
# Download every record of the exposure-site resource from the
# `datastore_search` endpoint, following the `_links.next` URL of each page
# until `total` records have been collected.
base = 'https://discover.data.vic.gov.au'
resp = req.get(
    base + '/api/3/action/datastore_search?resource_id=afb52611-6061-4a2b-9110-74c920bede77',
    timeout=30,
)
resp.raise_for_status()  # surface HTTP errors instead of a later KeyError
result = resp.json()['result']  # parse the body once per response
next_link = result['_links']['next']
records = result['records']
total = result['total']
while len(records) < total:
    resp = req.get(base + next_link, timeout=30)
    resp.raise_for_status()
    result = resp.json()['result']
    page = result['records']
    if not page:
        # An empty page means nothing more can be fetched (e.g. `total`
        # changed while paging); stop instead of looping forever.
        break
    records.extend(page)
    next_link = result['_links']['next']

In [619]:
# Sanity check: number of records downloaded from the API.
len(records)

278

In [620]:
# Common figure width, passed as `width=` to the plots further down.
width = 580

In [621]:
# Turn the downloaded records into a frame and derive the `tier` and
# `date_delay` helper columns.
df_api = pd.DataFrame(records)

# The tier is the second space-separated token of the advice title.
df_api['tier'] = df_api['Advice_title'].map(lambda title: title.split(' ')[1])

# Both date columns are parsed day-first.
df_api['Added_date_dtm'] = pd.to_datetime(df_api['Added_date_dtm'], dayfirst=True)
df_api['Exposure_date_dtm'] = pd.to_datetime(df_api['Exposure_date_dtm'], dayfirst=True)

# Reporting delay: whole days between exposure and listing, stored as float.
df_api['date_delay'] = (
    (df_api['Added_date_dtm'] - df_api['Exposure_date_dtm']).dt.days.astype('float')
)

In [622]:
# Remove carriage-return characters from the text columns.
text_columns = [
    'Suburb',
    'Site_title',
    'Site_streetaddress',
    'Site_state',
    'Site_postcode',
    'Notes',
    'Advice_title',
    'Advice_instruction',
]
for c in text_columns:
    df_api[c] = df_api[c].str.replace('\r', '')

In [623]:
# Stack the API rows on top of the historic rows, then keep one row per
# distinct combination of the key columns below. The API rows come first, so
# they are the ones retained when a site appears in both sources.
dedup_keys = [
    'Site_title',
    'Exposure_date_dtm',
    'Added_date_dtm',
    'Exposure_time_start_24',
    'Exposure_time_end_24',
    'tier',
    'date_delay',
]
df_official = pd.concat([df_api, df_historic]).drop_duplicates(subset=dedup_keys)
df_official.shape

(860, 19)

### Get latitude and longitude

In [637]:
# Replace missing postcodes with 0 and cast the column to int.
df_official.Site_postcode = df_official.Site_postcode.fillna(0).astype('int')

In [639]:
# Query the postcode API once per distinct site postcode (most frequent first)
# and collect the non-empty JSON responses in `postcode_list`.
postcode_df = pd.DataFrame(columns=['postcode', 'lat', 'lon'])
postcode_list = []

postcodes_by_count = (
    df_official.groupby('Site_postcode').count().sort_values('_id', ascending=False).index
)
for postcode in postcodes_by_count:
    if postcode == 0:
        # 0 stands in for a missing postcode; there is nothing to look up.
        continue
    res = requests.get(f'http://v0.postcodeapi.com.au/suburbs/{postcode}.json', timeout=30)
    if res.status_code != 200:
        # Report the failing postcode and status; no JSON body is available
        # to print in this branch.
        print(postcode, res.status_code)
        continue
    res_ = res.json()
    if res_:
        postcode_list.append(res_)
    else:
        # The API answered but knows no suburb for this postcode.
        print(postcode, res_)

3086 []


In [640]:
# Flatten the per-postcode API responses into one row per suburb.
# Rows are gathered in a plain list and the frame is built once:
# `DataFrame.append` copied the whole frame on every call and no longer
# exists in pandas 2.x.
postcode_rows = []
for all_codes in postcode_list:
    for el in all_codes:
        postcode_rows.append({
            'postcode': el['postcode'],
            'suburb': el['name'],
            'lat': el['latitude'],
            'lon': el['longitude'],
        })
postcode_df = pd.DataFrame(postcode_rows, columns=['postcode', 'suburb', 'lat', 'lon'])

In [641]:
# Cast both postcode columns to float so the merge keys share one dtype.
postcode_df.postcode = postcode_df.postcode.astype('float')
df_official.Site_postcode = df_official.Site_postcode.astype('float')

In [642]:
# First pass: match sites to coordinates on postcode AND suburb name.
# `indicator=True` adds a `_merge` column recording which side each row came
# from; rows found on both sides are the successful matches.
map_merge = df_official.merge(
    postcode_df,
    how='outer',
    left_on=['Site_postcode', 'Suburb'],
    right_on=['postcode', 'suburb'],
    indicator=True,
)
map_both = map_merge.loc[map_merge['_merge'] == 'both']

In [643]:
# Second pass: sites that found no (postcode, suburb) match fall back to a
# postcode-only match.
map_left = map_merge[map_merge._merge == 'left_only'].drop(columns=['lat', 'lon','suburb', 'postcode'])
# A postcode can cover several suburbs. Merging against all of them would
# repeat every unmatched site once per suburb and inflate the counts, so only
# the first suburb listed for each postcode is used as its location.
postcode_first = postcode_df.drop_duplicates(subset=['postcode'])
map_merge_left = pd.merge(map_left, postcode_first, left_on=['Site_postcode'], right_on = ['postcode'])

In [644]:
# Combine the sites matched on postcode and suburb with those matched on
# postcode only.
map_df = pd.concat([map_both, map_merge_left])

In [645]:
# Number of rows per (lat, lon, suburb), largest first; the `_id` column
# carries the count.
geo_counts = (
    map_df
    .groupby(['lat', 'lon', 'suburb'])
    .count()
    .sort_values('_id', ascending=False)
    .reset_index()
    .loc[:, ['lat', 'lon', '_id', 'suburb']]
)

In [646]:
# Attach the site postcode to every counted location. The merge yields one row
# per matching row of map_df, so drop_duplicates collapses the repeats.
geo_counts_info = pd.merge(geo_counts, map_df[['lat','lon','suburb','Site_postcode']], on=['lat','lon','suburb']).drop_duplicates()
# Disabled below: exposure day rescaled to the 0-1 range. It would need
# Exposure_date_dtm, which is not among the columns merged in above.
# geo_counts_info.Exposure_date_dtm = pd.to_datetime(geo_counts_info.Exposure_date_dtm)

# minday = min(geo_counts_info.Exposure_date_dtm.dt.dayofyear)
# maxday = max(geo_counts_info.Exposure_date_dtm.dt.dayofyear)
# geo_counts_info['Exposure_day'] = (geo_counts_info.Exposure_date_dtm.dt.dayofyear-minday)/(maxday-minday)

In [647]:
# Bubble map: one marker per location, with both size and colour driven by the
# count held in `_id`. Rows are passed in descending count order.
locations_by_count = geo_counts_info.sort_values('_id', ascending=False)
fig_map = px.scatter_mapbox(
    locations_by_count,
    lat='lat',
    lon='lon',
    size='_id',
    size_max=25,
    color='_id',
    color_continuous_scale=px.colors.sequential.Plasma[1:-1],
    opacity=0.5,
    hover_data=['suburb', 'Site_postcode'],
    mapbox_style='carto-positron',
    center={'lat': -37.45, 'lon': 144.995},
    height=800,
    width=800,
)
# Blank out the colour-bar title.
fig_map.update_layout(coloraxis_colorbar_title='')
fig_map.show()

In [648]:
# Save the interactive map to an HTML file.
fig_map.write_html('exposuremap.html')

### Graphs

In [657]:
# Histogram of exposure dates, coloured by tier.
# `nbins` is the time span from the earliest exposure to the latest listing;
# its day count (+1) is the bin count requested from plotly.
nbins = df_official.Added_date_dtm.max() - df_official.Exposure_date_dtm.min()
fig_date = px.histogram(
    df_official.sort_values(by='tier'),
    x='Exposure_date_dtm',
    color='tier',
    nbins=nbins.days+1,
    color_discrete_sequence=px.colors.qualitative.Set1[0:3],
    template='plotly_dark',
    title='Exposure date categorised by tier',
)

# Reference lines drawn at noon of each marked date, each with a text label:
# (line position, label x, label y, label text, label anchor).
for line_x, label_x, label_y, label_text, anchor in [
    (dt.datetime(2021, 5, 27, 12, 0), '2021-05-27', 60, '  5km', 'left'),
    (dt.datetime(2021, 6, 3, 12, 0), '2021-06-03', 50, '  10km', 'left'),
    (dt.datetime(2021, 6, 10, 12, 0), '2021-06-10', 40, '  25km', 'left'),
    (dt.datetime(2021, 7, 15, 12, 0), '2021-07-15', 70, '  5km', 'right'),
]:
    fig_date.add_vline(x=line_x)
    fig_date.add_annotation(x=label_x, y=label_y, text=label_text, xanchor=anchor,
                            ax=10, ay=20, showarrow=False)

fig_date.update_layout(
    width=width,
    bargap=0.2,  # gap between neighbouring bars
    xaxis_title="Exposure date",
    yaxis_title="Number of exposure sites",
)

# The x axis spans first exposure to last listing, padded by half a day on
# either side.
fig_date.update_xaxes(
    range=[
        df_official.Exposure_date_dtm.min() - dt.timedelta(hours=12),
        df_official.Added_date_dtm.max() + dt.timedelta(hours=12),
    ],
    tickangle=60,
    nticks=10,
)

fig_date.show()

In [627]:
# Histogram of the dates on which sites were added, coloured by tier.
fig_added = px.histogram(
    df_official.sort_values(by='tier'),
    x='Added_date_dtm',
    color='tier',
    nbins=nbins.days+1,
    color_discrete_sequence=px.colors.qualitative.Set1[0:3],
    template='plotly_dark',
    title='Added date categorised by tier',
)

# Reference lines drawn at noon of each marked date, each with a text label:
# (line position, label x, label y, label text, label anchor).
for line_x, label_x, label_y, label_text, anchor in [
    (dt.datetime(2021, 5, 27, 12, 0), '2021-05-27', 80, '  5km', 'left'),
    (dt.datetime(2021, 6, 3, 12, 0), '2021-06-03', 70, '  10km', 'left'),
    (dt.datetime(2021, 6, 10, 12, 0), '2021-06-10', 60, '  25km', 'left'),
    (dt.datetime(2021, 7, 15, 12, 0), '2021-07-15', 80, '  5km', 'right'),
]:
    fig_added.add_vline(x=line_x)
    fig_added.add_annotation(x=label_x, y=label_y, text=label_text, xanchor=anchor,
                             ax=10, ay=20, showarrow=False)

fig_added.update_layout(
    width=width,
    bargap=0.2,  # gap between neighbouring bars
    xaxis_title="Added date",
    yaxis_title="Number of exposure sites",
)

# The x axis spans first exposure to last listing, padded by half a day on
# either side.
fig_added.update_xaxes(
    range=[
        df_official.Exposure_date_dtm.min() - dt.timedelta(hours=12),
        df_official.Added_date_dtm.max() + dt.timedelta(hours=12),
    ],
    tickangle=60,
    nticks=10,
)

fig_added.show()

In [651]:
# Distribution of the reporting delay, with the median marked.
fig_delay = px.histogram(
    df_official,
    x='date_delay',
    color_discrete_sequence=[px.colors.qualitative.Set1[1]],
    template='plotly_dark',
    title='Reporting delay (reporting time minus exposure time)',
)
# The 0.5 quantile is the value describe() reports as '50%'.
fig_delay.add_vline(x=df_official.date_delay.quantile(0.5), annotation_text='median')
fig_delay.update_layout(
    xaxis_title="Delay between exposure and report",
    yaxis_title="Number of exposure sites",
    width=width,
)

In [652]:
# Mean reporting delay per exposure date, drawn as a line.
# `numeric_only=True` restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises on the text columns of df_official.
df_delay_anal = df_official.groupby('Exposure_date_dtm').mean(numeric_only=True).reset_index()

fig_delay_anal = px.line(df_delay_anal,
                        x='Exposure_date_dtm', template='plotly_dark', y='date_delay',
                        color_discrete_sequence=px.colors.qualitative.Set1[0:3],
                        title='Mean reporting delay over days')

# Reference lines drawn at noon of each marked date, each labelled at y=20:
# (line position, label x, label text).
for line_x, label_x, label_text in [
    (dt.datetime(2021, 5, 27, 12, 0), '2021-05-27', '  5km'),
    (dt.datetime(2021, 6, 3, 12, 0), '2021-06-03', '  10km'),
    (dt.datetime(2021, 6, 10, 12, 0), '2021-06-10', '  25km'),
    (dt.datetime(2021, 7, 15, 12, 0), '2021-07-15', '  5km'),
]:
    fig_delay_anal.add_vline(x=line_x)
    fig_delay_anal.add_annotation(x=label_x, y=20, text=label_text, xanchor='left',
                                  ax=10, ay=20, showarrow=False)

# One more line on 2021-05-23, labelled with three stacked words.
fig_delay_anal.add_vline(x=dt.datetime(2021, 5, 23, 12, 0))
for label_y, label_text in [(20, '  start'), (19, '  contact'), (18, '  tracing')]:
    fig_delay_anal.add_annotation(x='2021-05-23', y=label_y, text=label_text, xanchor='right',
                                  ax=10, ay=20, showarrow=False)

fig_delay_anal.update_layout(
    bargap=0.2,
    xaxis_title = "Exposure date",
    yaxis_title = "Mean reporting delay",
    width = width
)

# The x axis spans first exposure to last listing, padded by half a day on
# either side.
fig_delay_anal.update_xaxes(
    tickangle = 60,
    nticks = 10,
    range=[df_official.Exposure_date_dtm.min() - dt.timedelta(hours=12), df_official.Added_date_dtm.max() + dt.timedelta(hours=12)]
)

fig_delay_anal.show()

In [653]:
# Colour sequence for the delay histogram: the first 10 steps of an 11-step
# gradient from (253,231,37) to (168,1,184), followed by 20 copies of the end
# colour.
cols = px.colors.n_colors((253, 231, 37), (168, 1, 184), 11)
cols_norm = [f'rgb{tuple(rgb)}' for rgb in cols]
cols_cut = cols_norm[:10] + ['rgb(168,1,184)'] * 20

In [654]:
# Histogram of exposure dates, with bars split and coloured by reporting delay.
fig_repdelay = px.histogram(
    df_official.sort_values(by='date_delay'),
    x='Exposure_date_dtm',
    color='date_delay',
    nbins=nbins.days+1,
    color_discrete_sequence=cols_cut,
    template='plotly_dark',
    title='Reporting delay for different exposure dates',
)

# Reference lines drawn at noon of each marked date, each with a text label:
# (line position, label x, label y, label text, label anchor).
for line_x, label_x, label_y, label_text, anchor in [
    (dt.datetime(2021, 5, 27, 12, 0), '2021-05-27', 80, '  5km', 'left'),
    (dt.datetime(2021, 6, 3, 12, 0), '2021-06-03', 60, '  10km', 'left'),
    (dt.datetime(2021, 6, 10, 12, 0), '2021-06-10', 40, '  25km', 'left'),
    (dt.datetime(2021, 7, 15, 12, 0), '2021-07-15', 80, '  5km', 'right'),
]:
    fig_repdelay.add_vline(x=line_x)
    fig_repdelay.add_annotation(x=label_x, y=label_y, text=label_text, xanchor=anchor,
                                ax=10, ay=20, showarrow=False)

fig_repdelay.update_layout(
    width=width,
    bargap=0.2,  # gap between neighbouring bars
    xaxis_title="Exposure date",
    yaxis_title="Number of exposure sites",
)

# The x axis spans first exposure to last listing, padded by half a day on
# either side.
fig_repdelay.update_xaxes(
    range=[
        df_official.Exposure_date_dtm.min() - dt.timedelta(hours=12),
        df_official.Added_date_dtm.max() + dt.timedelta(hours=12),
    ],
    tickangle=60,
    nticks=10,
)

fig_repdelay.show()

In [655]:
# 2-D histogram of exposure date (x) against date added (y).
# Both axes request the same bin count: the number of days from the first
# exposure to the last listing, plus one.
nbinsx = (df_official.Added_date_dtm.max() - df_official.Exposure_date_dtm.min()).days + 1
nbinsy = nbinsx

fig_heat = px.density_heatmap(
    df_official,
    x="Exposure_date_dtm",
    y="Added_date_dtm",
    nbinsx=nbinsx,
    nbinsy=nbinsy,
    color_continuous_scale='viridis',
    template='plotly_dark',
    title = 'Exposure site counts by date added',
)

# Cross-hairs at noon of each marked date, labelled at the crossing:
# (line position, label date, label text, arrow x offset, arrow y offset).
for line_at, label_date, label_text, arrow_dx, arrow_dy in [
    (dt.datetime(2021, 5, 27, 12, 0), '2021-05-27', 'lockdown four', 20, 20),
    (dt.datetime(2021, 7, 15, 12, 0), '2021-07-15', 'lockdown five', 15, -20),
]:
    fig_heat.add_vline(x=line_at)
    fig_heat.add_hline(y=line_at)
    fig_heat.add_annotation(x=label_date, y=label_date, text=label_text, xanchor='left',
                            ax=arrow_dx, ay=arrow_dy)

# Both axes share one range: first exposure to last listing, padded by half a
# day on either side.
date_span = [
    df_official.Exposure_date_dtm.min() - dt.timedelta(hours=12),
    df_official.Added_date_dtm.max() + dt.timedelta(hours=12),
]
fig_heat.update_layout(
    width=width,
    plot_bgcolor=px.colors.sequential.Viridis[0],
    xaxis_title="Exposure Date",
    xaxis_tickangle=60,
    xaxis_side='bottom',
    xaxis_nticks=nbinsx//2,
    xaxis_showgrid=False,
    xaxis_range=date_span,
    yaxis_title='Date Added',
    yaxis_showgrid=False,
    yaxis_range=list(date_span),
)

In [656]:
# Persist the merged dataset. The index is written too, which is the unnamed
# first column dropped when this file is read back at the top of the notebook.
df_official.to_csv('data_incl_historical_values.csv')

---

# pretty but useless 

In [None]:
import seaborn as sns
sns.set_theme(style="darkgrid")

# Joint plot of exposure date against date added: a filled KDE in the centre
# panel and histograms (nbinsx bins each) on the margins.
g = sns.JointGrid(data=df_official, x="Exposure_date_dtm", y="Added_date_dtm", space=0)
g.plot_joint(sns.kdeplot,
             fill=True, 
             thresh=0, levels=10, cmap="viridis")
g.plot_marginals(sns.histplot, color="#03051A", alpha=1, bins=nbinsx)

In [None]:
# Row and column count of the merged dataset.
df_official.shape

# mappin'

Postcode data from: https://www.corra.com.au/australian-postcode-location-data/

In [None]:
# Load the postcode location table, name its key column like the one in
# df_official, store the key as pandas strings, and keep the first row per
# postcode.
df_loc = (
    pd.read_csv('Australian_Post_Codes_Lat_Lon.csv')
    .rename(columns={'postcode': 'Site_postcode'})
)
df_loc['Site_postcode'] = df_loc['Site_postcode'].astype('string')
df_loc = df_loc.drop_duplicates(subset=['Site_postcode'])

In [None]:
# List the columns available in the merged dataset.
df_official.columns

In [None]:
# Count of non-null `_id` values per (exposure date, postcode) pair.
a = df_official.groupby(['Exposure_date_dtm', 'Site_postcode'])['_id'].count().reset_index()

In [None]:
# Preview the first rows of the per-day, per-postcode counts.
a.head()

In [None]:
# Size of the per-day, per-postcode count table.
a.shape

In [None]:
# Distinct (postcode, suburb) pairs that occur in the dataset.
temp = df_official.drop_duplicates(['Site_postcode', 'Suburb'])[['Site_postcode', 'Suburb']]

temp.shape

In [None]:
# Number of distinct postcodes in each of the two tables.
a.Site_postcode.unique().size, temp.Site_postcode.unique().size

In [None]:
# How many distinct suburbs each postcode is paired with.
temp.Site_postcode.value_counts()

In [None]:
# Postcodes of each table as sets, for set comparisons.
a_pcs = set(a.Site_postcode.unique())
t_pcs = set(temp.Site_postcode.unique())

In [None]:
# Add the suburb name(s) recorded for each postcode to the daily counts.
# A postcode paired with several suburbs yields one row per suburb.
a2 = pd.merge(a, temp, how='left', on='Site_postcode')

In [None]:
# Size after the merge, to compare with a.shape.
a2.shape

In [None]:
# Inspect 15 random rows of the merged counts.
a2.sample(15)

In [None]:
# Attach the postcode location columns to the per-day, per-postcode counts.
# NOTE(review): later cells read df_map['Suburb'], which `a` does not carry
# (a2 does) -- confirm whether a2 was meant here.
df_map = a.merge(df_loc, on=['Site_postcode'], how='left')
# Whole days from the moment this cell runs to each exposure date (negative
# for dates in the past).
df_map['days_in_the_past'] = df_map.Exposure_date_dtm.subtract(dt.datetime.today()).dt.days

In [None]:
# Number of rows in the map table.
len(df_map)

In [None]:
# Order the rows by count, largest first (in place).
df_map.sort_values(by='_id', inplace=True, ascending=False)

In [None]:
# Spot-check the two suburb columns side by side on 10 random rows.
df_map[['suburb', 'Suburb']].sample(10)

In [None]:
# Row and column count of the map table.
df_map.shape

In [None]:
# Case-insensitive, row-wise comparison of the two suburb columns.
# Despite its name, sub_diff is True where the names AGREE.
sub_diff = df_map.suburb.str.casefold() == df_map.Suburb.str.casefold()

In [None]:
# Fraction of rows whose two suburb names agree.
sub_diff.mean()

In [None]:
# Distinct suburb pairs whose names disagree.
df_map.loc[~sub_diff].drop_duplicates(subset=['suburb', 'Suburb'])

In [None]:
# Bubble map of the per-day postcode counts: marker size follows the count,
# marker colour follows the exposure date's offset from today.
fig_map = px.scatter_mapbox(
    df_map,
    lat='lat',
    lon='lon',
    size='_id',
    size_max=25,
    color='days_in_the_past',
    color_discrete_sequence=px.colors.qualitative.Set1,
    opacity=0.6,
    hover_data=['Site_postcode', '_id', 'suburb'],
    mapbox_style='carto-positron',
    center={'lat': -37.45, 'lon': 144.995},
    height=800,
    width=800,
    title='Exposure sites in space and time',
)
fig_map.update_layout(coloraxis_colorbar_title='Days from today')
fig_map.show()

In [None]:
import kaleido as kd

In [None]:
# Export the map as a JPEG at 5x scale.
fig_map.write_image('map.jpg', scale=5)

In [None]:
# Save the map as index.html, with plotly.js referenced from the CDN.
# NOTE(review): write_html appears to return None, so map_html likely holds
# nothing useful -- confirm before relying on it.
map_html = fig_map.write_html('index.html', include_plotlyjs='cdn')

In [None]:
# The 20 postcodes that occur most often in the location table.
df_loc.Site_postcode.value_counts().nlargest(20)

In [None]:
# Number of entries in the postcode column.
df_loc.Site_postcode.shape

In [None]:
# Rows whose type is exactly "Delivery Area".
df_loc.query('type == "Delivery Area"')

In [None]:
# Trim surrounding whitespace from the type labels so exact-match queries
# find every row.
df_loc.type = df_loc.type.str.strip()

In [None]:
# Size of the "Delivery Area" subset when limited to one row per postcode.
df_loc.query('type == "Delivery Area"').drop_duplicates(subset=['Site_postcode']).shape

In [None]:
# Look at the entries for postcode 3189.
df_loc.query('Site_postcode == "3189"')

In [None]:
# Preview the first rows of the location table.
df_loc.head()

In [None]:
# Show the table without fully duplicated rows (displayed only, not stored).
df_loc.drop_duplicates()

# Those old things

In [None]:
from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException

In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Chrome launch options for the scraping session.
option = webdriver.ChromeOptions()
# Command-line switches must start with two ASCII hyphens; an em dash padded
# with spaces is not recognised as a switch.
option.add_argument("--incognito")

In [None]:
def scrape():
    """Return the table shown on the page currently open in ``browser``.

    Reads the module-level ``browser`` driver, takes the inner HTML of the
    element with class ``ch-exposure-sites-search-form`` and parses it with
    ``pandas.read_html``.

    Returns:
        pandas.DataFrame: the first table found inside that element.
    """
    # Local import keeps the block self-contained; read_html deprecates being
    # handed literal HTML strings, so the markup is wrapped in a file-like.
    from io import StringIO

    # The `find_element_by_*` helpers no longer exist in Selenium 4; the
    # By-based locator works across versions.
    table = browser.find_element(By.CLASS_NAME, 'ch-exposure-sites-search-form')
    tables = pd.read_html(StringIO(table.get_attribute('innerHTML')))
    return tables[0]

In [None]:
# Start a Chrome session driven by a local chromedriver binary.
# NOTE: the driver path is machine-specific; adjust it before running elsewhere.
# `options=` replaces the deprecated `chrome_options=` keyword.
browser = webdriver.Chrome(executable_path='/Users/ikiko/Desktop/chromedriver', options=option)

In [None]:
# Open the exposure-sites page.
browser.get("https://www.coronavirus.vic.gov.au/exposure-sites")

In [None]:
# Scrape the table of the page that is showing after load.
elements = scrape()

In [None]:
# Click the pagination step at index 1.
# By-based locators are used because the `find_element(s)_by_*` helpers no
# longer exist in Selenium 4.
nextpagebutton1 = (
    browser.find_element(By.CLASS_NAME, 'rpl-pagination__list')
    .find_elements(By.CLASS_NAME, 'rpl-pagination__step')[1]
)
nextpagebutton1.click()

In [None]:
# Append the rows of the page now showing.
elements = pd.concat([elements, scrape()])

In [None]:
# Click the pagination step at index 2.
# By-based locators are used because the `find_element(s)_by_*` helpers no
# longer exist in Selenium 4.
nextpagebutton2 = (
    browser.find_element(By.CLASS_NAME, 'rpl-pagination__list')
    .find_elements(By.CLASS_NAME, 'rpl-pagination__step')[2]
)
nextpagebutton2.click()

In [None]:
# Append the rows of the page now showing.
elements = pd.concat([elements, scrape()])

In [None]:
# Repeatedly click the pagination step at index 3, scraping after each click,
# and look the button up again afterwards so the next click uses a fresh
# element. The page range 3..20 is hard-coded.
# By-based locators are used because the `find_element(s)_by_*` helpers no
# longer exist in Selenium 4.
nextpagebutton_continuous = (
    browser.find_element(By.CLASS_NAME, 'rpl-pagination__list')
    .find_elements(By.CLASS_NAME, 'rpl-pagination__step')[3]
)
for i in range(3, 21):
    nextpagebutton_continuous.click()
    elements = pd.concat([elements, scrape()])
    nextpagebutton_continuous = (
        browser.find_element(By.CLASS_NAME, 'rpl-pagination__list')
        .find_elements(By.CLASS_NAME, 'rpl-pagination__step')[3]
    )


In [None]:
# Click the pagination step at index 4 and scrape the resulting page.
# By-based locators are used because the `find_element(s)_by_*` helpers no
# longer exist in Selenium 4.
nextpagebutton_last = (
    browser.find_element(By.CLASS_NAME, 'rpl-pagination__list')
    .find_elements(By.CLASS_NAME, 'rpl-pagination__step')[4]
)
nextpagebutton_last.click()
elements = pd.concat([elements, scrape()])

In [None]:
# Scraping is finished; close the browser window.
browser.close()

In [None]:
# Number of scraped rows before any cleaning.
len(elements)

In [None]:
# Drop the '.1'-suffixed columns (presumably duplicates produced when the
# HTML table was parsed -- confirm against the scraped frame).
elements = elements.drop(columns=['Suburb.1', 'Site.1', 'Notes.1', 'Health advice.1'])

In [None]:
# Renumber the rows 0..n-1 and discard the old per-page index.
elements = elements.reset_index(drop=True)

In [None]:
# The date is the third space-separated token of each text column.
elements['date_exposure'] = elements['Exposure period'].map(lambda text: text.split(' ')[2])
elements['date_added'] = elements['Date added'].map(lambda text: text.split(' ')[2])

In [None]:
# Remove rows that are exact duplicates of an earlier row.
elements = elements.drop_duplicates()

The date is not available for some entries; these show up as 'Data' in the date_added column.

In [None]:
# Keep only the rows whose extracted added-date token is not the word "Data".
elements = elements[elements.date_added != "Data"]

In [None]:
# Parse both extracted date strings day-first.
elements['date_added'] = pd.to_datetime(elements['date_added'], dayfirst=True)
elements['date_exposure'] = pd.to_datetime(elements['date_exposure'], dayfirst=True)
# The tier is the fourth space-separated token of the health advice text.
elements['tier'] = elements['Health advice'].map(lambda advice: advice.split(' ')[3])

In [None]:
# Filter out outliers: keep only exposures dated before 2021-10-01.
elements = elements[elements.date_exposure < '2021-10-01']

In [None]:
import plotly.express as px

In [None]:
# Scatter of exposure date (x) against date added (y), drawn with large,
# half-transparent markers.
fig = px.scatter(elements, y="date_added", x="date_exposure", opacity=0.5, hover_data=['Suburb', 'Site'], template='plotly_dark')
fig.update_traces(marker_size = 20)

In [None]:
import plotly.graph_objects as go

# 20x20-bin 2-D histogram of exposure date (x) against date added (y) for the
# scraped data.
fig_heat = px.density_heatmap(elements, y="date_added", x="date_exposure", hover_data=['Suburb', 'Site'], 
                         template='plotly_dark', nbinsx=20, nbinsy=20, color_continuous_scale='viridis'
                        , title = 'Exposure site counts by date added')
# Cross-hairs and a label at 2021-05-27.
fig_heat.add_vline('2021-05-27')
fig_heat.add_hline('2021-05-27')
fig_heat.add_annotation(x = '2021-05-27', y = '2021-05-27', text='lockdown', xanchor='left', ax=10, ay=20)
fig_heat.update_layout(xaxis_title = "Exposure Date", yaxis_title='Date Added')


In [None]:
# Strip plot of the exposure dates.
px.strip(elements, y="date_exposure", template='plotly_dark')

In [None]:
# Reporting delay as a timedelta: date added minus exposure date.
elements['date_delay'] = elements.date_added - elements.date_exposure

In [None]:
# Reduce the timedelta to its whole-day count.
elements.date_delay = elements.date_delay.dt.days

In [None]:
# Distribution of the reporting delay in the scraped data, with the median
# marked.
fig_delay = px.histogram(
    elements,
    x='date_delay',
    template='plotly_dark',
    title='Reporting delay (difference between reporting and exposure time)',
)
# The 0.5 quantile is the value describe() reports as '50%'.
fig_delay.add_vline(x=elements.date_delay.quantile(0.5), annotation_text='median')


In [None]:
# Histogram of scraped exposure dates, coloured by tier, with a labelled line
# at 2021-05-27.
fig_date = px.histogram(
    elements.sort_values(by='tier'),
    x='date_exposure',
    color='tier',
    color_discrete_sequence=px.colors.qualitative.Set1[0:3],
    template='plotly_dark',
    title='Exposure time',
)
fig_date.add_vline(x='2021-05-27')
fig_date.add_annotation(x='2021-05-27', y=40, text='lockdown', xanchor='left', ax=10, ay=20)

# Gap between neighbouring bars.
fig_date.update_layout(bargap=0.2)

fig_date.update_xaxes(nticks=10, tickangle=60)

fig_date.show()

In [None]:
# Summary statistics of the exposure dates.
elements.date_exposure.describe()

In [None]:
# Render the three figures one after another.
fig_heat.show()
fig_date.show()
fig_delay.show()

In [None]:
# Persist the cleaned scrape (the index is written as well).
elements.to_csv('scraped_data.csv')

In [None]:
# Number of rows left after cleaning.
len(elements)