In [3]:
import pandas as pd
import time
import requests
import json
import math

data = pd.read_json("https://data.nsw.gov.au/data/api/3/action/datastore_search?resource_id=2776dbb8-f807-4fb2-b1ed-184a6fc2c8aa&limit=10000")
data = pd.json_normalize(data.result.records)
data = data.query("likely_source_of_infection != 'Overseas'")
data = data[["notification_date", "lga_code19"]]
data.columns = ["date", "code"]
data.date = pd.to_datetime(data.date)
data["cases"] = [1] * len(data)

data = data.groupby(["date", "code"]).sum().reset_index()
data.code = data.code.astype("int")
data.dropna(inplace=True)
data.sort_values("date", inplace=True)

data.describe(datetime_is_numeric=True)

act_data = pd.read_csv("./actCases.csv")
act_data.date = pd.to_datetime(act_data.date, format="%d/%m/%Y")
act_data.sort_values("date", inplace=True)
act_data["code"] = 89399

act_data.describe(datetime_is_numeric=True)

data = pd.concat([data, act_data], ignore_index=True)
date_start = data.date.min()
date_end = data.date.max()
date_length = (date_end - date_start).days

data_chart = pd.DataFrame(columns=["date", "code", "cases"])

count = 0
print("date being processed:")
while date_start + pd.Timedelta(count, unit="d") <= date_end:
    date_current = date_start + pd.Timedelta(count, unit="d")    
    print(date_current, end="\r")
    for code in data.code.unique():
        data_temp = data[(data.date == date_current) & (data.code == code)]
        if len(data_temp) == 0:
            data_chart.loc[len(data_chart)] = [date_current, code, 0]
        else:
            data_chart.loc[len(data_chart)] = [date_current, code, data_temp.iat[0, 2]]
    count = count + 1
print("date processing completed")

timer = time.time()
interval = 1
print("processing recent cases ...", end="\r")
for i, indice in enumerate(data_chart.index):
    data_temp = data_chart[(data_chart.code == data_chart.at[indice, "code"]) & (data_chart.date > data_chart.at[indice, "date"] - pd.Timedelta("14 days")) & (data_chart.date <= data_chart.at[indice, "date"])]
    data_chart.at[indice, "recent"] = data_temp.cases.sum()
    if time.time() - interval > timer:
        interval = interval + 1
        print(f"processing recent cases ... {(i + 1) / len(data_chart):.1%}", end="\r")
print("processing recent cases ... complete")

data_chart.to_csv("./cases.csv", index=False)

date being processed:
date processing completed
processing recent cases ... complete


In [8]:
data_pop = pd.read_csv("./lga_pop.csv")
data_pop = data_pop.query("POP_COMP == 10 & TIME == 2020")
data_pop = data_pop[["LGA_2020", "Region", "Value"]]
data_pop.columns = ["code", "name", "pop"]
data_pop = data_pop[(data_pop["code"].apply(lambda x: len(str(x)) == 5)) & (data_pop["code"].apply(lambda x: (str(x)[0] == "1") or (str(x)[0]) == "8" ))]
data_pop["name"] = data_pop["name"].apply(lambda x: x[:x.find(" (")] if x.find(" (") > -1 else "ACT" if x == "Unincorporated ACT" else x)

codes = data_pop.code.unique()
names = data_pop.name.unique()

df = pd.DataFrame(columns=["code", "name", "pop", "cases"])
df["code"] = codes
df["name"] = names

for i in df.index:
    try:
        pop = data_pop[data_pop.code == df.at[i, "code"]].iat[0, 2]
    except:
        pop = 0
    df.at[i, "pop"] = pop
    df.at[i, "cases"] = data_chart[data_chart["code"] == df.at[i, "code"]].cases.sum()

data_distance = pd.read_csv("./dataDistance.csv")
data_distance["lng"] = data_distance["location"].apply(lambda x: float(x.split(",")[0]))
data_distance["lat"] = data_distance["location"].apply(lambda x: float(x.split(",")[1]))

for i in df.index:
    df.at[i, "area"] = data_distance[data_distance["code"] == df.at[i, "code"]].iat[0, 2]
    df.at[i, "lng"] = data_distance[data_distance["code"] == df.at[i, "code"]].iat[0, 4]
    df.at[i, "lat"] = data_distance[data_distance["code"] == df.at[i, "code"]].iat[0, 5]
    df.at[i, "density"] = df.at[i, "pop"] / df.at[i, "area"]
    df.at[i, "location"] = data_distance[data_distance["code"] == df.at[i, "code"]].iat[0, 3]

start = time.time()
interval = 1
print("processing driving times ...", end="\r")
for j, i in enumerate(df.index):
    r = requests.get(f"http://router.project-osrm.org/route/v1/car/151.209900,-33.865143;{df.at[i, 'location']}?overview=false")
    routes = json.loads(r.content)
    try:
        duration = routes.get("routes")[0]["duration"]
    except:
        duration = float("null")
    df.at[i, "duration"] = duration
    if time.time() - interval > start:
        interval = interval + 1
        print(f"processing driving times ... {(j + 1) / len(df):.1%}", end="\r")
print("processing driving times ... complete")

df.to_csv("./data_chart.csv", index=False)
df.dtypes

processing driving times ... complete


code          int64
name         object
pop          object
cases        object
area        float64
lng         float64
lat         float64
density     float64
location     object
duration    float64
dtype: object

In [12]:
df_plot = df.query("duration > 0 & pop > 0")
df_plot["pop"] = df_plot["pop"].astype("int")
df_plot["cases"] = df_plot["cases"].astype("int")
max_pop = df_plot["pop"].max()
max_density = df_plot["density"].max()
max_dur = df_plot["duration"].apply(math.log).max()
df_plot["pop_rel"] = df_plot["pop"] / max_pop
df_plot["den_rel"] = df_plot["density"] / max_density
df_plot["pop_test"] = df_plot["pop_rel"] * df_plot["den_rel"]
df_plot["dur_rel"] = 1 - df_plot["duration"].apply(math.log) / max_dur
df_plot["combo"] = df_plot["pop_rel"] * df_plot["dur_rel"]
df_plot["combo2"] = df_plot["pop_test"] * df_plot["dur_rel"]

# df_plot = df_plot[["name", "pop", "cases", "duration"]]
# df_plot.to_csv("data_simple.csv", index=False)

# df_plot.plot.scatter("pop_rel", "cases", figsize=(12, 8))
# df_plot.plot.scatter("dur_rel", "cases", figsize=(12, 8))
# df_plot.plot.scatter("combo", "cases", figsize=(12, 8))
# df_plot.plot.scatter("combo2", "cases", figsize=(12, 8))

In [14]:
import numpy as np
from sklearn.linear_model import LinearRegression
df_plot

ModuleNotFoundError: No module named 'sklearn'