In [1]:
from datetime import datetime

import geopandas as gp
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:


def quarter_start(year: int, q: int) -> datetime:
    """Return the first day (at midnight) of quarter ``q`` in ``year``.

    Args:
        year: Calendar year.
        q: Quarter number, 1 through 4.

    Returns:
        A ``datetime`` for 1 January, 1 April, 1 July or 1 October of ``year``.

    Raises:
        ValueError: If ``q`` is not between 1 and 4 inclusive.
    """
    # First month of each quarter, indexed by (q - 1).
    first_months = (1, 4, 7, 10)

    if 1 <= q <= 4:
        return datetime(year, first_months[q - 1], 1)

    raise ValueError("Quarter must be within [1, 2, 3, 4]")

def get_tile_url(service_type: str, year: int, q: int) -> str:
    """Build the download URL of a quarterly performance-tiles shapefile zip.

    Args:
        service_type: Inserted verbatim into the ``type%3D...`` path segment
            and the file name (the notebook passes ``"fixed"``).
        year: Calendar year of the release.
        q: Quarter number, 1 through 4.

    Returns:
        The URL of the ``*_performance_<service_type>_tiles.zip`` archive
        under the ``ookla-open-data`` bucket path for that year and quarter.

    Raises:
        ValueError: Propagated from ``quarter_start`` when ``q`` is not in 1..4.
    """
    base_url = "https://ookla-open-data.s3-us-west-2.amazonaws.com/shapefiles/performance"

    # The file name is prefixed with the quarter's first day (YYYY-MM-DD).
    start = quarter_start(year, q)

    return f"{base_url}/type%3D{service_type}/year%3D{start:%Y}/quarter%3D{q}/{start:%Y-%m-%d}_performance_{service_type}_tiles.zip"

In [3]:
# Years to process: 2019 through 2023 inclusive.
years = list(range(2019, 2024))

In [4]:
# Read the SAL 2021 boundary shapefile and keep only the rows whose
# STE_NAME21 equals "Victoria".
# NOTE(review): presumably the ABS "Suburbs and Localities" layer - confirm.
SAL = gp.read_file('../../data/landing/SAL_data/SAL_2021_AUST_GDA2020.shp').loc[
    lambda frame: frame["STE_NAME21"] == "Victoria"
]

In [5]:
# For each year: download the Q4 "fixed" performance tiles, join them to the
# SAL polygons, and write one CSV of test-weighted download speed per SAL code.
for year in years:
    print(year)
    tile_url = get_tile_url("fixed", year, 4)
    tiles = gp.read_file(tile_url)

    # Both layers must share a CRS before the spatial join.
    tiles = tiles.to_crs(SAL.crs)
    # One row per (SAL polygon, intersecting tile) pair; a tile that touches
    # several polygons contributes to each of them.
    tiles_in_SA2 = gp.sjoin(SAL, tiles, how="inner", predicate="intersects")

    # Convert kbps -> Mbps.
    tiles_in_SA2["avg_d_mbps"] = tiles_in_SA2["avg_d_kbps"] / 1000
    tiles_in_SA2["avg_u_mbps"] = tiles_in_SA2["avg_u_kbps"] / 1000

    # Test-weighted mean download speed per SAL code, computed as
    # sum(speed * tests) / sum(tests) with plain vectorised group sums.
    # This replaces groupby.apply(np.average) + merge: same result up to
    # floating-point summation order, but no per-group Python callback, no
    # pandas "apply operated on the grouping columns" deprecation warning,
    # and a group whose tests sum to 0 yields NaN instead of raising.
    weighted = tiles_in_SA2[["SAL_CODE21", "tests"]].assign(
        mbps_x_tests=tiles_in_SA2["avg_d_mbps"] * tiles_in_SA2["tests"]
    )
    totals = weighted.groupby("SAL_CODE21", as_index=False)[
        ["mbps_x_tests", "tests"]
    ].sum()

    # NOTE(review): the key is a SAL code, but the output column has always
    # been written as "SA2_CODE"; the name is kept so existing readers of the
    # CSV keep working - confirm before renaming.
    SA2_stats = pd.DataFrame(
        {
            "SA2_CODE": totals["SAL_CODE21"],
            "avg_d_mbps_wt": totals["mbps_x_tests"] / totals["tests"],
            "tests": totals["tests"],
        }
    )

    # save the data
    SA2_stats.to_csv(f"../../data/curated/{year}_fixed_broadband.csv", index=False)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (2856448408.py, line 13)