# Extracting datasets

## Relevant Imports

In [1]:
%matplotlib inline

from datetime import datetime

import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import rasterio

from adjustText import adjust_text
from shapely.geometry import Point, box
from rasterio.mask import mask

## VIIRS Data

In [2]:
# Path to the VIIRS night-time lights GeoTIFF shared by every country cell below.
# NOTE(review): the file name suggests the 2021 global annual average (VNL v2.1) — confirm.
raster_file = "VNL_v21_npp_2021_global_vcmslcfg_c202205302300.average.dat.tif"

### Japan Data

In [3]:
# Filepaths (built with os.path.join so the cell is not tied to Windows separators)
boundary_file = os.path.join('boundaries', 'geoBoundaries-JPN-ADM0.geojson')
output_csv = os.path.join('datasets', 'inputs', 'Japan_light_intensity.csv')

# Load Japan's boundary and reproject it to EPSG:4326.
# NOTE(review): this assumes the raster itself is in EPSG:4326 — the raster CRS is not checked here.
Japan = gpd.read_file(boundary_file)
Japan = Japan.to_crs(epsg=4326)

# Open the raster, verify it overlaps the boundary, then clip it
with rasterio.open(raster_file) as src:
    raster_bounds = box(*src.bounds)
    print("Raster Bounds:", src.bounds)
    print("Japan Bounds:", Japan.total_bounds)

    # union_all() replaces the deprecated `unary_union` attribute
    if not raster_bounds.intersects(Japan.union_all()):
        raise ValueError("Japan's boundary does not overlap with the raster extent.")

    # Clip the raster. filled=False returns a masked array whose mask marks every
    # pixel outside the boundary; with the default filled=True those pixels are
    # overwritten with the nodata value (0 when the raster declares none), which a
    # NaN test alone cannot tell apart from real measurements.
    Japan_geom_list = [feature["geometry"] for feature in Japan.__geo_interface__["features"]]
    clipped_raster, clipped_transform = mask(src, Japan_geom_list, crop=True, filled=False)

# Extract raster values for pixels inside the boundary only
light_intensity = clipped_raster[0]
inside_boundary = ~np.ma.getmaskarray(light_intensity)
pixel_values = np.ma.getdata(light_intensity)
rows, cols = np.where(inside_boundary & ~np.isnan(pixel_values))
values = pixel_values[rows, cols]
x_coords, y_coords = rasterio.transform.xy(clipped_transform, rows, cols)

data = pd.DataFrame({
    'longitude': x_coords,
    'latitude': y_coords,
    'light_intensity': values
})

# Make sure the destination folder exists before writing
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
data.to_csv(output_csv, index=False)
print(f"Extracted data saved to {output_csv}")


Raster Bounds: BoundingBox(left=-180.00208333335, bottom=-65.00208445335001, right=180.00208621335, top=75.00208333335)
Japan Bounds: [122.93391306  24.04561583 153.98667512  45.55723905]


  if not raster_bounds.intersects(Japan.unary_union):


Extracted data saved to datasets\inputs\Japan_light_intensity.csv


### Philippines Data

In [4]:
# Filepaths (built with os.path.join so the cell is not tied to Windows separators)
boundary_file = os.path.join('boundaries', 'geoBoundaries-PHL-ADM0.geojson')
output_csv = os.path.join('datasets', 'inputs', 'Philippines_light_intensity.csv')

# Load the Philippines' boundary and reproject it to EPSG:4326.
# NOTE(review): this assumes the raster itself is in EPSG:4326 — the raster CRS is not checked here.
Philippines = gpd.read_file(boundary_file)
Philippines = Philippines.to_crs(epsg=4326)

# Open the raster, verify it overlaps the boundary, then clip it
with rasterio.open(raster_file) as src:
    raster_bounds = box(*src.bounds)
    print("Raster Bounds:", src.bounds)
    print("Philippines Bounds:", Philippines.total_bounds)

    # union_all() replaces the deprecated `unary_union` attribute
    if not raster_bounds.intersects(Philippines.union_all()):
        raise ValueError("Philippines's boundary does not overlap with the raster extent.")

    # Clip the raster. filled=False returns a masked array whose mask marks every
    # pixel outside the boundary; with the default filled=True those pixels are
    # overwritten with the nodata value (0 when the raster declares none), which a
    # NaN test alone cannot tell apart from real measurements.
    Philippines_geom_list = [feature["geometry"] for feature in Philippines.__geo_interface__["features"]]
    clipped_raster, clipped_transform = mask(src, Philippines_geom_list, crop=True, filled=False)

# Extract raster values for pixels inside the boundary only
light_intensity = clipped_raster[0]
inside_boundary = ~np.ma.getmaskarray(light_intensity)
pixel_values = np.ma.getdata(light_intensity)
rows, cols = np.where(inside_boundary & ~np.isnan(pixel_values))
values = pixel_values[rows, cols]
x_coords, y_coords = rasterio.transform.xy(clipped_transform, rows, cols)

data = pd.DataFrame({
    'longitude': x_coords,
    'latitude': y_coords,
    'light_intensity': values
})

# Make sure the destination folder exists before writing
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
data.to_csv(output_csv, index=False)
print(f"Extracted data saved to {output_csv}")


Raster Bounds: BoundingBox(left=-180.00208333335, bottom=-65.00208445335001, right=180.00208621335, top=75.00208333335)
Philippines Bounds: [114.27790169   4.5872945  126.60495743  21.12178332]


  if not raster_bounds.intersects(Philippines.unary_union):


Extracted data saved to datasets\inputs\Philippines_light_intensity.csv


### Taiwan Data 

In [5]:
# Filepaths (built with os.path.join so the cell is not tied to Windows separators)
boundary_file = os.path.join('boundaries', 'geoBoundaries-TWN-ADM0.geojson')
output_csv = os.path.join('datasets', 'inputs', 'Taiwan_light_intensity.csv')

# Load Taiwan's boundary and reproject it to EPSG:4326.
# NOTE(review): this assumes the raster itself is in EPSG:4326 — the raster CRS is not checked here.
Taiwan = gpd.read_file(boundary_file)
Taiwan = Taiwan.to_crs(epsg=4326)

# Open the raster, verify it overlaps the boundary, then clip it
with rasterio.open(raster_file) as src:
    raster_bounds = box(*src.bounds)
    print("Raster Bounds:", src.bounds)
    print("Taiwan Bounds:", Taiwan.total_bounds)

    # union_all() replaces the deprecated `unary_union` attribute
    if not raster_bounds.intersects(Taiwan.union_all()):
        raise ValueError("Taiwan's boundary does not overlap with the raster extent.")

    # Clip the raster. filled=False returns a masked array whose mask marks every
    # pixel outside the boundary; with the default filled=True those pixels are
    # overwritten with the nodata value (0 when the raster declares none), which a
    # NaN test alone cannot tell apart from real measurements.
    Taiwan_geom_list = [feature["geometry"] for feature in Taiwan.__geo_interface__["features"]]
    clipped_raster, clipped_transform = mask(src, Taiwan_geom_list, crop=True, filled=False)

# Extract raster values for pixels inside the boundary only
light_intensity = clipped_raster[0]
inside_boundary = ~np.ma.getmaskarray(light_intensity)
pixel_values = np.ma.getdata(light_intensity)
rows, cols = np.where(inside_boundary & ~np.isnan(pixel_values))
values = pixel_values[rows, cols]
x_coords, y_coords = rasterio.transform.xy(clipped_transform, rows, cols)

data = pd.DataFrame({
    'longitude': x_coords,
    'latitude': y_coords,
    'light_intensity': values
})

# Make sure the destination folder exists before writing
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
data.to_csv(output_csv, index=False)
print(f"Extracted data saved to {output_csv}")


Raster Bounds: BoundingBox(left=-180.00208333335, bottom=-65.00208445335001, right=180.00208621335, top=75.00208333335)
Taiwan Bounds: [118.20920337  21.89259297 122.03704767  26.25789185]


  if not raster_bounds.intersects(Taiwan.unary_union):


Extracted data saved to datasets\inputs\Taiwan_light_intensity.csv


## Ookla Data

### Helper Functions for Quarter Management

In [6]:
def quarter_start(year: int, q: int) -> datetime:
    """Return the first day of calendar quarter ``q`` of ``year``.

    Args:
        year: Calendar year.
        q: Quarter number, 1 through 4.

    Returns:
        A ``datetime`` at midnight on the first day of the quarter
        (1 January, 1 April, 1 July or 1 October).

    Raises:
        ValueError: If ``q`` is outside the range 1-4.
    """
    first_months = (1, 4, 7, 10)
    if 1 <= q <= 4:
        return datetime(year, first_months[q - 1], 1)
    raise ValueError("Quarter must be within [1, 2, 3, 4]")

def get_tile_url(service_type: str, year: int, q: int) -> str:
    """Build the download URL of the Ookla performance-tiles shapefile archive.

    Args:
        service_type: Value inserted into the ``type=`` path segment and the
            file name (the notebook uses "fixed" or "mobile").
        year: Calendar year of the release.
        q: Quarter number, 1 through 4.

    Returns:
        The URL of the zipped shapefile for that service type and quarter.

    Raises:
        ValueError: Propagated from ``quarter_start`` when ``q`` is not 1-4.
    """
    start = quarter_start(year, q)
    # "%3D" is the URL-encoded "=" used in the bucket's key=value path segments.
    segments = (
        "https://ookla-open-data.s3-us-west-2.amazonaws.com/shapefiles/performance",
        f"type%3D{service_type}",
        f"year%3D{start:%Y}",
        f"quarter%3D{q}",
        f"{start:%Y-%m-%d}_performance_{service_type}_tiles.zip",
    )
    return "/".join(segments)

### Downloading and Processing All Quarters for 2021

In [7]:
# Selection of the Ookla release to download
year = 2021
service_type = "fixed"  # Can be "fixed" or "mobile"

#### Defining the Boundaries

In [8]:
# Folder holding the country boundary GeoJSON files
boundaries_folder = "./boundaries/"

# Country name -> path of its ADM0 boundary file, keyed by the three-letter
# code that appears in the file name.
geojson_files = {
    country: os.path.join(boundaries_folder, f"geoBoundaries-{code}-ADM0.geojson")
    for country, code in (
        ("Philippines", "PHL"),
        ("Japan", "JPN"),
        ("Taiwan", "TWN"),
    )
}

#### Extracting the Data

In [9]:
# Load every country boundary once, up front: the boundaries do not change
# between quarters, so re-reading and re-projecting them inside the quarter loop
# is wasted work.
boundaries = {}
for country, geojson_path in geojson_files.items():
    boundaries[country] = gpd.read_file(geojson_path).to_crs(4326)  # Ensure CRS matches the tiles

# Per-country, per-quarter tile subsets are collected here and concatenated once
all_tiles = []

for q in range(1, 5):  # Loop through quarters
    tile_url = get_tile_url(service_type, year, q)
    print(f"Fetching data from: {tile_url}")

    # Read the tiles for the specific quarter
    tiles = gpd.read_file(tile_url)
    # Tag the tiles with their quarter number once, as an integer; the spatial
    # join below carries this column into every country subset.
    tiles["quarter"] = q

    for country, boundary in boundaries.items():
        # Keep only the tiles that intersect the country's boundary
        country_tiles = gpd.sjoin(tiles, boundary, how="inner", predicate='intersects')

        # Convert speeds from kbps to Mbps
        country_tiles['avg_d_mbps'] = country_tiles['avg_d_kbps'] / 1000
        country_tiles['avg_u_mbps'] = country_tiles['avg_u_kbps'] / 1000
        country_tiles['country'] = country  # Add a column to distinguish countries

        # Append to the list
        all_tiles.append(country_tiles)

# Combine all results into a single GeoDataFrame
combined_tiles = gpd.GeoDataFrame(pd.concat(all_tiles, ignore_index=True))
print("Combined tiles data:")
print(combined_tiles.head())

# Save combined data to CSV
combined_tiles.to_csv("datasets/inputs/Global_internet_combined_tiles.csv", index=False)

Fetching data from: https://ookla-open-data.s3-us-west-2.amazonaws.com/shapefiles/performance/type%3Dfixed/year%3D2021/quarter%3D1/2021-01-01_performance_fixed_tiles.zip
Fetching data from: https://ookla-open-data.s3-us-west-2.amazonaws.com/shapefiles/performance/type%3Dfixed/year%3D2021/quarter%3D2/2021-04-01_performance_fixed_tiles.zip
Fetching data from: https://ookla-open-data.s3-us-west-2.amazonaws.com/shapefiles/performance/type%3Dfixed/year%3D2021/quarter%3D3/2021-07-01_performance_fixed_tiles.zip
Fetching data from: https://ookla-open-data.s3-us-west-2.amazonaws.com/shapefiles/performance/type%3Dfixed/year%3D2021/quarter%3D4/2021-10-01_performance_fixed_tiles.zip
Combined tiles data:
            quadkey  avg_d_kbps  avg_u_kbps  avg_lat_ms  tests  devices  \
0  1323012130133323       48367       43153          16      6        2   
1  1323012130311101        4690        3953          27      1        1   
2  1323012130311103        9005        2289          25      1        1   