# Data Preprocessing

## Loading and Normalizing CSV Data

### Relevant Imports

In [1]:
import geopandas as gpd
import pandas as pd

### VIIRS Data

#### Normalisation Method

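We'll use Min-Max Normalisation for our datasets. Each value $v$ of an attribute $A$ (here, the `light_intensity` column) is rescaled to the range $[0, 1]$ using the minimum and maximum of that dataset, so each country is normalised independently:
$$v' = \frac{v - \min_A}{\max_A - \min_A}$$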

#### Japan

##### Light Intensity Normalization

In [2]:
# Input: extracted VIIRS light-intensity CSV for Japan.
# Output: the same rows plus a min-max normalized column.
input_csv = "datasets/inputs/Japan_light_intensity.csv"
output_normalized_csv = "datasets/inputs/Japan_light_intensity_normalized.csv"

# Loading the extracted data
data = pd.read_csv(input_csv)

# Compute the min-max bounds once and reuse them below
intensity_min = data['light_intensity'].min()
intensity_range = data['light_intensity'].max() - intensity_min

# A zero range (constant column) or a NaN range (empty / all-NaN column) would
# turn every normalized value into NaN, and the later `>= threshold` filter
# would then silently keep no rows. Fail loudly instead.
if not intensity_range > 0:
    raise ValueError(
        f"Cannot min-max normalize 'light_intensity' from {input_csv}: "
        "the column is empty, all-NaN, or constant."
    )

# Normalizing the light intensity column to the [0, 1] interval
data['normalized_light_intensity'] = (data['light_intensity'] - intensity_min) / intensity_range

# Save the normalized data to a new CSV (index omitted)
data.to_csv(output_normalized_csv, index=False)
print(f"Normalized VIIRS data saved to {output_normalized_csv}")

Normalized VIIRS data saved to datasets/inputs/Japan_light_intensity_normalized.csv


##### High-Radiance Zones Extraction

In [3]:
# Source (normalized pixels) and destination (high-radiance subset) for Japan
normalized_csv = "datasets/inputs/Japan_light_intensity_normalized.csv"
urban_output_csv = "datasets/processed/Japan_urban_light_intensity.csv"

# Read the normalized table and discard the raw 'light_intensity' column
normalized_data = pd.read_csv(normalized_csv).drop(columns='light_intensity')

# Rows at or above this normalized intensity are kept as high-radiance zones
threshold = 0.3

# Build the boolean mask first, then select the matching rows
is_high_radiance = normalized_data['normalized_light_intensity'].ge(threshold)
urban_data = normalized_data.loc[is_high_radiance]

# Write the subset to disk without the DataFrame index
urban_data.to_csv(urban_output_csv, index=False)
print(f"Urban areas data saved to {urban_output_csv}")

Urban areas data saved to datasets/processed/Japan_urban_light_intensity.csv


In [4]:
# Preview the first rows of the Japan high-radiance subset
urban_data.head()

Unnamed: 0,longitude,latitude,normalized_light_intensity
4484274,141.350003,43.054166,0.353907
4484275,141.354169,43.054166,0.46118
4484276,141.358336,43.054166,0.354317
8986526,141.500003,40.5375,0.523353
8986527,141.504169,40.5375,0.654899


In [5]:
# Summary statistics for the Japan subset; the minimum of
# normalized_light_intensity should not fall below the 0.3 threshold
urban_data.describe()

Unnamed: 0,longitude,latitude,normalized_light_intensity
count,30.0,30.0,30.0
mean,138.226253,36.769305,0.425668
std,2.961285,3.003967,0.149639
min,132.537503,34.220833,0.300467
25%,135.501044,34.670833,0.338119
50%,139.700003,35.677083,0.375986
75%,140.940628,39.333333,0.450831
max,141.508336,43.054166,1.0


#### Philippines

##### Light Intensity Normalization

In [6]:
# Input: extracted VIIRS light-intensity CSV for the Philippines.
# Output: the same rows plus a min-max normalized column.
input_csv = "datasets/inputs/Philippines_light_intensity.csv"
output_normalized_csv = "datasets/inputs/Philippines_light_intensity_normalized.csv"

# Loading the extracted data
data = pd.read_csv(input_csv)

# Compute the min-max bounds once and reuse them below
intensity_min = data['light_intensity'].min()
intensity_range = data['light_intensity'].max() - intensity_min

# A zero range (constant column) or a NaN range (empty / all-NaN column) would
# turn every normalized value into NaN, and the later `>= threshold` filter
# would then silently keep no rows. Fail loudly instead.
if not intensity_range > 0:
    raise ValueError(
        f"Cannot min-max normalize 'light_intensity' from {input_csv}: "
        "the column is empty, all-NaN, or constant."
    )

# Normalizing the light intensity column to the [0, 1] interval
data['normalized_light_intensity'] = (data['light_intensity'] - intensity_min) / intensity_range

# Save the normalized data to a new CSV (index omitted)
data.to_csv(output_normalized_csv, index=False)
print(f"Normalized VIIRS data saved to {output_normalized_csv}")

Normalized VIIRS data saved to datasets/inputs/Philippines_light_intensity_normalized.csv


##### High-Radiance Zones Extraction

In [7]:
# Source (normalized pixels) and destination (high-radiance subset) for the Philippines
normalized_csv = "datasets/inputs/Philippines_light_intensity_normalized.csv"
urban_output_csv = "datasets/processed/Philippines_urban_light_intensity.csv"

# Read the normalized table and discard the raw 'light_intensity' column
normalized_data = pd.read_csv(normalized_csv).drop(columns='light_intensity')

# Rows at or above this normalized intensity are kept as high-radiance zones
threshold = 0.3

# Build the boolean mask first, then select the matching rows
is_high_radiance = normalized_data['normalized_light_intensity'].ge(threshold)
urban_data = normalized_data.loc[is_high_radiance]

# Write the subset to disk without the DataFrame index
urban_data.to_csv(urban_output_csv, index=False)
print(f"Urban areas data saved to {urban_output_csv}")

Urban areas data saved to datasets/processed/Philippines_urban_light_intensity.csv


In [8]:
# Preview the first rows of the Philippines high-radiance subset
urban_data.head()

Unnamed: 0,longitude,latitude,normalized_light_intensity
3549238,120.100002,16.125,0.361303
4522957,120.966669,14.754166,0.3546
4522958,120.970836,14.754166,0.367698
4617666,121.054169,14.620833,0.307261
4620602,120.958336,14.616666,0.513001


In [9]:
# Summary statistics for the Philippines subset; the minimum of
# normalized_light_intensity should not fall below the 0.3 threshold
urban_data.describe()

Unnamed: 0,longitude,latitude,normalized_light_intensity
count,152.0,152.0,152.0
mean,121.17248,14.2341,0.411548
std,0.719162,1.154185,0.121967
min,120.100002,6.904166,0.300249
25%,120.983336,14.516666,0.330265
50%,121.004169,14.541666,0.366314
75%,121.050002,14.580208,0.456612
max,124.662502,16.125,1.0


#### Taiwan

##### Light Intensity Normalization

In [10]:
# Input: extracted VIIRS light-intensity CSV for Taiwan.
# Output: the same rows plus a min-max normalized column.
input_csv = "datasets/inputs/Taiwan_light_intensity.csv"
output_normalized_csv = "datasets/inputs/Taiwan_light_intensity_normalized.csv"

# Loading the extracted data
data = pd.read_csv(input_csv)

# Compute the min-max bounds once and reuse them below
intensity_min = data['light_intensity'].min()
intensity_range = data['light_intensity'].max() - intensity_min

# A zero range (constant column) or a NaN range (empty / all-NaN column) would
# turn every normalized value into NaN, and the later `>= threshold` filter
# would then silently keep no rows. Fail loudly instead.
if not intensity_range > 0:
    raise ValueError(
        f"Cannot min-max normalize 'light_intensity' from {input_csv}: "
        "the column is empty, all-NaN, or constant."
    )

# Normalizing the light intensity column to the [0, 1] interval
data['normalized_light_intensity'] = (data['light_intensity'] - intensity_min) / intensity_range

# Save the normalized data to a new CSV (index omitted)
data.to_csv(output_normalized_csv, index=False)
print(f"Normalized VIIRS data saved to {output_normalized_csv}")

Normalized VIIRS data saved to datasets/inputs/Taiwan_light_intensity_normalized.csv


##### High-Radiance Zones Extraction

In [11]:
# Source (normalized pixels) and destination (high-radiance subset) for Taiwan
normalized_csv = "datasets/inputs/Taiwan_light_intensity_normalized.csv"
urban_output_csv = "datasets/processed/Taiwan_urban_light_intensity.csv"

# Read the normalized table and discard the raw 'light_intensity' column
normalized_data = pd.read_csv(normalized_csv).drop(columns='light_intensity')

# Rows at or above this normalized intensity are kept as high-radiance zones
threshold = 0.3

# Build the boolean mask first, then select the matching rows
is_high_radiance = normalized_data['normalized_light_intensity'].ge(threshold)
urban_data = normalized_data.loc[is_high_radiance]

# Write the subset to disk without the DataFrame index
urban_data.to_csv(urban_output_csv, index=False)
print(f"Urban areas data saved to {urban_output_csv}")

Urban areas data saved to datasets/processed/Taiwan_urban_light_intensity.csv


In [12]:
# Preview the first rows of the Taiwan high-radiance subset
urban_data.head()

Unnamed: 0,longitude,latitude,normalized_light_intensity
245570,121.750002,25.15,0.324514
256488,121.241669,25.1,0.326211
257407,121.237502,25.095833,0.386666
257408,121.241669,25.095833,0.447414
257409,121.245836,25.095833,0.320626


In [13]:
# Summary statistics for the Taiwan subset; the minimum of
# normalized_light_intensity should not fall below the 0.3 threshold
urban_data.describe()

Unnamed: 0,longitude,latitude,normalized_light_intensity
count,285.0,285.0,285.0
mean,120.575105,23.443318,0.36117
std,0.483176,0.974436,0.086443
min,120.179169,22.491666,0.300001
25%,120.225002,22.6375,0.315573
50%,120.329169,22.995833,0.335162
75%,120.937502,24.145833,0.370983
max,121.750002,25.15,1.0


#### Combining

In [14]:
# Per-country high-radiance subsets to merge into a single table
urban_csvs = [
    "datasets/processed/Japan_urban_light_intensity.csv",
    "datasets/processed/Philippines_urban_light_intensity.csv",
    "datasets/processed/Taiwan_urban_light_intensity.csv"
]

# Destination of the merged table
combined_csv = "datasets/processed/combined_urban_light_intensity.csv"

# Read every file and tag its rows with a 'country' value taken from the file
# name: the part of the last path segment that precedes the first underscore
urban_dfs = [
    pd.read_csv(path).assign(country=path.rsplit("/", 1)[-1].partition("_")[0])
    for path in urban_csvs
]

# Stack the per-country frames under a fresh 0..n-1 index
combined_df = pd.concat(urban_dfs, ignore_index=True)

# Write the merged table to disk without the index
combined_df.to_csv(combined_csv, index=False)
print(f"Combined urban light intensity data saved to {combined_csv}")

Combined urban light intensity data saved to datasets/processed/combined_urban_light_intensity.csv


### Ookla Data

#### Aggregate Metrics for Urban Regions

In [15]:
# File paths
ookla_csv = "datasets/inputs/Global_internet_combined_tiles.csv"
urban_light_csv = "datasets/processed/combined_urban_light_intensity.csv"
output_cleaned_csv = "datasets/processed/cleaned_urban_internet_data.csv"

# Load Ookla data as GeoDataFrame; the 'geometry' column is parsed from WKT text
# NOTE(review): the saved run shows a warning pointing at this read_csv call
# (presumably pandas' mixed-dtype DtypeWarning) - consider passing explicit dtypes.
ookla_data = pd.read_csv(ookla_csv)
ookla_gdf = gpd.GeoDataFrame(
    ookla_data,
    geometry=gpd.GeoSeries.from_wkt(ookla_data['geometry']),
    crs="EPSG:4326"
) # type: ignore

# Remove any 'index_right' column already present in the input before joining,
# since geopandas.sjoin adds its own 'index_right' column to the result.
# errors='ignore' keeps this cell runnable when the input CSV has no such column
# (the previous unconditional drop raised KeyError in that case).
ookla_gdf = ookla_gdf.drop(columns=['index_right'], errors='ignore')

# Load and convert urban light intensity data to GeoDataFrame (one point per row)
urban_light_data = pd.read_csv(urban_light_csv)
urban_light_gdf = gpd.GeoDataFrame(
    urban_light_data,
    geometry=gpd.points_from_xy(urban_light_data['longitude'], urban_light_data['latitude']),
    crs="EPSG:4326"
) # type: ignore

# Perform spatial join to filter Ookla data for urban regions: keep only the
# Ookla geometries that intersect at least one high-radiance point
urban_internet_data = gpd.sjoin(ookla_gdf, urban_light_gdf, how="inner", predicate="intersects")

# Edit the records with shapeName "Republic Of China" to be "Taiwan"
urban_internet_data.loc[urban_internet_data['shapeName'] == "Republic Of China", 'shapeName'] = "Taiwan"

# Change the name of shapeName column to country
urban_internet_data = urban_internet_data.rename(columns={'shapeName': 'country'})

# Filter only relevant columns for the research scope
urban_internet_data_filtered = urban_internet_data[[
    'country', 'avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'tests', 'quarter', 'geometry', 'normalized_light_intensity'
]]

# Save the cleaned dataset
urban_internet_data_filtered.to_csv(output_cleaned_csv, index=False)
print(f"Cleaned dataset saved to {output_cleaned_csv}")


  ookla_data = pd.read_csv(ookla_csv)


Cleaned dataset saved to datasets/processed/cleaned_urban_internet_data.csv


In [16]:
# Preview the first rows of the Ookla records kept by the spatial join
urban_internet_data_filtered.head()

Unnamed: 0,country,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,quarter,geometry,normalized_light_intensity
3819,The Philippines,61541,56224,13,1,1,"POLYGON ((120.09705 16.13026, 120.10254 16.130...",0.361303
18235,The Philippines,46688,33207,28,172,1,"POLYGON ((120.96497 14.75895, 120.97046 14.758...",0.3546
18238,The Philippines,42159,35749,19,105,1,"POLYGON ((120.97046 14.75895, 120.97595 14.758...",0.367698
18820,The Philippines,42853,49297,14,915,1,"POLYGON ((120.95947 14.62079, 120.96497 14.620...",0.327973
18826,The Philippines,37685,32161,15,1634,1,"POLYGON ((120.95947 14.61548, 120.96497 14.615...",0.349992


### Annual Data Consolidation

In [17]:
output_annual_csv = "datasets/processed/annual_urban_internet_data.csv"

# How each column is collapsed when the quarterly rows of one tile are merged:
# the Ookla metrics are averaged, the light intensity keeps its first value
annual_aggregations = {
    'avg_d_kbps': 'mean',
    'avg_u_kbps': 'mean',
    'avg_lat_ms': 'mean',
    'tests': 'mean',
    'normalized_light_intensity': 'first',
}

# Column order of the consolidated table
annual_column_order = [
    'country', 'normalized_light_intensity',
    'avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'tests', 'geometry',
]

# One row per (geometry, country) pair, with the columns reordered afterwards
annual_data = (
    urban_internet_data_filtered
    .groupby(['geometry', 'country'], as_index=False)
    .agg(annual_aggregations)
    .loc[:, annual_column_order]
)

# Write the consolidated table to disk without the index
annual_data.to_csv(output_annual_csv, index=False)
print(f"Annual consolidated dataset saved to {output_annual_csv}")

Annual consolidated dataset saved to datasets/processed/annual_urban_internet_data.csv


In [18]:
# Preview the first rows of the consolidated (one row per geometry and country) table
annual_data.head()

Unnamed: 0,country,normalized_light_intensity,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,geometry
0,The Philippines,0.362867,64757.75,55375.25,21.75,71.5,"POLYGON ((122.07458 6.90461, 122.08008 6.90461..."
1,The Philippines,0.482296,89848.0,44673.5,15.0,3.0,"POLYGON ((121.05286 13.67801, 121.05835 13.678..."
2,The Philippines,0.347055,4855.0,8136.0,84.0,1.0,"POLYGON ((120.5365 14.42936, 120.54199 14.4293..."
3,The Philippines,0.327501,11486.5,6642.0,11.5,7.0,"POLYGON ((120.59692 14.5251, 120.60242 14.5251..."
4,The Philippines,0.351483,1147.0,1644.0,26.0,2.0,"POLYGON ((120.60242 14.5251, 120.60791 14.5251..."


In [19]:
# Summary statistics of the numeric columns in the consolidated table
annual_data.describe()

Unnamed: 0,normalized_light_intensity,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests
count,323.0,323.0,323.0,323.0,323.0
mean,0.374875,81846.547214,48536.165635,12.318111,452.670537
std,0.101195,40173.341444,27314.596965,10.083002,650.35052
min,0.300001,1147.0,260.5,2.0,1.0
25%,0.318378,55489.5,35552.0,8.0,46.75
50%,0.344287,78841.0,44973.0,9.25,245.75
75%,0.385314,108732.125,55210.625,13.125,504.0
max,1.0,322379.0,299586.0,95.0,3506.5


In [20]:
output_aggregated_csv = "datasets/processed/urban_internet_metrics.csv"

# Metrics that are averaged for every country
metric_columns = ['avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms']

# Group the consolidated tiles by the `country` column
grouped = annual_data.groupby('country')

# Mean of each metric per country, with `country` restored as a regular column
aggregated_metrics = grouped[metric_columns].mean().reset_index()

# Write the per-country metrics to disk without the index
aggregated_metrics.to_csv(output_aggregated_csv, index=False)
print(f"Aggregated internet metrics saved to {output_aggregated_csv}")

Aggregated internet metrics saved to datasets/processed/urban_internet_metrics.csv


In [21]:
# Display the per-country mean download speed, upload speed and latency
aggregated_metrics

Unnamed: 0,country,avg_d_kbps,avg_u_kbps,avg_lat_ms
0,Japan,87166.819444,86408.319444,20.277778
1,Taiwan,97193.342262,45824.129677,10.153061
2,The Philippines,53371.898318,47158.737003,14.896789
