# Preprocessing Ookla Speed Test Data

- Filter for fibre speed test
- Aggregate the filtered data into municipalities

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import boto3

# Apply seaborn's default theme so that every figure drawn below shares one consistent style
sns.set()

In [None]:
# Instantiate the S3 client.
# Credentials are deliberately NOT written in the notebook: boto3 resolves them from its
# default credential chain (the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables, the shared ~/.aws/credentials file, or an attached IAM role).
# NOTE(review): the access key pair that used to be hardcoded in this cell must be treated
# as compromised - revoke it in IAM and issue a new one.
client = boto3.client('s3')

In [None]:
# Name of the S3 bucket that the preprocessed output is uploaded to
bucket = '2207-17-fibre-competitive-intensity-model-b'

In [None]:
# URL of the raw, municipality-joined Ookla speed test CSV stored in the S3 bucket
speedtest_file_path = (
    'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com'
    '/Raw+Data/joined-ookla_data_municipality_final.csv'
)

In [None]:
# Read the raw speed test CSV into a DataFrame
df_speedtest = pd.read_csv(filepath_or_buffer=speedtest_file_path)

In [None]:
# Preview the first few rows of the speed test data instead of displaying the whole frame
df_speedtest.head()

In [None]:
# Print the metadata of the speed test data: column names, dtypes, non-null counts and memory usage
df_speedtest.info()

In [None]:
# Count the missing values in every column
df_speedtest.isna().sum()

There are no missing values in the data set

Emalahleni municipality exists in two different districts, and both municipalities are valid. To avoid any problems during analysis, we will attach the district name to each of them. This will also be helpful when merging with other datasets.

In [None]:
# Disambiguate the two Emalahleni municipalities by appending their district name
is_emalahleni = df_speedtest['MUNICNAME'] == 'Emalahleni'
for district in ('Nkangala', 'Chris Hani'):
    in_district = df_speedtest['DISTRICT_N'] == district
    df_speedtest.loc[is_emalahleni & in_district, 'MUNICNAME'] = f'Emalahleni {district}'

### Filtering Fiber Speed Test

The speed test information is on fixed broadband. According to [statista](https://www.statista.com/statistics/1346082/fixed-internet-subscriptions-in-south-africa-by-technology-type/#:~:text=Most%20of%20the%20fixed%20internet,the%20internet%20using%20that%20means.), this may include fiber, DSL(ADSL and VDSL), and other fixed broadband. With other fixed broadband having the poorest speed test and fiber having the best speed test.

Digital Subscriber Line (DSL) according to [spiceworks](https://www.spiceworks.com/tech/networking/articles/digital-subscriber-line/#:~:text=A%20digital%20subscriber%20line%20or,close%20to%20the%20DSL%20provider.), uses the voice frequency of telephone lines to send and receive internet data. There are several types of DSL but for the scope of this project, the team will only require information on ADSL and VDSL.

In order to successfully filter for fiber speed tests, given that fiber is the fastest among the fixed broadband technologies, the best speed test figures for the other fixed broadband technologies in South Africa are required.

This information can be gotten from [Mybroadband](https://mybroadband.co.za/news/adsl/163958-top-5-adsl-and-vdsl-isp-speeds-in-south-africa.html) with the following summary:

- ADSL has a maximum average download speed of 3863Kbps, a maximum average upload speed of 531Kbps, and a lowest average latency of 51ms
- VDSL has a maximum average download speed of 17390Kbps, a maximum average upload speed of 1967Kbps, and a lowest average latency of 37ms

Given that VDSL is the fastest of all other fixed broadband besides fiber, we will filter for fiber speed test using VDSL values as the threshold.

***Assumption***
- *Fixed broadband speeds are classified as either fiber or VDSL*

In [None]:
# Best average VDSL figures, used as the cut-off above which a tile is treated as fiber
download_threshold = 17_390  # Kbps
upload_threshold = 1_967  # Kbps
latency_threshold = 37  # ms

Latency is significantly affected by distance from the service provider. This means that a low latency does not necessarily mean that the fixed broadband is fiber, and a high latency does not mean that the broadband is not fiber. To avoid these complications, the team will use only upload and download speed as the thresholds for fiber.

In [None]:
# Add a 'fiber' flag column, initialised to 0 (not fiber) for every speed test row
df_speedtest = df_speedtest.assign(fiber=0)

In [None]:
# Flag a row as fiber (1) only when BOTH its download and upload speed exceed the VDSL thresholds
exceeds_download = df_speedtest['avg_d_kbps'] > download_threshold
exceeds_upload = df_speedtest['avg_u_kbps'] > upload_threshold
df_speedtest.loc[exceeds_download & exceeds_upload, 'fiber'] = 1

In [None]:
# Rows flagged as fiber (download and upload speed both above the thresholds)
df_fiber_du = df_speedtest.loc[df_speedtest['fiber'].eq(1)]

In [None]:
# Rows not flagged as fiber (download and/or upload speed at or below the thresholds)
df_non_fiber_du = df_speedtest.loc[df_speedtest['fiber'].eq(0)]

In [None]:
# Number of distinct municipalities in the fiber frame, then in the non fiber frame
for frame in (df_fiber_du, df_non_fiber_du):
    print(frame['MUNICNAME'].nunique())

We can observe 208 unique municipalities in the non fiber dataframe. This is because a municipality that has been identified as having fiber can still contain tiles that do not have fiber, so it appears in both dataframes.

In [None]:
# Get descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of the fiber data
df_fiber_du.describe()

In [None]:
# Draw one kernel density estimate per speed test metric of the fiber tiles, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 3))

fiber_panels = [
    ('avg_d_kbps', "Average Download Speed"),
    ('avg_u_kbps', "Average Upload Speed"),
    ('avg_lat_ms', "Average Latency"),
]
for ax, (column, title) in zip(axes, fiber_panels):
    sns.kdeplot(df_fiber_du[column], ax=ax)
    ax.set_title(title)

plt.show()

Some municipalities contain tiles without fiber as well as tiles with fiber, so these municipalities can be found in both the fiber and non fiber dataframes. To ensure no municipality is duplicated, we need to remove the municipalities that have fiber from the non fiber dataframe.

In [None]:
# Names of every municipality present in the fiber frame, in order of first appearance
munic_with_fiber = df_fiber_du['MUNICNAME'].drop_duplicates().to_numpy()
# Index the non fiber rows by municipality name so that they can be selected by label
df_non_fiber_du = df_non_fiber_du.set_index(keys='MUNICNAME')

In [None]:
# Remove municipalities that can be found in both fiber and non fiber dataframes.
# The frame is indexed by municipality name at this point, so a single vectorised
# membership test replaces the per-municipality loop (which rebuilt the index array and
# dropped rows in place on every iteration). Row order of the remaining rows is unchanged.
has_fiber = df_non_fiber_du.index.isin(munic_with_fiber)
df_non_fiber_du = df_non_fiber_du[~has_fiber]

In [None]:
# Turn the municipality name back into a regular column, then count the distinct
# municipalities left in the non fiber dataframe
df_non_fiber_du = df_non_fiber_du.reset_index(drop=False)
df_non_fiber_du['MUNICNAME'].nunique(dropna=False)

A right-skewed distribution is observed for the fiber speed tests.

The number of tiles for some local municipalities is sufficient to aggregate the speed tests using the average in order to normalize the data (**Central limit theorem**). Applying the central limit theorem will help in normalizing the irregularities.

Aggregation of the speed test data will be applied in the following ways:

- The feature "devices" will be used to compute the number of fiber users in each municipality. This feature will be summed up for each municipality to get the total number of fiber users for that municipality

- The feature "number_of_tiles" will be aggregated from the number of occurrences of each municipality

- The feature "avg_d_kbps" shows the average download speed for each tile within a municipality. This feature will be averaged to get the average download speed for each municipality. Averaging this feature should approximate a normal distribution.

- The feature "avg_u_kbps" shows the average upload speed for each tile within a municipality. This feature will be averaged to get the average upload speed for each municipality. Averaging this feature should approximate a normal distribution.

- The feature "avg_lat_ms" shows the average latency for each tile within a municipality. This feature will be averaged to get the average latency for each municipality. Averaging this feature should approximate a normal distribution.

### Aggregate Speed Tests

In [None]:
# Count the tiles (quadkeys) per municipality, keeping its category and district alongside
tile_group_keys = ['MUNICNAME', 'CAT2', 'DISTRICT_N']
total_tiles_f = (
    df_fiber_du
    .groupby(tile_group_keys)
    .agg(total_tiles=('quadkey', 'count'))
    .reset_index()
)
total_tiles_nf = (
    df_non_fiber_du
    .groupby(tile_group_keys)
    .agg(total_tiles=('quadkey', 'count'))
    .reset_index()
)

In [None]:
# Average the per-tile speed test metrics (and the fiber flag) within each municipality
speed_columns = ['avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'fiber']
avg_speedtest_f = df_fiber_du.groupby('MUNICNAME', as_index=False)[speed_columns].mean()
avg_speedtest_nf = df_non_fiber_du.groupby('MUNICNAME', as_index=False)[speed_columns].mean()

In [None]:
# Sum the number of devices over all tiles of each municipality
total_devices_f = df_fiber_du.groupby('MUNICNAME', as_index=False)[['devices']].sum()
total_devices_nf = df_non_fiber_du.groupby('MUNICNAME', as_index=False)[['devices']].sum()

In [None]:
# Join the averaged speeds, device totals and tile counts on the municipality name
df_preprocessed_f = (
    avg_speedtest_f
    .merge(total_devices_f, on='MUNICNAME')
    .merge(total_tiles_f, on='MUNICNAME')
)
df_preprocessed_nf = (
    avg_speedtest_nf
    .merge(total_devices_nf, on='MUNICNAME')
    .merge(total_tiles_nf, on='MUNICNAME')
)

In [None]:
# Stack the fiber and non fiber municipality tables into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both inputs
# contribute their own 0-based labels, leaving duplicate index values in the result
# (and in the index column of the exported CSV).
df_preprocessed = pd.concat([df_preprocessed_f, df_preprocessed_nf], axis=0, ignore_index=True)

In [None]:
# Plot the distribution of the municipality-level (aggregated) speed test metrics.
# df_preprocessed holds both the fiber and the non fiber municipalities.
fig, axes = plt.subplots(1, 3, figsize=(15, 3))

preprocessed_panels = [
    ('avg_d_kbps', "Average Download Speed"),
    ('avg_u_kbps', "Average Upload Speed"),
    ('avg_lat_ms', "Average Latency"),  # title previously misspelled as "Latence"
]
for ax, (column, title) in zip(axes, preprocessed_panels):
    sns.kdeplot(df_preprocessed[column], ax=ax)
    ax.set_title(title)

plt.show()

In [None]:
# Write the preprocessed table to a local CSV file (the DataFrame index is written as the first column)
output_filename = "preprocessed_municipal_speedtest.csv"
df_preprocessed.to_csv(output_filename)

# Push that local file to the S3 bucket under the "Preprocessed Data/" prefix
client.upload_file(output_filename, Bucket=bucket, Key=f"Preprocessed Data/{output_filename}")