# Compute Fiber Uptake Rate for Municipalities in South Africa

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import warnings

# Suppress all warnings so they do not clutter the notebook output
warnings.filterwarnings("ignore")
# Use seaborn styles for charts
sns.set()
# Do not truncate the column list when a dataframe is displayed
pd.set_option('display.max_columns', None)

In [None]:
# Instantiate the S3 client WITHOUT embedding credentials in the notebook.
# boto3 resolves credentials through its default chain: the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables, the shared
# ~/.aws/credentials file, or an attached IAM role.
# NOTE: the access key pair that used to be hard-coded on this line has been
# exposed in source control and must be revoked and rotated in AWS IAM.
client = boto3.client('s3')

In [None]:
# S3 bucket name; used as the upload destination when the result is exported
bucket = "2207-17-fibre-competitive-intensity-model-b"

In [None]:
# Build the S3 URLs of the three preprocessed input CSV files.
# All of them live under the same "Preprocessed+Data/" prefix of the bucket.
s3_preprocessed_url = 'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com/Preprocessed+Data/'
file_path_speedtest = s3_preprocessed_url + 'preprocessed_municipal_speedtest.csv'
file_path_municipality_2016 = s3_preprocessed_url + 'cleaned-2016-scraped-local-municipal-data.csv'
file_path_metro_municipality_2016 = s3_preprocessed_url + 'cleaned_scraped-2016-metropolitan-municipal_data_final.csv'

In [None]:
# Load the datasets: each CSV is read straight from its URL into a dataframe
# (speedtest data, local municipality data, metropolitan municipality data)
df_speedtest = pd.read_csv(file_path_speedtest)
df_municipality_2016 = pd.read_csv(file_path_municipality_2016)
df_metro_municipality_2016 = pd.read_csv(file_path_metro_municipality_2016)

In [None]:
# Preview the speedtest data (the bare expression is rendered as the cell output)
df_speedtest

In [None]:
# Preview the first rows of the local municipality data
df_municipality_2016.head()

In [None]:
# Preview the metro municipality data (the bare expression is rendered as the cell output)
df_metro_municipality_2016

In [None]:
# Ensure that the features of the two dataframes match before concatenating.
# The comparison is element-wise by position: a False entry flags a column
# whose label differs between the metro and the local municipality data.
# NOTE(review): this presumably requires both frames to have the same number
# of columns - confirm before relying on it for other inputs.
df_metro_municipality_2016.columns == df_municipality_2016.columns

The three flagged features differ only in their titles; their contents are the same.

In [None]:
# Rename the three metro columns whose labels differ, so that concatenating
# the two dataframes does not produce duplicated features
metro_column_renames = {
    'Dependency Ratio: Per 100 (15-64)': 'dependency ratio:per_100_15_to_64',
    'sex_ratio_males_per_100_females': 'sex ratio: males per 100 females',
    'percent_population_growth_per_annum': 'Population growth Per annum',
}
df_metro_municipality_2016 = df_metro_municipality_2016.rename(columns=metro_column_renames)

In [None]:
# Combine the metropolitan and local municipalities into one dataframe.
# ignore_index=True gives the result a fresh 0..n-1 row index; without it each
# input keeps its own row labels, so the combined frame would carry duplicate
# index values (both inputs were read with the default index).
df_all_municipality_2016 = pd.concat(
    [df_metro_municipality_2016, df_municipality_2016],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the first rows of the combined dataframe to confirm the concatenation worked
df_all_municipality_2016.head()

For the datasets to merge successfully, the irregularities in the municipality names need to be fixed, since the two datasets will be joined on their municipality names.

In [None]:
# Spelling differences between the two datasets, kept as one ordered mapping:
# key   = name as it appears (lower-cased) in the municipality dataset
# value = name as it appears (lower-cased) in the speedtest dataset
municipality_name_map = {
    'big 5 hlabisa': 'big five hlabisa',
    'dr beyers naudé': 'dr beyers naude',
    'kagisano-molopo': 'kagisano/molopo',
    'khai-ma': 'khâi-ma',
    'koukamma': 'kou-kamma',
    'lepelle-nkumpi': 'lepele-nkumpi',
    'madibeng': 'local municipality of madibeng',
    'mahikeng': 'mafikeng',
    'maluti-a-phofung': 'maluti a phofung',
    'umfolozi': 'mfolozi',
    'msunduzi': 'the msunduzi',
    'thembisile hani': 'thembisile',
    'city of umhlathuze': 'umhlathuze',
    'winnie madikizela-mandela': 'mbizana',
    'city of ekurhuleni': 'ekurhuleni',
    'ingquza hill': 'ngquza hill',
    'umsinga': 'msinga',
    'nquthu': 'nqutu',
}
# Municipality names to be replaced in the municipality dataset
mun_name = list(municipality_name_map)
# Replacement names, taken from the speedtest dataset (same order as mun_name)
mun_name_rep = list(municipality_name_map.values())

In [None]:
# Lower-case the municipality names, then replace the spellings listed in
# mun_name with the corresponding speedtest spellings from mun_name_rep
lowercase_names = df_all_municipality_2016['municipality'].str.lower()
df_all_municipality_2016['municipality'] = lowercase_names.replace(mun_name, mun_name_rep)

In [None]:
# Check for municipalities that are in the speed test dataset but are not in the municipality dataset.
# The municipality names are collected into a set once, so each membership
# test is a constant-time lookup instead of rebuilding and scanning a list on
# every iteration. Order and duplicates of the result are unchanged.
municipality_names = set(df_all_municipality_2016['municipality'])
[name for name in df_speedtest['MUNICNAME'].str.lower() if name not in municipality_names]

In [None]:
# Check for municipalities that are in the municipality dataset but are not in the speed test dataset.
# The speedtest names are lower-cased and collected into a set once, instead
# of re-lowering the whole column and rebuilding a list on every iteration.
# Order and duplicates of the result are unchanged.
speedtest_names = set(df_speedtest['MUNICNAME'].str.lower())
[name for name in df_all_municipality_2016['municipality'] if name not in speedtest_names]

In order to capture all municipalities, a left or right join should be performed, given that not every municipality was captured in the Ookla speed test data.

In [None]:
# Lower-case the speedtest municipality names so they compare equal to the
# lower-cased names in the municipality dataframe
df_speedtest['MUNICNAME'] = df_speedtest['MUNICNAME'].str.lower()
# Give the column the same label ('municipality') as in the municipality
# dataframe, so it can serve as the merge key
df_speedtest = df_speedtest.rename(columns={'MUNICNAME': 'municipality'})

In [None]:
# Left-join the speedtest data onto the municipality data so that every
# municipality is kept, including those without a speedtest record
df_merge = df_all_municipality_2016.merge(df_speedtest, how='left', on='municipality')

In [None]:
# Restore the original name 'emalahleni' for both Emalahleni municipalities so
# they can be easily visualized with folium and geopandas
emalahleni_variants = ['emalahleni nkangala', 'emalahleni chris hani']
is_emalahleni = df_merge['municipality'].isin(emalahleni_variants)
df_merge.loc[is_emalahleni, 'municipality'] = 'emalahleni'

In [None]:
# Preview the top 5 rows of the merged dataframe to confirm the merge worked
df_merge.head()

In [None]:
# Check for missing values: count the null entries in each column
df_merge.isnull().sum()

As expected, speed test values are missing for the four municipalities that were not captured. These missing values will be filled with zero (0) since no speed test was performed.

In [None]:
# Fill missing values with 0.
# NOTE(review): fillna(0) is applied to every column of df_merge, not only to
# the speedtest columns - confirm that no other column contains nulls that
# should not become 0.
df_merge = df_merge.fillna(0)

In [None]:
# Set the device count to 0 for municipalities that have no fiber (fiber == 0)
# AND recorded fewer than 5 devices; other rows are left untouched
no_fiber_few_devices = (df_merge['fiber'] == 0) & (df_merge['devices'] < 5)
df_merge.loc[no_fiber_few_devices, 'devices'] = 0

In [None]:
# Remove the two leftover index columns that the merge suffixed with _x / _y
df_merge = df_merge.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])

In [None]:
# Compute the uptake rate as a percentage, using two different denominators
device_counts = df_merge['devices']
# Devices per 100 inhabitants
df_merge['uptake_rate/population'] = device_counts.div(df_merge['population']).mul(100)
# Devices per 100 households
df_merge['uptake_rate/households'] = device_counts.div(df_merge['households']).mul(100)

In [None]:
# Preview the top 5 rows to confirm the two uptake-rate columns were added
df_merge.head()

In [None]:
# Draw the distribution of each raw uptake rate side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 3))

# One (column, title) pair per panel, in left-to-right order
raw_rate_panels = [
    ('uptake_rate/population', "Uptake Rate Per Population"),
    ('uptake_rate/households', "Uptake Rate Per Households"),
]
for ax, (column, title) in zip(axes, raw_rate_panels):
    # Histogram with a kernel density estimate overlaid
    sns.histplot(df_merge[column], ax=ax, kde=True)
    ax.set_title(title, weight='bold', fontsize=16)

plt.show()

A right-skewed distribution is observed for both uptake rates, with the uptake rate per household having the greater spread.

In [None]:
# Re-check the per-column null counts now that the uptake-rate columns exist
df_merge.isnull().sum()

### Uptake Rate Transformation

Given that the Ookla speed tests are recorded only where there is a speed test server, the available dataset does not represent the entire fixed broadband usage across South Africa. In addition, the speed test dataset covers roughly 48K fixed broadband devices, while according to [Mybroadband](https://mybroadband.co.za/news/fibre/482845-south-africas-biggest-fibre-networks.html), about 1.5 million households are connected with fiber. All of this points to the fact that we are dealing with sample data from each municipality.

To ensure we obtain a distribution that is very similar to the population distribution, descriptive statistics of the population, such as its mean and standard deviation, should be known. According to [Statssa](https://www.statssa.gov.za/?p=15473#:~:text=In%202021%2C%20South%20Africa%20had,size%20of%203%2C34%20persons.), South Africa has approximately 18 million households. Also, from the [2021 South Africa General Household Survey - page 52](https://www.statssa.gov.za/publications/P0318/P03182021.pdf), 17.2% of metropolitan households have access to the internet at home, while the figure is just 1.2% for rural areas.

With all the information above, we can make an assumption on the mean and standard deviation of fixed broadband(fiber) in South Africa.

***Assumptions***
- *Using the number of households connected to fiber and the total households in South Africa, we can get the percentage and assume that to be the average fiber uptake in South Africa*
- *Using the urban/rural divide in home internet access for metropolitan and rural households, we can compute the standard deviation about the mean. This standard deviation can be assumed to be the standard deviation of the South Africa fiber uptake rate*

With the current uptake rate from the available dataset, the two metrics used both have a mean and a standard deviation that are less than 1. Having made the above assumptions, the aim is to transform the current distribution of the computed uptake rates so that its mean and standard deviation are the same as those of the population. This transformation will not change the shape of the underlying distribution; rather, it just shifts the mean to the desired position and rescales the spread of the distribution to the assumed standard deviation of the population.

From **linear transformation**: adding a constant to each value in a distribution shifts the mean of the distribution by that constant, while the standard deviation of the distribution remains the same (unchanged). Multiplying each value in a distribution by a constant makes the new mean equal to the product of the constant and the old mean, and the new standard deviation equal to the product of the absolute value of the constant and the old standard deviation. Further reading can be done [here](https://stattrek.com/random-variable/transformation#:~:text=A%20linear%20transformation%20is%20a,the%20variable%20by%20a%20constant.)

To achieve this transformation, both processes will be applied simultaneously.

In [None]:
# Mean and population-style standard deviation (ddof=0) of each uptake metric,
# restricted to the municipalities that have fiber (fiber == 1)
fiber_municipalities = df_merge[df_merge['fiber'] == 1]
pop_rates = fiber_municipalities['uptake_rate/population']
hh_rates = fiber_municipalities['uptake_rate/households']

mean_pop, std_pop = pop_rates.mean(), pop_rates.std(ddof=0)
mean_hh, std_hh = hh_rates.mean(), hh_rates.std(ddof=0)

In [None]:
# Assumed national figures, all in percent.
# population_mean: assumed average fiber uptake (about 1.5 million connected
# households out of roughly 18 million households).
population_mean = 8.3
# Share of households with internet access at home, rural vs metropolitan
average_uptake_rural = 1.2
average_uptake_metropolitan = 17.2

# Deviation of each of the two observations from the assumed mean
rural_deviation = average_uptake_rural - population_mean
metropolitan_deviation = average_uptake_metropolitan - population_mean

# Sample standard deviation of the two observations: the sum of squared
# deviations is divided by n - 1 = 1 because only two data points are
# available (the information is that sparse)
population_stdev = np.sqrt((rural_deviation**2 + metropolitan_deviation**2) / 1)

In [None]:
# Linear transformation new = scale * old + const for each metric.
# Gradient: ratio of the assumed population standard deviation to the
# standard deviation measured in the data
scale_pop, scale_hh = population_stdev / std_pop, population_stdev / std_hh

# Intercept: chosen so that the measured mean maps onto the assumed
# population mean after scaling
const_pop = population_mean - scale_pop * mean_pop
const_hh = population_mean - scale_hh * mean_hh

In [None]:
# Transform the uptake rates for both metrics (scale * rate + const).
# Only rows with fiber == 1 receive a value; the new columns are left unset
# for every other row.
has_fiber = df_merge['fiber'] == 1
df_merge.loc[has_fiber, 'uptake_rate_pop'] = df_merge['uptake_rate/population'] * scale_pop + const_pop
df_merge.loc[has_fiber, 'uptake_rate_hh'] = df_merge['uptake_rate/households'] * scale_hh + const_hh

In [None]:
# Set both transformed uptake rates to 0 for municipalities that have no fiber
# (fiber == 0) and fewer than 5 devices.
# NOTE(review): a row with fiber == 0 but 5 or more devices is matched neither
# here nor by the fiber == 1 transformation, so it would keep a missing value
# in both columns - confirm whether such rows exist.
no_fiber_few_devices = (df_merge['fiber'] == 0) & (df_merge['devices'] < 5)
df_merge.loc[no_fiber_few_devices, 'uptake_rate_pop'] = 0
df_merge.loc[no_fiber_few_devices, 'uptake_rate_hh'] = 0

In [None]:
# Preview the top rows to confirm the transformed uptake-rate columns were added
df_merge.head()

In [None]:
# Draw the distribution of each transformed uptake rate side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 3))

# One (column, title) pair per panel, in left-to-right order
transformed_rate_panels = [
    ('uptake_rate_pop', "Uptake Rate Per Population"),
    ('uptake_rate_hh', "Uptake Rate Per Households"),
]
for ax, (column, title) in zip(axes, transformed_rate_panels):
    # Histogram with a kernel density estimate overlaid
    sns.histplot(df_merge[column], ax=ax, kde=True)
    ax.set_title(title, weight='bold', fontsize=16)

plt.show()

In [None]:
# Write the final dataframe to a CSV file in the working directory
output_csv = "SA-municipality-uptake-rate.csv"
df_merge.to_csv(output_csv)

# Push that file to the S3 bucket under the "Preprocessed Data/" prefix
client.upload_file(output_csv, Bucket=bucket, Key="Preprocessed Data/SA-municipality-uptake-rate.csv")