# Preprocess Data for Algorithm Development

In [None]:
# Import the relevant libraries
import numpy as np
import pandas as pd
import boto3

In [None]:
# Create the S3 client using boto3's default credential chain (environment
# variables such as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, the shared
# ~/.aws/credentials file, or an attached IAM role).
# SECURITY: an access key pair was previously hardcoded in this cell. Treat it
# as compromised and rotate it; never commit credentials to a notebook.
client = boto3.client('s3')

In [None]:
# Name of the S3 bucket that receives the modelling dataset at the end of the notebook
bucket = '2207-17-fibre-competitive-intensity-model-b'

In [None]:
# HTTPS URLs of the two preprocessed input files; both live under the same
# "Preprocessed Data" prefix of the project bucket (eu-west-1).
preprocessed_base_url = 'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com/Preprocessed+Data'
census_file_path = preprocessed_base_url + '/Cleaned_2011_SA_Census_Sample.dta'
uptake_file_path = preprocessed_base_url + '/SA-municipality-uptake-rate.csv'

In [None]:
# Read the census sample (Stata .dta) and the municipality uptake rates (CSV)
# directly from their URLs.
df_census = pd.read_stata(filepath_or_buffer=census_file_path)
df_uptake = pd.read_csv(filepath_or_buffer=uptake_file_path)

In [None]:
# List the column labels of the census data to see which features are available
df_census.columns

In [None]:
# Show the first five census records as a quick sanity check of the load
df_census.head(5)

In [None]:
# Show the first five uptake-rate records as a quick sanity check of the load
df_uptake.head(5)

The municipality names in the census data and the uptake rate data have some discrepancies: some census municipalities have since been disestablished and no longer appear in the uptake rate data, and some municipality names have changed. The names therefore need to be aligned so that the two dataframes can be merged on municipality.

In [None]:
# Two different municipalities share the name 'emalahleni'. Disambiguate them
# by district so that each one matches its census counterpart.
is_emalahleni = df_uptake['municipality'].eq('emalahleni')
for district_name, census_name in (('Nkangala', 'emalahleni-mp'), ('Chris Hani', 'emalahleni-ec')):
    in_district = df_uptake['DISTRICT_N'].eq(district_name)
    df_uptake.loc[is_emalahleni & in_district, 'municipality'] = census_name

In [None]:
# Census municipality name (lower-case) -> name it should be replaced with.
# Several census names collapse into the same replacement name.
# NOTE(review): 'naledi-fs' and 'naledi-nw' both map to 'naledi'; confirm that
# merging them is intended for the uptake data.
mun_name_map = {
    'emnambithi/ladysmith': 'alfred duma',
    'indaka': 'alfred duma',
    'the big 5 false bay': 'big five hlabisa',
    'hlabisa': 'big five hlabisa',
    'albert luthuli': 'chief albert luthuli',
    'umjindi': 'city of mbombela',
    'mbombela': 'city of mbombela',
    'mier': 'dawid kruiper',
    '//khara hais': 'dawid kruiper',
    'engcobo': 'dr ab xuma',
    'camdeboo': 'dr beyers naude',
    'ikwezi': 'dr beyers naude',
    'baviaans': 'dr beyers naude',
    'ingwe': 'dr nkosazana dlamini zuma',
    'kwa sani': 'dr nkosazana dlamini zuma',
    'pixley ka seme': 'dr pixley ka isaka seme',
    'tsolwana': 'enoch mgijima',
    'inkwanca': 'enoch mgijima',
    'lukanji': 'enoch mgijima',
    'fetakgomo': 'fetakgomo tubatse',
    'greater tubatse': 'fetakgomo tubatse',
    'ga-segonyane': 'ga-segonyana',
    'umtshezi': 'inkosi langalibalele',
    'imbabazane': 'inkosi langalibalele',
    'ventersdorp': 'jb marks',
    'tlokwe city council': 'jb marks',
    'madibeng': 'local municipality of madibeng',
    'mookgopong': 'modimolle-mookgophong',
    'modimolle': 'modimolle-mookgophong',
    'naledi-fs': 'naledi',
    'naledi-nw': 'naledi',
    'randfontein': 'rand west city',
    'westonaria': 'rand west city',
    'ezingoleni': 'ray nkonyeni',
    'hibiscus coast': 'ray nkonyeni',
    'nkonkobe': 'raymond mhlaba',
    'nxuba': 'raymond mhlaba',
    'sol plaatjie': 'sol plaatje',
    'maletswai': 'walter sisulu',
    'gariep': 'walter sisulu',
}

# Parallel lists (same order as the mapping) under the names the later cells use
old_mun_name = list(mun_name_map.keys())
new_mun_name = list(mun_name_map.values())



In [None]:
# Fail fast if the old/new name lists are misaligned: the replacement below
# pairs them position by position, so unequal lengths would be an error.
assert len(new_mun_name) == len(old_mun_name), (
    f"old_mun_name has {len(old_mun_name)} entries but new_mun_name has {len(new_mun_name)}"
)

In [None]:
# Lower-case the census municipality names, then swap every old name for its
# replacement (old_mun_name[i] -> new_mun_name[i]).
census_munic_lower = df_census['H_MUNIC'].str.lower()
df_census['H_MUNIC'] = census_munic_lower.replace(to_replace=old_mun_name, value=new_mun_name)

In [None]:
# Municipalities present in the uptake rate dataset but missing from the census dataset.
# The census names are lower-cased and collected into a set once, instead of
# being recomputed for every uptake row.
census_municipalities = set(df_census['H_MUNIC'].str.lower().unique())
[name for name in df_uptake['municipality'] if name not in census_municipalities]

The "Collins Chabane" municipality has no match in the 2011 census data because it was only created in 2016, from portions of the Thulamela and Makhado municipalities.

In [None]:
# Municipalities present in the census dataset but missing from the uptake rate dataset.
# The uptake names are collected into a set once, instead of rebuilding a list
# for every census municipality.
uptake_municipalities = set(df_uptake['municipality'])
[name for name in df_census['H_MUNIC'].str.lower().unique() if name not in uptake_municipalities]

These three municipalities that were not found in the speed test data were disestablished in 2016 and were either integrated into another municipality or broken into multiple municipalities.

In [None]:
# Keep the last 13 columns of the census dataset as the columns of interest.
# .copy() makes this an independent frame, so the column assignments in the
# feature-engineering cells modify df_new_census only and do not raise
# SettingWithCopyWarning.
# NOTE(review): the selection is positional; it depends on the column order of the .dta file.
df_new_census = df_census.iloc[:, -13:].copy()

## Feature Engineering

In [None]:
# Encode the household head's employment status as a binary flag:
# 1 for 'Employed', 0 for every other listed status. Labels that are not keys
# of this mapping become NaN.
# NOTE(review): the 'Household head out of working age scope i.e. ...' key looks
# like a truncated display label; confirm it matches the real category text.
employment_flag_by_status = {
    'Employed': 1,
    'Not economically active': 0,
    'Unemployed': 0,
    'Household head out of working age scope i.e. ...': 0,
    'Discouraged work-seeker': 0,
}
employment_flag = df_new_census['DERH_HH_EMPLOY_STATUS'].map(employment_flag_by_status)
df_new_census['DERH_HH_EMPLOY_STATUS'] = pd.to_numeric(employment_flag)

In [None]:
# Treat a missing employment status as 0
employment_col = 'DERH_HH_EMPLOY_STATUS'
df_new_census[employment_col] = df_new_census[employment_col].fillna(value=0)

In [None]:
# Replace each household geotype label with a numeric weight. Labels that are
# not keys of this mapping become NaN.
# NOTE(review): the rationale for the 0.5 / 1 / 1.5 weights is not recorded here.
geotype_weight = {'Farms': 0.5, 'Urban': 1.5, 'Traditional': 1}
weighted_geotype = df_new_census['H_GEOTYPE'].map(geotype_weight)
df_new_census['H_GEOTYPE'] = pd.to_numeric(weighted_geotype)

In [None]:
# Preview the engineered census features (first rows only, rather than the whole frame)
df_new_census.head()

In [None]:
# Set aside the municipality key together with the two encoded categorical
# features, which are aggregated separately from the numeric features.
# This must run before those columns are dropped from df_new_census.
encoded_cat_cols = ['H_MUNIC', 'H_GEOTYPE', 'DERH_HH_EMPLOY_STATUS']
df_census_cat = df_new_census[encoded_cat_cols]

In [None]:
# Drop the categorical/identifier features so that only the municipality key
# and the features to be averaged remain. A new frame is bound to the name
# instead of mutating in place (no inplace=True).
non_mean_cols = ['DERH_HH_EMPLOY_STATUS', 'DERH_XPOP', 'DERH_INCOME_CLASS', 'HHLD_10PERCENT_WGT',
                 'H_DISTRICT', 'H_PROVINCE', 'H_GEOTYPE']
df_new_census = df_new_census.drop(columns=non_mean_cols)

In [None]:
# Average the numeric census features per municipality and turn the group key
# back into a regular column.
# numeric_only=True keeps the historical behaviour of skipping non-numeric
# columns; pandas >= 2.0 would otherwise raise a TypeError if any remain.
grouped_mun_numeric = df_new_census.groupby(['H_MUNIC']).mean(numeric_only=True).reset_index()

In [None]:
# Sum the encoded categorical features per municipality and turn the group key
# back into a regular column.
# NOTE(review): sums grow with the number of sampled households per municipality; confirm this is intended.
cat_by_municipality = df_census_cat.groupby(['H_MUNIC'])
grouped_mun_cat = cat_by_municipality.sum().reset_index()

In [None]:
# Join the summed categorical features with the averaged numeric features,
# one row per municipality (inner join on the municipality key).
df_agg_census = grouped_mun_cat.merge(grouped_mun_numeric, how='inner', on='H_MUNIC')

In [None]:
# Show the first five rows of the aggregated census dataset
df_agg_census.head(5)

In [None]:
# Attach the aggregated census features to every uptake-rate row. The left join
# keeps uptake municipalities that have no census match; their census columns are NaN.
df_cleaned_merge = df_uptake.merge(
    df_agg_census,
    how='left',
    left_on='municipality',
    right_on='H_MUNIC',
)

In [None]:
# Show the first five rows of the merged dataset to confirm the join worked
df_cleaned_merge.head(5)

In [None]:
# Count the missing values in each column of the merged dataset
df_cleaned_merge.isna().sum()

In [None]:
# Write the merged dataset to a local CSV file for modelling.
# NOTE(review): the DataFrame index is written as the first (unnamed) column;
# confirm that downstream readers expect it.
df_cleaned_merge.to_csv(path_or_buf="municipality-data-for-modelling.csv")

# Push the exported file to the project bucket under the "Data for Modeling" prefix
client.upload_file(
    Filename="municipality-data-for-modelling.csv",
    Bucket=bucket,
    Key="Data for Modeling/municipality-data-for-modelling.csv",
)