# Preprocess Ward-Level Data

In [None]:
# Libraries for data loading, data manipulation and S3 access

import numpy as np
import pandas as pd
import boto3

# Show every column when a DataFrame is displayed instead of truncating wide tables
pd.options.display.max_columns = None

In [None]:
# Create the S3 client through boto3's default credential provider chain
# (environment variables, the shared ~/.aws/credentials file, or an attached IAM role).
# SECURITY: access keys must never be embedded in the notebook. The key pair that was
# previously hard-coded here should be treated as compromised and rotated.
client = boto3.client('s3')

In [None]:
# Name of the S3 bucket that the preprocessed files are uploaded to
bucket = '2207-17-fibre-competitive-intensity-model-b'

In [None]:
# HTTPS URL of the raw income CSV stored in the S3 bucket
file_path = (
    'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com'
    '/Raw+Data/2011+Census+Raw+Demographic+Data+Ward+Level+'
    '/Income_SA_2011_census.csv'
)

## Average Income

In [None]:
# Read the income CSV, skipping rows 0-7 and rows 4286-4290 of the raw file
rows_to_skip = list(range(8)) + list(range(4286, 4291))
df_ward_income = pd.read_csv(file_path, skiprows=rows_to_skip, encoding='utf-8')

In [None]:
# Drop the 'Individual monthly income' column, rename 'Unnamed: 1' to 'geography',
# then discard the first and the last row
df_ward_income = df_ward_income.drop(columns='Individual monthly income')
df_ward_income = df_ward_income.rename(columns={'Unnamed: 1': 'geography'})
df_ward_income = df_ward_income.iloc[1:-1]

In [None]:
# Preview the income dataset (the notebook renders the value of this last expression)
df_ward_income

In [None]:
# Collect the income-bracket column names (those containing an 'R') and
# place them in a one-column DataFrame
income_group = [column for column in df_ward_income.columns if 'R' in column]
df_income_group = pd.DataFrame({'income': income_group})

In [None]:
# Split each bracket label on '-': the first piece is the lower bound text and the
# last piece is the upper bound text (a label without '-' yields the same text for both)
bracket_parts = df_income_group['income'].str.split('-')
df_income_group['income_low'] = bracket_parts.str[0]
df_income_group['income_high'] = bracket_parts.str[-1]

In [None]:
# Strip 'R', the phrase 'or more' and all whitespace from both bound columns,
# leaving only the digits
for bound in ('income_low', 'income_high'):
    df_income_group[bound] = df_income_group[bound].map(
        lambda text: "".join(text.replace('R', '').replace('or more', '').split())
    )

In [None]:
# Cast both bound columns from text to integers
df_income_group = df_income_group.astype({'income_low': 'int', 'income_high': 'int'})

In [None]:
# Midpoint of each bracket: half of (lower bound + upper bound)
bound_sum = df_income_group['income_low'] + df_income_group['income_high']
df_income_group['average_income'] = bound_sum / 2

In [None]:
# Multiply every bracket column by that bracket's average income so that the
# ward values in the column become income amounts
for bracket_column, bracket_midpoint in zip(income_group, df_income_group['average_income']):
    df_ward_income[bracket_column] = df_ward_income[bracket_column] * bracket_midpoint

In [None]:
# Remove the columns that carry no specific income and rename 'Total' to 'population'
df_ward_income = (
    df_ward_income
    .drop(columns=['No income', 'Unspecified', 'Not applicable'])
    .rename(columns={'Total': 'population'})
)
# Store the population as integers
df_ward_income = df_ward_income.astype({'population': 'int'})

In [None]:
# Total income per ward: add the bracket columns together one by one, starting from zero
df_ward_income['total_income'] = sum(
    (df_ward_income[bracket] for bracket in income_group),
    0,
)

In [None]:
# Scale the monthly total to a yearly figure (12 months)
df_ward_income['total_income'] = df_ward_income['total_income'].mul(12)

In [None]:
# Average income per ward: total income divided by the ward population
df_ward_income['average_income'] = df_ward_income['total_income'].div(df_ward_income['population'])

In [None]:
# Keep every column except the individual income-bracket columns
is_bracket_column = df_ward_income.columns.isin(income_group)
df_ward_income = df_ward_income.loc[:, ~is_bracket_column]

In [None]:
# Derive two columns from the 'geography' label:
#   ward      - the second whitespace-separated token once 'Ward' has been removed
#   ward_code - the text before the first ':'
geography_labels = df_ward_income['geography']
df_ward_income['ward'] = geography_labels.map(lambda label: label.replace('Ward', '').split()[1])
df_ward_income['ward_code'] = geography_labels.map(lambda label: label.partition(':')[0])

In [None]:
# Preview the processed income table (the notebook renders the value of this last expression)
df_ward_income

In [None]:
# Write the income table to a local CSV file
income_csv = 'ward_income.csv'
df_ward_income.to_csv(income_csv)

# Push the local CSV to the S3 bucket under the "Preprocessed Data" prefix
client.upload_file(Filename=income_csv, Bucket=bucket, Key="Preprocessed Data/ward_income.csv")

### Education

In [None]:
# HTTPS URL of the raw "highest educational level" workbook stored in the S3 bucket
file_path_edu = (
    'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com'
    '/Raw+Data/2011+Census+Raw+Demographic+Data+Ward+Level+'
    '/Highest+educational+level.xls'
)

In [None]:
# Read the education workbook into a DataFrame
df_education = pd.read_excel(io=file_path_edu)

In [None]:
# Drop the 'Highest educational level' column, rename 'Unnamed: 1' to 'geography',
# then discard the first row and the last five rows
df_education = df_education.drop(columns='Highest educational level')
df_education = df_education.rename(columns={'Unnamed: 1': 'geography'})
df_education = df_education.iloc[1:-5]

In [None]:
# Preview the education dataset (the notebook renders the value of this last expression)
df_education

In [None]:
# Column groups to aggregate, selected by position:
#   lower_edu  - from the second column up to (not including) the last eleven, plus 'Other'
#   higher_edu - the six columns starting eleven from the end
education_columns = df_education.columns
lower_edu = [*education_columns[1:-11], 'Other']
higher_edu = education_columns[-11:-5].tolist()

In [None]:
# Collapse the detailed education columns into two totals.
# Each total is built by adding its member columns one by one, starting from zero.
df_education['lower_education'] = sum((df_education[level] for level in lower_edu), 0)
df_education['higher_education'] = sum((df_education[level] for level in higher_edu), 0)

In [None]:
# Columns to express as a share of the 'Total' column
edu_features = ['No schooling', 'lower_education', 'higher_education']

# Replace each selected column by its percentage of 'Total'
ward_totals = df_education['Total']
for feature in edu_features:
    df_education[feature] = df_education[feature].div(ward_totals).mul(100)

In [None]:
# Keep only the geography label and the percentage columns
df_education = df_education[['geography', *edu_features]]

In [None]:
# Derive two columns from the 'geography' label:
#   ward      - the second whitespace-separated token once 'Ward' has been removed
#   ward_code - the text before the first ':'
geography_labels = df_education['geography']
df_education['ward'] = geography_labels.map(lambda label: label.replace('Ward', '').split()[1])
df_education['ward_code'] = geography_labels.map(lambda label: label.partition(':')[0])

In [None]:
# Preview the processed education table (the notebook renders the value of this last expression)
df_education

In [None]:
# Write the education table to a local CSV file
education_csv = 'ward_education.csv'
df_education.to_csv(education_csv)

# Push the local CSV to the S3 bucket under the "Preprocessed Data" prefix
client.upload_file(Filename=education_csv, Bucket=bucket, Key="Preprocessed Data/ward_education.csv")

### Water

In [None]:
# HTTPS URL of the raw piped-water workbook stored in the S3 bucket
file_path_water = (
    'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com'
    '/Raw+Data/2011+Census+Raw+Demographic+Data+Ward+Level+'
    '/Piped+water.xlsx'
)

In [None]:
# Read the piped-water workbook, skipping rows 0-7 and rows 4286-4290 of the raw sheet
rows_to_skip = list(range(8)) + list(range(4286, 4291))
df_piped_water = pd.read_excel(file_path_water, skiprows=rows_to_skip)

In [None]:
# Drop the 'Piped water' column, rename 'Unnamed: 1' to 'geography' and 'Total' to
# 'households', then discard the first and the last row
df_piped_water = df_piped_water.drop(columns='Piped water')
df_piped_water = df_piped_water.rename(columns={'Unnamed: 1': 'geography', 'Total': 'households'})
df_piped_water = df_piped_water.iloc[1:-1]

In [None]:
# Combine the four community-stand distance bands into a single column
community_stand_columns = [
    'Piped (tap) water on community stand: distance less than 200m from dwelling/institution',
    'Piped (tap) water on community stand: distance between 200m and 500m from dwelling/institution',
    'Piped (tap) water on community stand: distance between 500m and 1000m (1km) from dwelling /institution',
    'Piped (tap) water on community stand: distance greater than 1000m (1km) from dwelling/institution',
]
first_band, *other_bands = community_stand_columns
community_stand_total = df_piped_water[first_band]
for band in other_bands:
    community_stand_total = community_stand_total + df_piped_water[band]

# assign() returns a new DataFrame carrying the extra column
df_piped_water = df_piped_water.assign(piped_water_on_community_stand=community_stand_total)


In [None]:
# Columns to express as a share of the 'households' column
water_features = [
    'Piped (tap) water inside dwelling/institution',
    'Piped (tap) water inside yard',
    'No access to piped (tap) water',
    'piped_water_on_community_stand',
]

# Replace each selected column by its percentage of 'households'
household_totals = df_piped_water['households']
for feature in water_features:
    df_piped_water[feature] = df_piped_water[feature].div(household_totals).mul(100)

In [None]:
# Keep only the geography label, the four percentage columns and the household count
kept_columns = [
    'geography',
    'Piped (tap) water inside dwelling/institution',
    'Piped (tap) water inside yard',
    'No access to piped (tap) water',
    'piped_water_on_community_stand',
    'households',
]
df_piped_water = df_piped_water[kept_columns]

In [None]:
# Derive two columns from the 'geography' label:
#   ward      - the second whitespace-separated token once 'Ward' has been removed
#   ward_code - the text before the first ':'
geography_labels = df_piped_water['geography']
df_piped_water['ward'] = geography_labels.map(lambda label: label.replace('Ward', '').split()[1])
df_piped_water['ward_code'] = geography_labels.map(lambda label: label.partition(':')[0])

In [None]:
# Write the piped-water table to a local CSV file
piped_water_csv = 'ward_piped_water.csv'
df_piped_water.to_csv(piped_water_csv)

# Push the local CSV to the S3 bucket under the "Preprocessed Data" prefix
client.upload_file(Filename=piped_water_csv, Bucket=bucket, Key="Preprocessed Data/ward_piped_water.csv")