#### Import necessary dependencies

In [None]:
import pandas as pd
import boto3

#### Read the CSV file

In [None]:
# Location of the raw 2011 municipal census CSV, fetched over HTTPS.
raw_csv_url = 'https://2207-17-fibre-competitive-intensity-model-b.s3.eu-west-1.amazonaws.com/2011-Census-Webscrapped-Demographic-Data/2011-South-Africa-Municipalities.csv'

# Load the file into the working DataFrame used by every later cell.
df = pd.read_csv(raw_csv_url)

#### Show a summary of the DataFrame

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
df.info()

#### Display the first few rows of the DataFrame

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

#### Drop unnamed column. It is a redundant index

In [None]:
# Remove the 'Unnamed: 0' column in place.
df.drop(columns=['Unnamed: 0'], inplace=True)

#### Check for missing values in each column

In [None]:
# Count the missing (null) values in each column.
df.isnull().sum()

#### `Per annum` has zero entries, so it is dropped. The unemployment-rate columns are also dropped for now; if there is a need to impute missing data at a later stage, we will refer to the original file and impute the missing figure(s).

In [None]:
# Columns removed from the working DataFrame at this stage.
columns_to_drop = [
    'Per annum',
    'Unemployment rate (official)',
    'Youth unemployment rate (official) 15-34',
]

# Drop them in place so later cells see the reduced frame.
df.drop(columns=columns_to_drop, inplace=True)

#### Re-check the missing-value counts after dropping the columns

In [None]:
# Recount the missing (null) values in each remaining column.
df.isnull().sum()

#### Show a summary of the DataFrame

In [None]:
# Print each remaining column's name, non-null count and dtype.
df.info()

#### Convert columns to appropriate data types and convert the percentage values to decimal values

In [None]:
# Count columns: remove every character that is not a digit, then store as int.
for count_col in ('Population', 'Households'):
    df[count_col] = df[count_col].replace('[^0-9]', '', regex=True).astype(int)

# Percentage columns: keep only digits and the decimal point, parse as float,
# then divide by 100 so the value is stored as a fraction.
percent_cols = [
    'Population under 15',
    'Population 15 to 64',
    'Population over 65',
    'No schooling',
    'Matric',
    'Higher education',
    'Female headed households',
    'Formal dwellings',
    'Housing owned',
    'Flush toilet connected to sewerage',
    'Weekly refuse removal',
    'Piped water inside dwelling',
    'Electricity for lighting',
]
for pct_col in percent_cols:
    digits_only = df[pct_col].replace('[^0-9.]', '', regex=True)
    df[pct_col] = digits_only.astype(float) / 100.0

# Remaining columns are cast to float with no text cleaning.
# NOTE(review): ' Per 100 (15-64)' is written with a leading space; presumably
# that is how the header appears in the CSV - confirm against the raw file.
for ratio_col in (' Per 100 (15-64)', 'Males per 100 females', 'Average household size'):
    df[ratio_col] = df[ratio_col].astype(float)

#### Show a summary of the DataFrame to confirm dtype conversion

In [None]:

# Print column dtypes again so the int/float conversions can be checked.
df.info()

#### Display the first few rows of the DataFrame to confirm percentage to decimal conversion

In [None]:
# Preview the first rows (head() defaults to five) to inspect the converted values.
df.head()

#### Check for duplicates

In [None]:
# Count rows that exactly repeat an earlier row (first occurrences are not counted).
df.duplicated().sum()

#### Show duplicated rows

In [None]:
# keep=False marks every member of a duplicate group, including the first occurrence.
is_duplicate_row = df.duplicated(keep=False)

# Display all rows that take part in a duplicate group.
df[is_duplicate_row]

#### Keep the first and drop other duplicated rows

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each.
# The surviving rows keep their original index labels (the index is not reset).
df.drop_duplicates(keep='first', inplace=True)

#### Check for duplicates to confirm duplicated row has been dropped

In [None]:
# Recount duplicated rows.
df.duplicated().sum()

#### Rename the columns appropriately. I added "percent_" to the beginning of each column name that represents a percentage of the population.

In [None]:
# Strip surrounding whitespace from every header before applying the mapping.
# The dtype-conversion step addresses one column as ' Per 100 (15-64)' (with a
# leading space); rename() silently ignores keys that do not match exactly, so
# without this normalisation that column would keep its original name.
df = df.rename(columns=str.strip).rename(columns={
    'Municipality': 'municipality',
    'Population': 'population',
    'Population under 15': 'percent_population_under_15',
    'Population 15 to 64': 'percent_population_15_to_64',
    'Population over 65': 'percent_population_over_65',
    'Per 100 (15-64)': 'per_100_15_to_64',
    'Males per 100 females': 'males_per_100_females',
    'No schooling': 'percent_no_schooling',
    'Matric': 'percent_matric',
    'Higher education': 'percent_higher_education',
    'Households': 'households',
    'Average household size': 'average_household_size',
    'Female headed households': 'percent_female_headed_households',
    'Formal dwellings': 'percent_formal_dwellings',
    'Housing owned': 'percent_housing_owned',
    'Flush toilet connected to sewerage': 'percent_flush_toilet_connected_to_sewerage',
    'Weekly refuse removal': 'percent_weekly_refuse_removal',
    'Piped water inside dwelling': 'percent_piped_water_inside_dwelling',
    'Electricity for lighting': 'percent_electricity_for_lighting',
})

#### Strip municipality name strings to municipality names only

In [None]:
# Keep only the text before the first occurrence of 'Local' and trim whitespace.
# The vectorised split leaves a name unchanged (apart from trimming) when it
# does not contain 'Local', and propagates missing values, whereas calling
# str.index() per row raises ValueError on the first such name.
df['municipality'] = (
    df['municipality']
    .str.split('Local', n=1)
    .str[0]
    .str.strip()
)

#### Display the first few rows of the DataFrame to confirm the changes in the two steps above

In [None]:
# Preview the first rows (head() defaults to five) to inspect the current column names and values.
df.head()

#### Write the cleaned dataframe to a new CSV file and save the output in an S3 bucket

In [None]:
# Create the S3 client without embedding credentials in the notebook.
# boto3 resolves them from its default provider chain (environment variables,
# the shared credentials/config files, or an attached IAM role).
# SECURITY: an access key pair was previously hard-coded in this cell; it must
# be treated as compromised and rotated.
client = boto3.client('s3')

# Provide the name of the s3 bucket
bucket = "2207-17-fibre-competitive-intensity-model-b"

# The same name is used for the local file and for the S3 object key.
output_name = "cleaned_scraped-2011-municipal_data_final.csv"

# Convert df to CSV
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; pass index=False if downstream readers do not need it.
df.to_csv(output_name)

# Save CSV file to S3 bucket
client.upload_file(output_name, Bucket=bucket, Key=output_name)