In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw neighbourhood profiles and discard the "Data Source" column
# straight away; the dataset has many fields and this one is not used below.
neighbourhood_df = pd.read_csv("04_neighbourhood-profiles-2016.csv").drop(["Data Source"], axis=1)

In [None]:
# Keep only the rows about neighbourhood information, mother tongue and income,
# plus the population row (_id == 3).
ndf = neighbourhood_df[
    neighbourhood_df['Topic'].isin(['Neighbourhood Information', 'Mother tongue'])
    | (neighbourhood_df['Category'] == 'Income')
    | (neighbourhood_df['_id'] == 3)
]

# From those rows, keep just the specific records of interest, selected by _id.
ndf_filtered = ndf.loc[ndf['_id'].isin([1,3,143,144,145,1110,1111,1112,1113,1114,1115])]

# Drop the bookkeeping columns and transpose, so that every remaining original
# column (including 'Topic' and 'Characteristic') becomes a row.
ndf_filtered2 = ndf_filtered.drop(['_id','Category'],axis=1).T

# The 'Characteristic' row holds the labels that should become the column
# headers. Select it (and remove the two label rows) by name rather than by
# position, so the result does not silently depend on the column order.
new_header = ndf_filtered2.loc['Characteristic']
ndf_filtered2 = ndf_filtered2.drop(index=['Topic', 'Characteristic'])
ndf_filtered2.columns = new_header

# Assigning the header leaves 'Characteristic' as the name of the columns axis.
# Rename that axis directly instead of transposing the frame back and forth.
ndf_filtered2 = ndf_filtered2.rename_axis(columns='Neighbourhood')

# The column labels carry leading and trailing spaces; strip them.
ndf_filtered2.columns = ndf_filtered2.columns.str.strip()

In [None]:
# Names of the income columns (every label matching 'In the'), kept as a plain
# list for later use. Read the labels straight off the filtered view; indexing
# the frame with them again would only make a redundant copy.
money_list = ndf_filtered2.filter(regex = 'In the').columns.tolist()

# Remove the "," characters embedded in the numbers, then convert every column
# to a numeric dtype (pd.to_numeric raises if a value still cannot be parsed).
ndf_filtered2 = ndf_filtered2.replace(r',','',regex = True).apply(pd.to_numeric)

In [None]:
### Collapse the income columns into two broader bands for visualization ###
# low_income     = bottom + second + third decile
# midplus_income = fourth + fifth decile + top half of the distribution
# The detailed income columns listed in money_list are dropped afterwards.
ndf_filtered2 = ndf_filtered2.assign(
    low_income=lambda df: (
        df['In the bottom decile']
        + df['In the second decile']
        + df['In the third decile']
    ),
    midplus_income=lambda df: (
        df['In the fourth decile']
        + df['In the fifth decile']
        + df['In the top half of the distribution']
    ),
).drop(money_list, axis=1)

In [None]:
# Switch the remaining source labels to snake_case names.
ndf_filtered2 = ndf_filtered2.rename(
    columns={
        "Population, 2016": "population_2016",
        "English": "english",
        "French": "french",
        "Non-official languages": "non_official_languages",
    }
)

# Express each count as a share of the 2016 population.
# Note: these are plain ratios (count / population), not multiplied by 100.
ndf_filtered2 = ndf_filtered2.assign(
    percent_low=lambda df: df['low_income'] / df['population_2016'],
    percent_midplus=lambda df: df['midplus_income'] / df['population_2016'],
    percent_english=lambda df: df['english'] / df['population_2016'],
    percent_french=lambda df: df['french'] / df['population_2016'],
    percent_nol=lambda df: df['non_official_languages'] / df['population_2016'],
)

In [None]:
# Snapshot of the column order before reordering (not referenced again in this cell).
ndf_cols = ndf_filtered2.columns.to_list()

# Target layout: identifiers first, then each count next to its ratio column.
new_ndf_cols = [
    'Neighbourhood Number',
    'population_2016',
    'english',
    'percent_english',
    'french',
    'percent_french',
    'non_official_languages',
    'percent_nol',
    'low_income',
    'percent_low',
    'midplus_income',
    'percent_midplus',
]

# Apply the new column order, then skip the first row by position
# (the City of Toronto row) so only the remaining rows are kept.
ndf_filtered2 = ndf_filtered2[new_ndf_cols].iloc[1:]

In [None]:
# Convert 'Neighbourhood Number' from float to str, going through int first so
# the text has no trailing ".0".
# The frame at this point comes from a row slice, so assigning a column onto it
# directly may trigger pandas' SettingWithCopyWarning. assign() builds a new
# frame instead and keeps the column in its current position.
ndf_filtered2 = ndf_filtered2.assign(
    **{'Neighbourhood Number': ndf_filtered2['Neighbourhood Number'].astype(int).astype(str)}
)

In [None]:
# Export to .csv. The current index is first turned into a regular column
# (reset_index names it "index") and renamed to "Neighbourhood".
ndf_filtered2 = ndf_filtered2.reset_index().rename(columns={"index":"Neighbourhood"})

# Give the fresh positional index an empty name; it is not written to the file
# because index=False is passed below.
ndf_filtered2.index.name = ""
ndf_filtered2.to_csv("05_neighbourhood-profiles_cleaned.csv", index=False)