Notebook for adding electoral district information to geojson

In [1]:
import pandas as pd
import json
import pprint
import math

pp = pprint.PrettyPrinter(indent=4)

In [2]:
#Loading the Statscan Data

statscanData = pd.read_csv("StatsCan_2016_ElecDist.csv")

statscanData.head()

Unnamed: 0.1,Unnamed: 0,PROVINCE,GEO_UID,GEO_ID,NAME,"Population, 2016",Population density,Average age,Average household size,English,French,Non-official languages,Average total income,poverty rate,Non-immigrants,Immigrants
0,,Province,Geo-UID,Geo-ID,Name of Riding,Population in 2016,Population density per squakre km,Average age of the population,Average household size (persons),"English Mother tongue, single response","Fench Mother tongue, single response","Non-official language as mother tongue, single...",Average income among recipients in 2015 ($),% of persons 18 to 64 living in poverty,Number of non-immigrants living in the riding,Number of Immigrants living in the riding
1,0.0,Newfoundland and Labrador,2013A000410001,10001,Avalon,86494,13.4,42.4,2.4,84870,195,480,48083,9.3,84185,1020
2,1.0,Newfoundland and Labrador,2013A000410002,10002,Bonavista--Burin--Trinity,74116,4.4,47.4,2.3,73005,175,370,38924,12.6,72110,695
3,2.0,Newfoundland and Labrador,2013A000410003,10003,Coast of Bays--Central--Notre Dame,77680,2,46.5,2.3,76320,150,485,38258,15,75620,740
4,3.0,Newfoundland and Labrador,2013A000410004,10004,Labrador,27197,0.1,37,2.7,23190,410,3225,56347,6.5,25805,805


In [3]:
statscanData.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339 entries, 0 to 338
Data columns (total 16 columns):
Unnamed: 0                    338 non-null float64
PROVINCE                      339 non-null object
GEO_UID                       339 non-null object
GEO_ID                        339 non-null object
NAME                          339 non-null object
Population, 2016              339 non-null object
Population density            339 non-null object
Average age                   339 non-null object
Average household size        339 non-null object
    English                   339 non-null object
      French                  339 non-null object
    Non-official languages    339 non-null object
    Average total income      339 non-null object
poverty rate                  336 non-null object
  Non-immigrants              339 non-null object
  Immigrants                  339 non-null object
dtypes: float64(1), object(15)
memory usage: 42.5+ KB


In [4]:
#Dataframe - don't need information in first row, delete and clean up the index

statscanData = statscanData[statscanData['PROVINCE'] != 'Province']
statscanData = statscanData.reset_index()
statscanData = statscanData.drop(["index", "Unnamed: 0"], axis=1)

#Rename columns to get rid of blanks
statscanData = statscanData.rename(columns={'      French':'French','    English':'English',\
'    Non-official languages':'Non_official_languages','    Average total income':'Average_total_income',\
'  Non-immigrants':'Non_immigrants','  Immigrants':'Immigrants','Population density ': 'Population_density',\
'Population, 2016':'Population_2016', 'Average household size': 'Average_household_size', 'poverty rate': 'poverty_rate', 'Average age': 'Average_age'})

statscanData.head()



Unnamed: 0,PROVINCE,GEO_UID,GEO_ID,NAME,Population_2016,Population_density,Average_age,Average_household_size,English,French,Non_official_languages,Average_total_income,poverty_rate,Non_immigrants,Immigrants
0,Newfoundland and Labrador,2013A000410001,10001,Avalon,86494,13.4,42.4,2.4,84870,195,480,48083,9.3,84185,1020
1,Newfoundland and Labrador,2013A000410002,10002,Bonavista--Burin--Trinity,74116,4.4,47.4,2.3,73005,175,370,38924,12.6,72110,695
2,Newfoundland and Labrador,2013A000410003,10003,Coast of Bays--Central--Notre Dame,77680,2.0,46.5,2.3,76320,150,485,38258,15.0,75620,740
3,Newfoundland and Labrador,2013A000410004,10004,Labrador,27197,0.1,37.0,2.7,23190,410,3225,56347,6.5,25805,805
4,Newfoundland and Labrador,2013A000410005,10005,Long Range Mountains,86553,2.3,46.5,2.3,84485,565,620,38151,14.9,83755,1190


In [5]:
statscanData.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 15 columns):
PROVINCE                  338 non-null object
GEO_UID                   338 non-null object
GEO_ID                    338 non-null object
NAME                      338 non-null object
Population_2016           338 non-null object
Population_density        338 non-null object
Average_age               338 non-null object
Average_household_size    338 non-null object
English                   338 non-null object
French                    338 non-null object
Non_official_languages    338 non-null object
Average_total_income      338 non-null object
poverty_rate              335 non-null object
Non_immigrants            338 non-null object
Immigrants                338 non-null object
dtypes: object(15)
memory usage: 39.7+ KB


In [6]:
#Adjusting dataframe - percentage for languages and immigrants

for i in range(0, statscanData.shape[0]):
    a = float(statscanData.loc[i, 'Population_2016'])
    b = float(statscanData.loc[i, 'English'])
    c = float(statscanData.loc[i, 'French'])
    d = float(statscanData.loc[i, 'Non_official_languages'])
    e = float(statscanData.loc[i, 'Non_immigrants'])
    f = float(statscanData.loc[i, 'Immigrants'])
    statscanData.at[i, "English"] = b/a
    statscanData.at[i, 'French'] = c/a
    statscanData.at[i, 'Non_official_languages'] = d/a
    statscanData.at[i, 'Immigrants'] = f/a
    statscanData.at[i, 'Non_immigrants'] = e/a
    #print(a, b, c, d, e, f)

statscanData.head()
    


Unnamed: 0,PROVINCE,GEO_UID,GEO_ID,NAME,Population_2016,Population_density,Average_age,Average_household_size,English,French,Non_official_languages,Average_total_income,poverty_rate,Non_immigrants,Immigrants
0,Newfoundland and Labrador,2013A000410001,10001,Avalon,86494,13.4,42.4,2.4,0.981224,0.00225449,0.00554952,48083,9.3,0.973305,0.0117927
1,Newfoundland and Labrador,2013A000410002,10002,Bonavista--Burin--Trinity,74116,4.4,47.4,2.3,0.98501,0.00236116,0.00499217,38924,12.6,0.972934,0.00937719
2,Newfoundland and Labrador,2013A000410003,10003,Coast of Bays--Central--Notre Dame,77680,2.0,46.5,2.3,0.982492,0.001931,0.00624356,38258,15.0,0.973481,0.00952626
3,Newfoundland and Labrador,2013A000410004,10004,Labrador,27197,0.1,37.0,2.7,0.852668,0.0150752,0.118579,56347,6.5,0.948818,0.0295989
4,Newfoundland and Labrador,2013A000410005,10005,Long Range Mountains,86553,2.3,46.5,2.3,0.976107,0.00652779,0.00716324,38151,14.9,0.967673,0.0137488


In [7]:
#Creating a list of dictionaries - contains the GEO_ID as key and the index as value.
#This list can then be used to retrieve infor for the GeoJson

geoid = []

for i in range(0, statscanData.shape[0]):
    dict_key = str(statscanData.loc[i, "GEO_ID"]).strip()
    geoid.append({dict_key: i})
    
#print(geoid)

In [8]:
#Overview of the Properties in the json-file
#Opens the json and looks at the properties - no changes

with open('electoral_districts.geojson', 'r') as read_file:
        data_test = json.load(read_file)

detail_list = data_test["features"]

print(len(detail_list))
#for i in range(len(detail_list)):
#    detail_dict = detail_list[i]
#    properties = (detail_dict["properties"])
#    #print(str(properties.get("FED_NUM")).strip())

#print(properties)


347


In [9]:
#Simplifying the json properties - contains only the FEDNUM (as a string with possible spaces at beginning and end cut off
# to ensure consistency)

for i in range(len(detail_list)):
    detail_dict = detail_list[i]
    properties = detail_dict["properties"]
    a = str(properties.get("FED_NUM")).strip()
    properties["FED_NUM"] = a
    for e in ['FEDNUM', 'FRNAME', 'PROVCODE', 'DECPOPCNT']:
        properties.pop(e)
    #print(properties)

In [11]:
#Looping through the dictionary - rebuild the properties field with information from the dataframe

#Prepwork: Names of the Columns I want to retrieve from the dataframe

value_list = []

columns_retr = ["NAME", "Population_2016", "Population_density", "Average_age", "Average_household_size",\
               "English", "French", "Non_official_languages", "Average_total_income", "poverty_rate", "Non_immigrants",\
                "Immigrants"]

for i in range(len(detail_list)):
    detail_dict = detail_list[i]
    properties = (detail_dict["properties"])
    #print(str(properties.get("FED_NUM")).strip())
    reference = properties['FED_NUM']
    for j in range(len(geoid)):
        #Selecting the correct row in the dataframe
        if reference in geoid[j].keys():
            index = geoid[j][reference]
            print(i, "success", index)
            for k in range(len(columns_retr)):
                s = statscanData.loc[index, columns_retr[k]]
                if 'N' in str(s):
                    print(s, index)
                if type(s) == float:
                    s = round(s, 2)
                if not isinstance(s, str):
                    if math.isnan(s):
                        s = "Na"
                t = columns_retr[k]
                properties[t] = s
                #print(s, t, type(s))
            #print(properties)
        #else:
            #print("failure")

#pp.pprint(detail_list)

0 success 138
Etobicoke North 138
1 success 141
2 success 275
3 success 276
4 success 279
5 success 280
6 success 281
7 success 282
8 success 283
9 success 96
10 success 97
11 success 98
12 success 99
13 success 100
14 success 101
15 success 102
16 success 103
17 success 107
18 success 104
19 success 109
20 success 131
21 success 132
22 success 134
23 success 71
24 success 73
25 success 142
Haldimand--Norfolk 142
26 success 166
27 success 167
28 success 277
29 success 285
30 success 278
31 success 286
32 success 284
33 success 287
34 success 288
35 success 290
36 success 291
37 success 292
38 success 72
39 success 289
40 success 74
41 success 75
42 success 168
43 success 105
44 success 106
45 success 169
46 success 335
47 success 108
48 success 337
Nunavut 337
49 success 336
Northwest Territories 336
50 success 259
51 success 260
52 success 261
53 success 262
54 success 130
55 success 139
56 success 140
57 success 146
58 success 147
59 success 143
60 success 144
61 success 145
62 succe

In [None]:
#print(data_test)

In [12]:
with open('data_upd.geojson', 'w', encoding='utf-8') as f:
          json.dump(data_test, f)