In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Cleaning and Uploading to pgAdmin

Methods for cleaning data includes:

- checkNull(dataset, beenCleaned): check if there's a null value
- removeNull(dataset): drop the null value in the dataset


In [2]:
#Checking if null value exist in dataset
def checkNull( dataset ):
    
    beenCleaned = False    
    for num in list(dataset.isnull().sum()):

        if num != 0:
            print("Null value found. Cleaned null values...")
            dataset = dataset.dropna()
            beenCleaned = True
            
    if beenCleaned:
        print("\nNull values have been removed. You're good to go!")

    else:
        print("\nNo null values in. Good to go, no need for cleaning :)")

    return dataset
    

Functions to upload dataset to pgAdmin

In [11]:
from sqlalchemy import create_engine
import pandas as pd
import psycopg2
import psycopg2.extras
import json
import os

data_path = "."

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        HOST       = db_conn_dict['host']
        DB_USER    = db_conn_dict['user']
        DB_PW      = db_conn_dict['password']
        DEFAULT_DB = db_conn_dict['user']

        try:
            db = create_engine('postgres+psycopg2://'+DB_USER+':'+DB_PW+'@'+HOST+'/'+DEFAULT_DB, echo=False)
            conn = db.connect()
            print('connected')
        except Exception as e:
            print("unable to connect to the database")
            print(e)
            
        return db,conn

credfilepath = os.path.join(data_path, "data2x02_db.json")
db, conn = pgconnect(credfilepath)


connected


## Cleaning and Uploading StatisticalAreas.csv


In [4]:
rawData = pd.read_csv("StatisticalAreas.csv")

#Check if contains NULL
cleanData = checkNull(rawData)

#Cleaning data so that only numeric values are included in area_id
if cleanData['area_id'].dtypes != int:
    print("\nUpdated area_id data type from " + str(cleanData['area_id'].dtypes) + " to integer & removed data with wrong type")
    cleanData = cleanData[cleanData.area_id.str.isnumeric()]
    cleanData['area_id'].astype(int)

#Cleaning data so that only numeric values are included in parent_area_id
if cleanData['parent_area_id'].dtypes != int:
    print("\nUpdated area_id data type from " + str(cleanData['parent_area_id'].dtypes) + " to integer & removed data with wrong type")
    cleanData = cleanData[cleanData.parent_area_id.str.isnumeric()]
    cleanData['parent_area_id'].astype(int)

statisticalAreas_clean = cleanData



No null values in. Good to go, no need for cleaning :)


In [5]:
#Uploading StatisticalAreas.csv to pgAdmin

conn.execute("DROP TABLE IF EXISTS statisticalareas")

statistical_areas = """CREATE TABLE IF NOT EXISTS statisticalareas (
                         area_id   Integer PRIMARY KEY,
                         area_name VARCHAR(20),
                         parent_area_id Integer
                   )"""
conn.execute(statistical_areas)
print("Successfully, created table for statistical areas...")

table_name = "statisticalareas"
statisticalAreas_clean.to_sql(table_name, con=conn, if_exists='replace',index=False)
print ("\nUploaded clean statistical areas data to pgAdmin")
res = pd.read_sql_query("SELECT * FROM statisticalareas",conn)
res

Successfully, created table for statistical areas...

Uploaded clean statistical areas data to pgAdmin


Unnamed: 0,area_id,area_name,parent_area_id
0,1,New South Wales,0
1,10,Greater Sydney,1
2,11,Rest of NSW,1
3,2,Victoria,0
4,20,Greater Melbourne,2
...,...,...,...
429,106,Hunter Valley exc Newcastle,11
430,11102,Lake Macquarie - West,111
431,111,Newcastle and Lake Macquarie,11
432,11402,Southern Highlands,114


## Cleaning and Uploading Neighbourhoods.csv


In [6]:
rawData = pd.read_csv("Neighbourhoods.csv")
#Check if there's null values

noNull_data = checkNull(rawData)
neighbourhoods_clean = noNull_data.copy()
numericData = ['area_id', 'land_area', 'population', 'number_of_dwellings', 'number_of_dwellings', 
               'number_of_businesses', 'median_annual_household_income', 'avg_monthly_rent']

for col in noNull_data.columns:
    
    if col in numericData:        
        if noNull_data[col].dtypes != int and noNull_data[col].dtypes != float:
            neighbourhoods_clean[col] = noNull_data[col].str.replace(',', '')
            neighbourhoods_clean[col] = pd.to_numeric(neighbourhoods_clean[col])   
            
print ("\nCommas in numeric data removed and all numbers as type string converted to numeric values!")

#Uploading Neighbourhoods.csv to pgAdmin

conn.execute("DROP TABLE IF EXISTS neighbourhoods")

neighbourhoods = """CREATE TABLE IF NOT EXISTS neighbourhoods (
                         area_id   Integer PRIMARY KEY,
                         area_name VARCHAR(20),
                         land_area Integer,
                         population Integer,
                         number_of_dwellings Integer,
                         number_of_businesses Integer,
                         median_annual_household_income Integer,
                         avg_monthly_rent Integer
                   )"""
conn.execute(neighbourhoods)
print("\nSuccessfully created neighbourhoods table.")

#Testing if table has been created
table_name = "neighbourhoods"
neighbourhoods_clean.to_sql(table_name, con=conn, if_exists='replace',index=False)

res = pd.read_sql_query('SELECT * FROM neighbourhoods', conn)
res


Null value found. Cleaned null values...
Null value found. Cleaned null values...
Null value found. Cleaned null values...
Null value found. Cleaned null values...

Null values have been removed. You're good to go!

Commas in numeric data removed and all numbers as type string converted to numeric values!

Successfully created neighbourhoods table.


Unnamed: 0,area_id,area_name,land_area,population,number_of_dwellings,number_of_businesses,median_annual_household_income,avg_monthly_rent
0,102011028,Avoca Beach - Copacabana,643.8000,7590,2325,738.0,46996.0,1906.0
1,102011029,Box Head - MacMasters Beach,3208.6000,10986,3847,907.0,42621.0,1682.0
2,102011030,Calga - Kulnura,76795.1000,4841,1575,1102.0,42105.0,1182.0
3,102011031,Erina - Green Point,3379.3000,14237,4450,1666.0,43481.0,1595.0
4,102011032,Gosford - Springfield,1691.2000,19385,6373,2126.0,45972.0,1382.0
...,...,...,...,...,...,...,...,...
304,106011109,Cessnock Region,1570.4341,7931,3281,673.0,73164.0,1080.0
305,106011113,Singleton Region,4067.2349,4919,2055,698.0,87984.0,1000.0
306,111021218,Morisset - Cooranbong,330.5208,14959,6298,1154.0,58084.0,1260.0
307,114021285,Hill Top - Colo Vale,174.3752,6025,2249,400.0,81120.0,1512.0


## Cleaning BusinessStats.csv


In [7]:
rawData = pd.read_csv("BusinessStats.csv")

#Remove null values
noNull_data = checkNull(rawData)
businesStats_clean = noNull_data.copy()

numericData = ['area_id', 'number_of_businesses', 'accommodation_and_food_services', 'retail_trade', 'agriculture_forestry_and_fishing', 
               'health_care_and_social_assistance', 'public_administration_and_safety', 'transport_postal_and_warehousing']

for col in noNull_data.columns:
    
    if col in numericData:        
        if noNull_data[col].dtypes != int and noNull_data[col].dtypes != float:
            businesStats_clean[col] = noNull_data[col].str.replace(',', '')
            businesStats_clean[col] = pd.to_numeric(businesStats_clean[col])   

            
print ("\nCommas in numeric data removed and all numbers as type string converted to numeric values!")



No null values in. Good to go, no need for cleaning :)

Commas in numeric data removed and all numbers as type string converted to numeric values!


In [8]:
conn.execute("DROP TABLE IF EXISTS businessstats")

business_stats = """CREATE TABLE IF NOT EXISTS businessstats (
                         area_id   Integer PRIMARY KEY,
                         area_name VARCHAR(20),
                         number_of_businesses Integer,
                         accomodation_and_food_services Integer,
                         retail_trade Integer,
                         agriculture_forestry_and_fishing Integer,
                         health_care_and_social_assistance Integer,
                         public_administration_and_safety Integer,
                         transport_postal_and_warehousing Integer
                   )"""
conn.execute(business_stats)
print("Created table for business stats")

table_name = "businessstats"
businesStats_clean.to_sql(table_name, con=conn, if_exists='replace',index=False)

#Testing if table has been created
print ("Business Stats table created")
res = pd.read_sql_query('SELECT * FROM businessstats', conn)
res

Created table for business stats
Business Stats table created


Unnamed: 0,area_id,area_name,number_of_businesses,accommodation_and_food_services,retail_trade,agriculture_forestry_and_fishing,health_care_and_social_assistance,public_administration_and_safety,transport_postal_and_warehousing
0,101021007,Braidwood,629,26,27,280,11,0,35
1,101021008,Karabar,326,7,10,8,11,0,43
2,101021009,Queanbeyan,724,52,47,11,56,3,77
3,101021010,Queanbeyan - East,580,16,23,4,12,0,57
4,101021011,Queanbeyan Region,1642,39,63,292,34,7,81
...,...,...,...,...,...,...,...,...,...
2296,901011001,Christmas Island,0,0,0,0,0,0,0
2297,901021002,Cocos (Keeling) Islands,7,3,0,0,0,0,0
2298,901031003,Jervis Bay,6,0,3,0,0,0,0
2299,901041004,Norfolk Island,0,0,0,0,0,0,0


## Cleaning and Uploading Additional Data Set: RFS_FireStation

In [21]:
#Reading the json file
from __future__ import (absolute_import, division, print_function)
import os
import json

import matplotlib as mpl
import matplotlib.pyplot as plt

from shapely.geometry import Point, Polygon, MultiPolygon
import pandas as pd
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import *
from sqlalchemy import create_engine
import psycopg2
import psycopg2.extras


with open('RFSStation_EPSG4326.json') as f:
  fire_stations = json.load(f)
f.close()

fireStation_df = gpd.GeoDataFrame.from_features(fire_stations['RFSStation'])

conn.execute("DROP TABLE IF EXISTS rfsfirestations")

RFS_FireStations = """CREATE TABLE IF NOT EXISTS rfsfirestations (
                        coordinates GEOMETRY(POINT, 4326) PRIMARY KEY,
                        stationid INTEGER, 
                        station_name VARCHAR(20)
                   )"""

# try:
#     conn.execute(RFS_FireStations)
#     print("Successfully created table for fire_stations")

# except Exception as e:
#     print("Table not created.\n")
#     print(e)
    



In [22]:
# # table_name = "rfsfirestations"
# # fireStation_df.to_sql(table_name, con=conn, if_exists='replace',index=False)

# # res = pd.read_sql_query('SELECT * FROM rfsfirestations', conn)
# # res

# def create_wkt_point_element(geom,srid)
#     return WKTElement(geom.wkt, srid)


# citiesWkCpy = cities.copy()
# citiesWkCpy['location'] = citiesWkCpy['geometry'].apply(lambda x: create_wkt_point_element(geom=x, srid=srid))
# citiesWkCpy = citiesWkCpy.drop(columns="geometry")
# citiesWkCpy

# cities_table_name = "cities"
# srid = 4326
# citiesWkCpy.to_sql(cities_table_name, conn, if_exists='append', index=False,
#                   dtype={'location':Geomtry('POINT', srid)})

In [23]:

# Use GeoAlchemy's WKTElement to create a geom with SRID
# NOTE :: This time we are assuming everything is a point so only wkt conversion is required.
def create_wkt_point_element(geom,srid):
    return WKTElement(geom.wkt, srid)


# since we are altering data again, we should create another copy
citiesWkCpy = fireStation_df.copy()
citiesWkCpy['location'] = citiesWkCpy['geometry'].apply(lambda x: create_wkt_point_element(geom=x,srid=srid))
#delete the old column before insert
citiesWkCpy = citiesWkCpy.drop(columns="geometry")
citiesWkCpy

# Then insert the data from the GeoPandas DataFrame to PostGIS Table
# Use 'dtype' to specify column's type
# For the geom column, we will use GeoAlchemy's type 'Geometry'
cities_table_name = "cities"
srid = 4326
citiesWkCpy.to_sql(cities_table_name, conn, if_exists='append', index=False, 
                         dtype={'location': Geometry('POINT', srid)})

NameError: name 'srid' is not defined

In [24]:
import json

with open('RFSStation_EPSG4326.json') as f:
  fire_stations = json.load(f)

f.close()

data_length = len(fire_stations['RFSStation']['features'])
print("There are {} data points in this file".format(data_length))


fireStations_df = pd.DataFrame()

i = 0
temp_data = list()
while i < data_length:
    coord = fire_stations['RFSStation']['features'][i]['geometry']['coordinates']
    stationid = fire_stations['RFSStation']['features'][i]['properties']['stationid']
    station_name = fire_stations['RFSStation']['features'][i]['properties']['stationnam']
    
    i += 1
    fireStations_df = fireStations_df.append({"coordinates": coord, "stationid": stationid, "station_name": station_name }, ignore_index=True)

table_name = "rfsfirestations"
fireStations_df['coordinates'].astype(point)


RFS_FireStations = """CREATE TABLE IF NOT EXISTS rfsfirestations (
                        coordinates GEOMETRY(POINT, 4326) PRIMARY KEY,
                        stationid INTEGER, 
                        station_name VARCHAR(20)
                   )"""

try:
    conn.execute(RFS_FireStations)
    print("Successfully created table for fire_stations")

except Exception as e:
    print("Table not created.\n")
    print(e)

There are 1856 data points in this file


NameError: name 'point' is not defined

# Calculating Components for Fire Risk


Calculation below involves:
1. Calculating Population Density
2. Dwelling Density
3. Business Density
4. Assistive Service Density


In [32]:
#Calculating population density

res = pd.read_sql_query('SELECT * FROM neighbourhoods', conn)
res

population_density = """
                        Select land_area, population, population/land_area as "population_density" 
                        from neighbourhoods;

                    """
result = pd.read_sql_query(population_density, conn)
result

Unnamed: 0,land_area,population,population_density
0,643.8000,7590,11.789376
1,3208.6000,10986,3.423923
2,76795.1000,4841,0.063038
3,3379.3000,14237,4.213003
4,1691.2000,19385,11.462275
...,...,...,...
304,1570.4341,7931,5.050196
305,4067.2349,4919,1.209421
306,330.5208,14959,45.258876
307,174.3752,6025,34.551932


In [35]:
#Calculating dwellings density

res = pd.read_sql_query('SELECT * FROM neighbourhoods', conn)
res

dwellings_density = """
                        Select land_area, number_of_dwellings, number_of_dwellings/land_area as "dwellings_density" 
                        from neighbourhoods;

                    """
result = pd.read_sql_query(dwellings_density, conn)
result

Unnamed: 0,land_area,number_of_dwellings,dwellings_density
0,643.8000,2325,3.611370
1,3208.6000,3847,1.198965
2,76795.1000,1575,0.020509
3,3379.3000,4450,1.316841
4,1691.2000,6373,3.768330
...,...,...,...
304,1570.4341,3281,2.089231
305,4067.2349,2055,0.505257
306,330.5208,6298,19.054777
307,174.3752,2249,12.897476


In [44]:
#Calculating businessstats density

res = pd.read_sql_query('SELECT * FROM businessstats', conn)
res

businessstats_density = """
                        Select B.number_of_businesses, land_area, B.number_of_businesses/land_area as "business_density"
                        from businessstats B, neighbourhoods;

                    """
result = pd.read_sql_query(businessstats_density, conn)
result


Unnamed: 0,number_of_businesses,land_area,business_density
0,629,643.8000,0.977011
1,629,3208.6000,0.196036
2,629,76795.1000,0.008191
3,629,3379.3000,0.186133
4,629,1691.2000,0.371925
...,...,...,...
711004,1215,1570.4341,0.773671
711005,1215,4067.2349,0.298729
711006,1215,330.5208,3.676017
711007,1215,174.3752,6.967734


In [46]:
#Calculating businessstats density


businessstats_density = """
                        Select B.number_of_businesses, land_area, B.number_of_businesses/land_area as "business_density"
                        from businessstats B, neighbourhoods;

                    """
result = pd.read_sql_query(businessstats_density, conn)
result

Unnamed: 0,number_of_businesses,land_area,business_density
0,629,643.8000,0.977011
1,629,3208.6000,0.196036
2,629,76795.1000,0.008191
3,629,3379.3000,0.186133
4,629,1691.2000,0.371925
...,...,...,...
711004,1215,1570.4341,0.773671
711005,1215,4067.2349,0.298729
711006,1215,330.5208,3.676017
711007,1215,174.3752,6.967734
