In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Cleaning and Uploading to pgAdmin

Methods for cleaning data includes:

- checkNull(dataset, beenCleaned): check if there's a null value
- removeNull(dataset): drop the null value in the dataset


In [None]:
#Checking if null value exist in dataset
def checkNull( dataset ):
    
    beenCleaned = False    
    
    for num in list(dataset.isnull().sum()):

        if num != 0:
            print("Null value found. Coverted null values to -...")
            dataset = dataset.dropna()
            beenCleaned = True
            
    if beenCleaned:
        print("\nNull values have been removed. You're good to go!")

    else:
        print("\nNo null values in. Good to go, no need for cleaning :)")

    return dataset
    

Functions to upload dataset to pgAdmin

In [74]:
from sqlalchemy import create_engine
import pandas as pd
import psycopg2
import psycopg2.extras
import json
import os

data_path = "."

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        HOST       = db_conn_dict['host']
        DB_USER    = db_conn_dict['user']
        DB_PW      = db_conn_dict['password']
        DEFAULT_DB = db_conn_dict['user']

        try:
            db = create_engine('postgres+psycopg2://'+DB_USER+':'+DB_PW+'@'+HOST+'/'+DEFAULT_DB, echo=False)
            conn = db.connect()
            print('connected')
        except Exception as e:
            print("unable to connect to the database")
            print(e)
            
        return db,conn

credfilepath = os.path.join(data_path, "data2x02_db.json")
db, conn = pgconnect(credfilepath)


connected


## Cleaning and Uploading StatisticalAreas.csv


In [75]:
rawData = pd.read_csv("StatisticalAreas.csv")

#Check if contains NULL
cleanData = checkNull(rawData)

#Cleaning data so that only numeric values are included in area_id
if cleanData['area_id'].dtypes != int:
    print("\nUpdated area_id data type from " + str(cleanData['area_id'].dtypes) + " to integer & removed data with wrong type")
    cleanData = cleanData[cleanData.area_id.str.isnumeric()]
    cleanData['area_id'].astype(int)

#Cleaning data so that only numeric values are included in parent_area_id
if cleanData['parent_area_id'].dtypes != int:
    print("\nUpdated area_id data type from " + str(cleanData['parent_area_id'].dtypes) + " to integer & removed data with wrong type")
    cleanData = cleanData[cleanData.parent_area_id.str.isnumeric()]
    cleanData['parent_area_id'].astype(int)

statisticalAreas_clean = cleanData



No null values in. Good to go, no need for cleaning :)


In [76]:
#Uploading StatisticalAreas.csv to pgAdmin

conn.execute("DROP TABLE IF EXISTS statisticalareas")

statistical_areas = """CREATE TABLE IF NOT EXISTS statisticalareas (
                         area_id   Integer PRIMARY KEY,
                         area_name VARCHAR(20),
                         parent_area_id Integer
                   )"""
conn.execute(statistical_areas)
print("Successfully, created table for statistical areas...")

table_name = "statisticalareas"
statisticalAreas_clean.to_sql(table_name, con=conn, if_exists='replace',index=False)
print ("\nUploaded clean statistical areas data to pgAdmin")
res = pd.read_sql_query("SELECT * FROM statisticalareas",conn)
res

Successfully, created table for statistical areas...

Uploaded clean statistical areas data to pgAdmin


Unnamed: 0,area_id,area_name,parent_area_id
0,1,New South Wales,0
1,10,Greater Sydney,1
2,11,Rest of NSW,1
3,2,Victoria,0
4,20,Greater Melbourne,2
...,...,...,...
429,106,Hunter Valley exc Newcastle,11
430,11102,Lake Macquarie - West,111
431,111,Newcastle and Lake Macquarie,11
432,11402,Southern Highlands,114


## Cleaning and Uploading Neighbourhoods.csv


In [None]:
rawData = pd.read_csv("Neighbourhoods.csv")
#Check if there's null values

noNull_data = checkNull(rawData)
neighbourhoods_clean = noNull_data.copy()
numericData = ['area_id', 'land_area', 'population', 'number_of_dwellings', 'number_of_dwellings', 
               'number_of_businesses', 'median_annual_household_income', 'avg_monthly_rent']

for col in noNull_data.columns:
    
    if col in numericData:        
        if noNull_data[col].dtypes != int and noNull_data[col].dtypes != float:
            neighbourhoods_clean[col] = noNull_data[col].str.replace(',', '')
            neighbourhoods_clean[col] = pd.to_numeric(neighbourhoods_clean[col])   
            
print ("\nCommas in numeric data removed and all numbers as type string converted to numeric values!")

#Uploading Neighbourhoods.csv to pgAdmin

conn.execute("DROP TABLE IF EXISTS neighbourhoods")

neighbourhoods = """CREATE TABLE IF NOT EXISTS neighbourhoods (
                         area_id   Integer PRIMARY KEY,
                         area_name VARCHAR(20),
                         land_area Integer,
                         population Integer,
                         number_of_dwellings Integer,
                         number_of_businesses Integer,
                         median_annual_household_income Integer,
                         avg_monthly_rent Integer
                   )"""
conn.execute(neighbourhoods)
print("\nSuccessfully created neighbourhoods table.")

#Testing if table has been created
table_name = "neighbourhoods"
neighbourhoods_clean.to_sql(table_name, con=conn, if_exists='replace',index=False)

res = pd.read_sql_query('SELECT * FROM neighbourhoods', conn)
res


Null value found. Coverted null values to -1...
Null value found. Coverted null values to -1...
Null value found. Coverted null values to -1...
Null value found. Coverted null values to -1...

Null values have been removed. You're good to go!

Commas in numeric data removed and all numbers as type string converted to numeric values!


## Cleaning BusinessStats.csv


In [None]:
rawData = pd.read_csv("BusinessStats.csv")

#Remove null values
noNull_data = checkNull(rawData)
businesStats_clean = noNull_data.copy()

numericData = ['area_id', 'number_of_businesses', 'accommodation_and_food_services', 'retail_trade', 'agriculture_forestry_and_fishing', 
               'health_care_and_social_assistance', 'public_administration_and_safety', 'transport_postal_and_warehousing']

for col in noNull_data.columns:
    
    if col in numericData:        
        if noNull_data[col].dtypes != int and noNull_data[col].dtypes != float:
            businesStats_clean[col] = noNull_data[col].str.replace(',', '')
            businesStats_clean[col] = pd.to_numeric(businesStats_clean[col])   

            
print ("\nCommas in numeric data removed and all numbers as type string converted to numeric values!")


In [None]:
conn.execute("DROP TABLE IF EXISTS businessstats")

business_stats = """CREATE TABLE IF NOT EXISTS businessstats (
                         area_id   Integer PRIMARY KEY,
                         area_name VARCHAR(20),
                         number_of_businesses Integer,
                         accomodation_and_food_services Integer,
                         retail_trade Integer,
                         agriculture_forestry_and_fishing Integer,
                         health_care_and_social_assistance Integer,
                         public_administration_and_safety Integer,
                         transport_postal_and_warehousing Integer
                   )"""
conn.execute(business_stats)
print("Created table for business stats")

table_name = "businessstats"
businesStats_clean.to_sql(table_name, con=conn, if_exists='replace',index=False)

#Testing if table has been created
print ("Business Stats table created")
res = pd.read_sql_query('SELECT * FROM businessstats', conn)
res

## Cleaning and Uploading Additional Data Set: RFS_FireStation

In [None]:
#Reading the json file
import json
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame

with open('RFSStation_EPSG4326.json') as f:
  fire_stations = json.load(f)
f.close()

fireStation_df = gpd.GeoDataFrame.from_features(fire_stations['RFSStation'])

In [None]:
#Creating table for firestations
conn.execute("DROP TABLE IF EXISTS rfsfirestations")

RFS_FireStations = """CREATE TABLE IF NOT EXISTS rfsfirestations (
                        coordinates GEOMETRY PRIMARY KEY,
                        stationid INTEGER, 
                        station_name VARCHAR(20)
                   )"""

try:
     conn.execute(RFS_FireStations)
     print("Successfully created table for fire_stations")

except Exception as e:
     print("Table not created.\n")
     print(e)
    
fireStation_df.to_postgis('rfsfirestations', conn, if_exists='replace')
res = pd.read_sql_query('SELECT * FROM rfsfirestations', conn)
res


# Joining Tables

# Calculating Components for Fire Risk


Calculation below involves:
1. Calculating Population Density
2. Dwelling Density
3. Business Density
4. Assistive Service Density


In [32]:
#Calculating population density

res = pd.read_sql_query('SELECT * FROM neighbourhoods', conn)
res

population_density = """
                        Select land_area, population, population/land_area as "population_density" 
                        from neighbourhoods;

                    """
result = pd.read_sql_query(population_density, conn)
result

Unnamed: 0,land_area,population,population_density
0,643.8000,7590,11.789376
1,3208.6000,10986,3.423923
2,76795.1000,4841,0.063038
3,3379.3000,14237,4.213003
4,1691.2000,19385,11.462275
...,...,...,...
304,1570.4341,7931,5.050196
305,4067.2349,4919,1.209421
306,330.5208,14959,45.258876
307,174.3752,6025,34.551932


In [35]:
#Calculating dwellings density

res = pd.read_sql_query('SELECT * FROM neighbourhoods', conn)
res

dwellings_density = """
                        Select land_area, number_of_dwellings, number_of_dwellings/land_area as "dwellings_density" 
                        from neighbourhoods;

                    """
result = pd.read_sql_query(dwellings_density, conn)
result

Unnamed: 0,land_area,number_of_dwellings,dwellings_density
0,643.8000,2325,3.611370
1,3208.6000,3847,1.198965
2,76795.1000,1575,0.020509
3,3379.3000,4450,1.316841
4,1691.2000,6373,3.768330
...,...,...,...
304,1570.4341,3281,2.089231
305,4067.2349,2055,0.505257
306,330.5208,6298,19.054777
307,174.3752,2249,12.897476


In [44]:
#Calculating businessstats density

res = pd.read_sql_query('SELECT * FROM businessstats', conn)
res

businessstats_density = """
                        Select B.number_of_businesses, land_area, B.number_of_businesses/land_area as "business_density"
                        from businessstats B, neighbourhoods;
                    """
result = pd.read_sql_query(businessstats_density, conn)
result


Unnamed: 0,number_of_businesses,land_area,business_density
0,629,643.8000,0.977011
1,629,3208.6000,0.196036
2,629,76795.1000,0.008191
3,629,3379.3000,0.186133
4,629,1691.2000,0.371925
...,...,...,...
711004,1215,1570.4341,0.773671
711005,1215,4067.2349,0.298729
711006,1215,330.5208,3.676017
711007,1215,174.3752,6.967734


In [None]:
#Calculating businessstats density


businessstats_density = """
                        Select B.number_of_businesses, land_area, B.number_of_businesses/land_area as "business_density"
                        from businessstats B, neighbourhoods;

                    """
result = pd.read_sql_query(businessstats_density, conn)
result