## Download External Dataset

### Table of Contents
- [Download External Dataset](#download-external-dataset)

In [1]:
import pandas as pd
import numpy as np
from csv import writer
import re
import unicodedata
import json
import os

from bs4 import BeautifulSoup
import requests

import geopandas as gpd
import folium

In [2]:
# import required libraries
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, isnan, when, count, mean, udf, split, unix_timestamp, from_unixtime, lower
from pyspark.sql.types import StringType, IntegerType, FloatType

# Settings handed to the session builder, applied in the order listed.
spark_settings = {
    # executor / driver resources
    "spark.executor.memory": "8g",
    "spark.driver.memory": "16g",
    "spark.executor.cores": "2",
    "spark.executor.instances": "6",
    "spark.sql.session.timeZone": "Etc/UTC",

    # off-heap memory
    "spark.memory.offHeap.enabled": "true",
    "spark.memory.offHeap.size": "8g",

    # eager evaluation in the REPL and parquet metadata caching
    "spark.sql.repl.eagerEval.enabled": 'true',
    "spark.sql.parquet.cacheMetadata": 'true',
}

# Local master; if a YARN master node is available, consider using it instead.
session_builder = SparkSession.builder.master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Build (or reuse) the session -- change the app name here.
spark = session_builder.appName("Pyspark Start Template").getOrCreate()

# Lower the log verbosity so only errors are reported.
spark.sparkContext.setLogLevel('ERROR')

22/09/16 17:07:21 WARN Utils: Your hostname, QuzihandeMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.96.115 instead (on interface en0)
22/09/16 17:07:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/16 17:07:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Root data folder, given relative to the directory the notebook is run from.
OUTPUT_RELATIVE_PATH = "../../data/"

def create_path():
    """
    Create the data directories used to store external datasets.

    Ensures that ``curated/external-data`` and ``raw/external-data`` exist
    under ``OUTPUT_RELATIVE_PATH``, creating any missing parent directories.
    Directories that already exist are left untouched, so the call can be
    repeated safely. Prints a confirmation message when done.
    """

    path = ["curated/external-data", "raw/external-data"]
    for target_dir in path:
        # exist_ok=True replaces the separate os.path.exists() check, which
        # could race with another process creating the same directory.
        os.makedirs(OUTPUT_RELATIVE_PATH + target_dir, exist_ok=True)
    print('Already Create Paths')

create_path()

Already Create Paths


In [4]:
# Extract the two coordinates of each point geometry with a regex
def point_to_coor(df):
    """
    Add ``lat`` and ``lon`` columns parsed from the ``geometry`` column.

    Each geometry value is converted to text and the first two numbers found
    in it are stored, row by row, as ``lat`` (first number) and ``lon``
    (second number). Rows with fewer than two numbers get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geometry`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with float ``lat`` and ``lon`` columns added.
    """
    # NOTE(review): the first number is stored as "lat", as in the original
    # code. WKT points are usually written "POINT (x y)", i.e. lon then lat --
    # confirm the order against the actual geometry strings.

    # The optional leading "-" keeps the sign of negative coordinates.
    number_pattern = r"-?\d+\.?\d*"
    numbers = df["geometry"].astype(str).str.findall(number_pattern)

    # .str[k] picks the k-th match of every row (NaN when it is missing).
    df["lat"] = numbers.str[0].astype(float)
    df["lon"] = numbers.str[1].astype(float)
    return df


In [5]:
# Folder for the raw external datasets. File names are appended to this string
# directly, so the trailing slash is required.
output_dir = '../../data/raw/external-data/'

### [Property and elector count by postcode](https://discover.data.vic.gov.au/dataset/victorian-electors-by-locality-postcode-and-electorates)

In [6]:
# Read the 'Place_Names_Electronic' sheet; header=2 takes the column names
# from the third row of the sheet.
locality_workbook = '../../data/raw/external-data/LocalityFinder_postcode.xlsx'
LocalityFinder_postcode = pd.read_excel(
    locality_workbook,
    sheet_name='Place_Names_Electronic',
    header=2,
)

In [7]:
# Raw spreadsheet headers (they carry a literal "_x000D_" plus a newline)
# mapped to the clean names used from here on.
column_names = {
    'Locality Name': 'suburb_name',
    'Post_x000D_\nCode': 'postcode',
    'Property_x000D_\nCount': 'property_count',
    'Elector_x000D_\nCount': 'elector_count',
}
columns = list(column_names)

# Rename first, then select by the clean names: rename() ignores headers that
# are already renamed, so re-running this cell does not raise a KeyError.
LocalityFinder_postcode = (
    LocalityFinder_postcode
    .rename(columns=column_names)[list(column_names.values())]
)

# Sum only the numeric count columns. A bare .sum() would also aggregate the
# text column suburb_name (string concatenation on pandas >= 2.0).
property_and_elector = (
    LocalityFinder_postcode
    .groupby('postcode')[['property_count', 'elector_count']]
    .sum()
)

In [8]:
# Write the per-postcode totals to a CSV file inside output_dir
filename = 'property_and_elector_by_postcode.csv'
output_dir_full = output_dir + filename
property_and_elector.to_csv(path_or_buf=output_dir_full)

In [11]:
property_and_elector

Unnamed: 0_level_0,property_count,elector_count
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
3000,15408,9168
3002,3308,3823
3003,3068,3265
3004,6439,6387
3006,10636,8631
...,...,...
3990,53,99
3991,199,336
3992,575,863
3995,5874,8574
