In [40]:
import pandas as pd
import geopandas as gpd
import requests, json, geojson
from shapely.geometry import shape
import shapely.wkt

import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf
from pyspark.sql import SparkSession

In [2]:
spark = (
    # Create a spark session (which will run spark jobs)
    SparkSession.builder.appName("Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory','10g')
    .config('spark.driver.memory','12g')
    .config('spark.driver.maxResultsSize', '10GiB')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/28 13:03:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/28 13:03:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
POA_df = pd.read_csv('../data/tables/external_postcode.csv')
pos_df = POA_df.drop_duplicates(['postcode'])
POA_geo = gpd.GeoDataFrame(data = pos_df)

In [4]:
POA_df.head(5)

Unnamed: 0,postcode,place_name,state_name,state_code,latitude,longitude,accuracy
0,200,Australian National University,Australian Capital Territory,ACT,-35.2777,149.1189,1.0
1,221,Barton,Australian Capital Territory,ACT,-35.3049,149.1412,4.0
2,2540,Wreck Bay,Australian Capital Territory,ACT,-35.1627,150.6907,4.0
3,2540,Hmas Creswell,Australian Capital Territory,ACT,-35.028,150.5501,3.0
4,2540,Jervis Bay,Australian Capital Territory,ACT,-35.1333,150.7,4.0


In [5]:
sa2_sf = gpd.read_file('../data/tables/external_SA2/SA2_2021_AUST_GDA2020.shp')
sa2_sf['geometry'] = sa2_sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sa2_gdf = gpd.GeoDataFrame(sa2_sf)
sa2_gdf = sa2_gdf.dropna()
sa2_gdf['centroid'] = sa2_gdf['geometry'].apply(lambda x: (x.centroid.y, x.centroid.x))
bound = sa2_gdf['geometry'].bounds
sa2_geo = sa2_gdf.join(bound)

In [6]:
sa2_geo_info = sa2_geo[['SA2_CODE21', 'geometry','centroid']].drop_duplicates('SA2_CODE21')

In [7]:
def match_sa2_and_postcode(sa2_df, pos_df):
    # pos_new_df = pos_df.reindex(columns=[*pos_df.columns.tolist(), 'SA2_CODE21'], fill_value=0)
    for i in pos_df.index:
        for j in sa2_df.index:
            if ((pos_df['latitude'][i] >= sa2_df['miny'][j])& (pos_df['latitude'][i] <= sa2_df['maxy'][j])
                &(pos_df['longitude'][i] >= sa2_df['minx'][j]) & (pos_df['longitude'][i] <= sa2_df['maxx'][j])):
                pos_df.loc[i, 'SA2_CODE21'] = sa2_df['SA2_CODE21'][j]
    return pos_df

In [8]:
mix_df = match_sa2_and_postcode(sa2_geo, POA_geo)

In [9]:
mix_df = mix_df.dropna()

In [15]:
mix_df.dtypes

postcode        int64
place_name     object
state_name     object
state_code     object
latitude      float64
longitude     float64
accuracy      float64
SA2_CODE21      int64
dtype: object

In [16]:
sa2_geo_info.loc[:,'SA2_CODE21'] = sa2_geo_info.loc[:,'SA2_CODE21'].astype('int')

In [17]:
sa2_geo_info.dtypes

SA2_CODE21       int64
geometry      geometry
centroid        object
dtype: object

In [18]:
final_df = pd.merge(mix_df, sa2_geo_info, on='SA2_CODE21',how='inner')

In [19]:
final_df.head(5)

Unnamed: 0,postcode,place_name,state_name,state_code,latitude,longitude,accuracy,SA2_CODE21,geometry,centroid
0,200,Australian National University,Australian Capital Territory,ACT,-35.2777,149.1189,1.0,801051049,"POLYGON ((149.11002 -35.28400, 149.10987 -35.2...","(-35.281339155480744, 149.11505674306417)"
1,221,Barton,Australian Capital Territory,ACT,-35.3049,149.1412,4.0,801061129,"POLYGON ((149.12961 -35.30890, 149.12962 -35.3...","(-35.30925425962516, 149.13585270514673)"
2,2540,Wreck Bay,Australian Capital Territory,ACT,-35.1627,150.6907,4.0,901031003,"MULTIPOLYGON (((150.69567 -35.18295, 150.69556...","(-35.15235858081164, 150.6912049877714)"
3,2600,Deakin West,Australian Capital Territory,ACT,-35.3126,149.1278,3.0,801061063,"POLYGON ((149.11789 -35.31529, 149.11887 -35.3...","(-35.3171832215981, 149.12476387791898)"
4,2601,City,Australian Capital Territory,ACT,-35.2813,149.1293,4.0,801051053,"POLYGON ((149.12464 -35.27583, 149.12474 -35.2...","(-35.28126033741348, 149.12916957590815)"


In [20]:
final_df = final_df.dropna()

In [21]:
final_df.dtypes

postcode         int64
place_name      object
state_name      object
state_code      object
latitude       float64
longitude      float64
accuracy       float64
SA2_CODE21       int64
geometry      geometry
centroid        object
dtype: object

In [23]:
len(final_df)

3306

- seperate data into state

In [28]:
def get_states(dataframe):
    state_list = []
    for state in dataframe['state_code']:
        if state not in state_list:
            state_list.append(state)
    return state_list        

In [29]:
state_list = get_states(final_df)

In [30]:
state_list

['ACT', 'NSW', 'QLD', 'VIC', 'NT', 'SA', 'TAS', 'WA']

In [31]:
ACT_df = final_df[final_df['state_code'] == 'ACT']
ACT_JSON = ACT_df[['postcode', 'place_name', 'geometry']].to_json()

In [32]:
NSW_df = final_df[final_df['state_code'] == 'NSW']
NSW_JSON = NSW_df[['postcode', 'place_name', 'geometry']].to_json()

In [54]:
len(NSW_df)

960

In [33]:
QLD_df = final_df[final_df['state_code'] == 'QLD']
QLD_JSON = QLD_df[['postcode', 'place_name', 'geometry']].to_json()

In [55]:
len(QLD_df)

464

In [56]:
VIC_df = final_df[final_df['state_code'] == 'VIC']
VIC_JSON =VIC_df[['postcode', 'place_name', 'geometry']].to_json()

In [57]:
len(VIC_df)

751

In [35]:
NT_df = final_df[final_df['state_code'] == 'NT']
NT_JSON = NT_df[['postcode', 'place_name', 'geometry']].to_json()

In [58]:
len(NT_df)

49

In [36]:
SA_df = final_df[final_df['state_code'] == 'SA']
SA_JSON = SA_df[['postcode', 'place_name', 'geometry']].to_json()

In [59]:
len(SA_df)

391

In [37]:
TAS_df = final_df[final_df['state_code'] == 'TAS']
TAS_JSON = TAS_df[['postcode', 'place_name', 'geometry']].to_json()

In [60]:
len(TAS_df)

158

In [38]:
WA_df = final_df[final_df['state_code'] == 'WA']
WA_JSON = WA_df[['postcode', 'place_name', 'geometry']].to_json()

In [61]:
len(WA_df)

500

In [64]:
import folium
m = folium.Map(location=[-26, 133], tiles="Stamen Terrain", zoom_start=5)

# Mouse over display
tooltip=folium.features.GeoJsonTooltip(
    fields=['postcode'],
    localize=True,
    sticky=False,
    labels=True,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """)

# draw map
folium.GeoJson(
    ACT_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

# # station popup
# for postcode, place_name, centroid in sa2_map:
#     m.add_child(
#         folium.Marker(location=centroid, popup=f'{postcode}\n>>>>\n{place_name}')
#     )

# m.save('../plots/foliumStationToNTA.png')
# m

<folium.features.GeoJson at 0x7f886b157490>

In [52]:
# draw map
folium.GeoJson(
    NSW_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

<folium.features.GeoJson at 0x7f886c315310>

In [None]:
# draw map
folium.GeoJson(
    QLD_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

In [None]:
# draw map
folium.GeoJson(
    VIC_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

In [67]:
# draw map
folium.GeoJson(
    NT_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

<folium.features.GeoJson at 0x7f8859bf1690>

In [None]:
# draw map
folium.GeoJson(
    SA_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

In [None]:
# draw map
folium.GeoJson(
    TAS_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

In [None]:
# draw map
folium.GeoJson(
    WA_JSON,
    # style_function=lambda x: {'fillColor': 'orange'}
    tooltip=tooltip
).add_to(m)

In [22]:
final_df.to_parquet('../data/curated/sa2_pos_geometry.parquet')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  """Entry point for launching an IPython kernel.


- Compare with the consumer data

In [23]:
geo_data = gpd.read_parquet('../data/curated/sa2_pos_geometry.parquet')

In [24]:
geo_data.dtypes

postcode         int64
place_name      object
state_name      object
state_code      object
latitude       float64
longitude      float64
accuracy       float64
SA2_CODE21       int64
geometry      geometry
centroid        object
dtype: object

In [64]:
geo_data.head()

Unnamed: 0,postcode,place_name,state_name,state_code,latitude,longitude,accuracy,SA2_CODE21,geometry,centroid
0,200,Australian National University,Australian Capital Territory,ACT,-35.2777,149.1189,1.0,801051049,"POLYGON ((149.11002 -35.28400, 149.10987 -35.2...","[-35.281339155480744, 149.11505674306417]"
1,221,Barton,Australian Capital Territory,ACT,-35.3049,149.1412,4.0,801061129,"POLYGON ((149.12961 -35.30890, 149.12962 -35.3...","[-35.30925425962516, 149.13585270514673]"
2,2540,Wreck Bay,Australian Capital Territory,ACT,-35.1627,150.6907,4.0,901031003,"MULTIPOLYGON (((150.69567 -35.18295, 150.69556...","[-35.15235858081164, 150.6912049877714]"
3,2600,Deakin West,Australian Capital Territory,ACT,-35.3126,149.1278,3.0,801061063,"POLYGON ((149.11789 -35.31529, 149.11887 -35.3...","[-35.3171832215981, 149.12476387791898]"
4,2601,City,Australian Capital Territory,ACT,-35.2813,149.1293,4.0,801051053,"POLYGON ((149.12464 -35.27583, 149.12474 -35.2...","[-35.28126033741348, 149.12916957590815]"


In [25]:
consumer_df = pd.read_parquet('../data/curated/new_consumer_data.parquet')

In [26]:
consumer_df.head()

Unnamed: 0,consumer_id,name,address,state,postcode,gender,user_id
0,1195503,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1
1,179208,Mary Smith,3764 Amber Oval,NSW,2782,Female,2
2,1194530,Jill Jones MD,40693 Henry Greens,NT,862,Female,3
3,154128,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,4
4,712975,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,5


In [27]:
consumer_df.loc[:,'postcode'] = consumer_df.loc[:,'postcode'].astype('int')
consumer_df.dtypes

consumer_id    object
name           object
address        object
state          object
postcode        int64
gender         object
user_id         int64
dtype: object

In [69]:
conn_by_postcode = pd.merge(consumer_df,geo_data, on='postcode', how='left')

In [70]:
conn_by_postcode.head()

Unnamed: 0,consumer_id,name,address,state,postcode,gender,user_id,place_name,state_name,state_code,latitude,longitude,accuracy,SA2_CODE21,geometry,centroid
0,1195503,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1,Guildford,Western Australia,WA,-31.9,115.9667,4.0,504031066.0,"POLYGON ((115.98662 -31.88821, 115.98724 -31.8...","[-31.894690777603778, 116.00345049359626]"
1,179208,Mary Smith,3764 Amber Oval,NSW,2782,Female,2,Wentworth Falls,New South Wales,NSW,-33.7103,150.3753,4.0,124021456.0,"POLYGON ((150.17345 -33.81724, 150.17340 -33.8...","[-33.99341257065195, 150.28795736716887]"
2,1194530,Jill Jones MD,40693 Henry Greens,NT,862,Female,3,Warrego,Northern Territory,NT,-19.4362,133.8208,4.0,702021055.0,"POLYGON ((132.40205 -17.30632, 132.40207 -17.0...","[-19.280833088965764, 135.0622557037591]"
3,154128,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,4,Katoomba,New South Wales,NSW,-33.7198,150.3074,4.0,124021456.0,"POLYGON ((150.17345 -33.81724, 150.17340 -33.8...","[-33.99341257065195, 150.28795736716887]"
4,712975,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,5,Lake Biddy,Western Australia,WA,-33.0019,118.9292,4.0,509031247.0,"POLYGON ((117.63905 -32.58162, 117.63989 -32.5...","[-32.83646424681477, 118.9108199259583]"


In [71]:
temp = conn_by_postcode[conn_by_postcode.SA2_CODE21.isna()]
print( f'There are {len(conn_by_postcode)} data, and {len(temp)} of them have no null value.')

There are 499999 data, and 4024 of them have no null value.


In [72]:
len(temp.drop_duplicates('postcode'))
temp.drop_duplicates('postcode').head()

Unnamed: 0,consumer_id,name,address,state,postcode,gender,user_id,place_name,state_name,state_code,latitude,longitude,accuracy,SA2_CODE21,geometry,centroid
140,751248,James Donovan,455 Trevino Ports Apt. 032,WA,6435,Undisclosed,141,,,,,,,,,
192,1010604,Eric Glover,27850 Charles Stream Suite 645,QLD,4314,Male,193,,,,,,,,,
879,474979,Ashley Adams,507 Smith Islands,SA,5717,Female,880,,,,,,,,,
1461,1258545,Katrina Rodriguez,9530 Jamie Trace Suite 775,NSW,1441,Female,1462,,,,,,,,,
1548,31376,Brandy Russell,88973 Anthony Roads Apt. 529,NT,874,Female,1549,,,,,,,,,


In [73]:
null_pos = temp.drop_duplicates('postcode').reset_index()

In [74]:
null_pos.head()

Unnamed: 0,index,consumer_id,name,address,state,postcode,gender,user_id,place_name,state_name,state_code,latitude,longitude,accuracy,SA2_CODE21,geometry,centroid
0,140,751248,James Donovan,455 Trevino Ports Apt. 032,WA,6435,Undisclosed,141,,,,,,,,,
1,192,1010604,Eric Glover,27850 Charles Stream Suite 645,QLD,4314,Male,193,,,,,,,,,
2,879,474979,Ashley Adams,507 Smith Islands,SA,5717,Female,880,,,,,,,,,
3,1461,1258545,Katrina Rodriguez,9530 Jamie Trace Suite 775,NSW,1441,Female,1462,,,,,,,,,
4,1548,31376,Brandy Russell,88973 Anthony Roads Apt. 529,NT,874,Female,1549,,,,,,,,,


In [39]:
poa_sf = gpd.read_file('../data/tables/external_POA/POA_2021_AUST_GDA2020.shp')

In [41]:
poa_sf.head()

Unnamed: 0,POA_CODE21,POA_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,SHAPE_Leng,SHAPE_Area,geometry
0,800,800,AUS,Australia,3.1731,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.081893,0.000264,"POLYGON ((130.83681 -12.45376, 130.83684 -12.4..."
1,810,810,AUS,Australia,24.4283,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.241859,0.002031,"POLYGON ((130.89986 -12.36567, 130.89875 -12.3..."
2,812,812,AUS,Australia,35.8899,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.278788,0.002983,"POLYGON ((130.91915 -12.40786, 130.91831 -12.4..."
3,820,820,AUS,Australia,39.0642,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.409134,0.003248,"POLYGON ((130.85260 -12.43994, 130.85089 -12.4..."
4,822,822,AUS,Australia,150775.803,http://linked.data.gov.au/dataset/asgsed3/POA/...,90.601831,12.564238,"MULTIPOLYGON (((136.56648 -12.08392, 136.56704..."


In [42]:
poa_sf['LOCI_URI21'][0]

'http://linked.data.gov.au/dataset/asgsed3/POA/0800'

In [43]:
url = 'http://linked.data.gov.au/dataset/asgsed3/POA/'

In [57]:
def merge_code_and_link(code, link):
    code_string = str(code)
    return link + code_string

In [62]:
def get_geo_info(link):
    req = requests.get(link, headers={'accept': 'application/geo+json'})
    res_json = json.loads(req.text)
    try:
        geo = res_json['geometry']
    except Exception:
        return None
    return shape(geo)

In [58]:
def request_data(dataframe, link):
    new_df = dataframe.copy()
    for i in new_df.index:
        pos_url = merge_code_and_link(new_df['postcode'][i], link)
        new_df.loc[i, 'pos_url'] = pos_url
    return new_df

In [59]:
new_df = request_data(null_pos, url)

In [60]:
new_df.head()

Unnamed: 0,index,consumer_id,name,address,state,postcode,gender,user_id,place_name,state_name,state_code,latitude,longitude,accuracy,SA2_CODE21,geometry,centroid,pos_url
0,140,751248,James Donovan,455 Trevino Ports Apt. 032,WA,6435,Undisclosed,141,,,,,,,,,,http://linked.data.gov.au/dataset/asgsed3/POA/...
1,192,1010604,Eric Glover,27850 Charles Stream Suite 645,QLD,4314,Male,193,,,,,,,,,,http://linked.data.gov.au/dataset/asgsed3/POA/...
2,879,474979,Ashley Adams,507 Smith Islands,SA,5717,Female,880,,,,,,,,,,http://linked.data.gov.au/dataset/asgsed3/POA/...
3,1461,1258545,Katrina Rodriguez,9530 Jamie Trace Suite 775,NSW,1441,Female,1462,,,,,,,,,,http://linked.data.gov.au/dataset/asgsed3/POA/...
4,1548,31376,Brandy Russell,88973 Anthony Roads Apt. 529,NT,874,Female,1549,,,,,,,,,,http://linked.data.gov.au/dataset/asgsed3/POA/874


In [88]:
geo = new_df.pos_url.apply(lambda x: get_geo_info(x))

In [92]:
gdf = gpd.GeoDataFrame(data=new_df[['state', 'postcode']], geometry=geo, crs=4329)

In [96]:
gdf.dtypes

state         object
postcode       int64
geometry    geometry
dtype: object

In [99]:
# for i in gdf.index:
#     if gdf.loc[i, 'geometry'] is not 'None':
#         gdf.loc[i, 'centroid'] = gdf.loc[i, 'geometry'].apply(lambda x: (x.centroid.y, x.centroid.x)) 

AttributeError: 'NoneType' object has no attribute 'apply'