In [1]:
import numpy as np

import pandas as pd
import geopandas as gpd

#import matplotlib.pyplot as plt
#import seaborn as sns
#import folium

import requests

import json

from random import randint

import pickle

In [2]:
from bokeh.io import output_file, show, output_notebook, export_png
from bokeh.models import ColumnDataSource, GeoJSONDataSource, LinearColorMapper, ColorBar, HoverTool, Select
from bokeh.plotting import figure
from bokeh.palettes import brewer
#from bokeh.sampledata.sample_geojson import geojson
from bokeh.layouts import widgetbox, row, column
from bokeh.io.doc import curdoc


from bokeh.tile_providers import CARTODBPOSITRON, get_provider

#output_notebook()

In [26]:
def keep_forecast_only(dicty):
    """Keeping only the forecasted median price and the year it is forcasted for

    :param dicty: dictionary containing a mapping Suburb-> prediction dataframe
    :return: dictionary with mapping suburb -> [year, Median_rental_price]
    """    
    for key, df in dicty.items():
        dicty[key] = df[['year', 'Median_rental_price']]
    return dicty

In [27]:
DATA_DIR = '../data/curated/'
forecast_names = ['forecast_bed_1_flat_covid', 'forecast_bed_2_flat_covid', 'forecast_bed_3_flat_covid', 'forecast_bed_2_house_covid', 'forecast_bed_3_house_covid', 'forecast_bed_4_house_covid']
forecasts = {}
for name in forecast_names:
    with open(f'{DATA_DIR}{name}', 'rb') as f:
        stuff = pickle.load(f)
    forecasts[name] = keep_forecast_only(stuff)
    

In [28]:
# current structure of forecasts:
# forecasts is a dict for each property type
# each property type is a dictionary suburb(concatenated) -> df[year, price]

# flatten the structure to have the following structure, for one year
# suburb_name, 1_bed_flat, 2_bed_flat, .., 2_bed_house, ...
# Name1, 134, 1234, 2342, 4255, 6453, ...
def flatten_prediction(forecasts, year=2027):
    columns = ['suburb']  # list(forecasts.keys())
    # columns.insert(0,'suburb')
    # [x.split('forecast_bed_')[1].split('_covid')[0] for x in forecasts.keys()]
    df = pd.DataFrame(columns=columns)
    df.suburb = pd.Series(forecasts['forecast_bed_1_flat_covid'].keys())
    # display(df)

    for prop_type, suburb_dict in forecasts.items():
        temp = []  # pd.Series(dtype='float64')
        for suburb, prediction in suburb_dict.items():
            #prediction = prediction.set_index('year')
            temp.append(prediction[prediction['year'] ==
                        year].Median_rental_price.values[0])
        df[prop_type] = temp

    # Keeping only informative part for the column names: forecast_bed_1_flat_covid-> 1_flat
    df.rename(lambda x:  'suburb' if not '_' in x else '_'.join(
        x.split('_')[2:4]), axis='columns', inplace=True)
    #df.colums = [str(x) for x in range(df.shape[1])]
    return df


df = flatten_prediction(forecasts)
#forecasts['forecast_bed_1_flat_covid']['Carrum, Patterson Lakes']
df


Unnamed: 0,suburb,1_flat,2_flat,3_flat,2_house,3_house,4_house
0,"St Kilda, St Kilda South, St Kilda West",472.677291,738.763641,772.946164,851.550577,949.046029,746.189755
1,"Armadale North, Armadale",600.609407,781.079020,1094.634765,801.974716,1089.720340,941.566670
2,"Carlton South, Carlton",649.294495,733.325816,831.293411,1117.571798,1419.028149,1234.722488
3,"Melbourne University, Parkville",466.104332,706.484733,826.162580,912.398678,879.269360,1409.081052
4,"Collingwood, Collingwood North",584.947822,610.556538,975.820153,851.758125,1019.088670,928.119986
...,...,...,...,...,...,...,...
85,"Dromana, Arthurs Seat, Safety Beach",588.112854,714.856024,720.593835,711.286320,814.436227,954.226327
86,"Karingal Centre, Karingal, Frankston Heights, ...",363.999545,505.686715,593.474999,510.176100,588.715498,591.208075
87,"Tuerong, Hastings",633.106063,772.521547,950.270761,723.024475,957.670521,1375.148953
88,Mornington,678.244101,791.031963,962.182640,965.823379,992.470310,1252.844441


In [6]:
with open('../data/curated/postcode_dict', 'rb') as f:
    code = pickle.load(f)
code

{'3000': 'Melbourne',
 '3002': 'East Melbourne',
 '3003': 'West Melbourne',
 '3004': 'St Kilda Road Melbourne, St Kilda Road Central, Melbourne',
 '3006': 'South Wharf, Southbank',
 '3008': 'Docklands',
 '3011': 'Seddon, Seddon West, Footscray',
 '3012': 'Brooklyn, Kingsville, Maidstone, Tottenham, West Footscray',
 '3013': 'Yarraville West, Yarraville',
 '3015': 'Spotswood, South Kingsville, Newport',
 '3016': 'Williamstown North, Williamstown',
 '3018': 'Seaholme, Altona',
 '3019': 'Braybrook, Robinson',
 '3020': 'Sunshine West, Sunshine, Sunshine North, Glengala, Albion',
 '3021': 'St Albans, Kings Park, Kealba, Albanvale',
 '3022': 'Deer Park East, Ardeer',
 '3023': 'Cairnlea, Caroline Springs, Burnside, Burnside Heights, Ravenhall, Deer Park North, Deer Park',
 '3024': 'Mount Cottrell, Mambourin, Wyndham Vale',
 '3025': 'Altona East, Altona Gate, Altona North',
 '3026': 'Laverton North',
 '3027': 'Williams Landing',
 '3028': 'Laverton, Altona Meadows, Seabrook',
 '3029': 'Truganin

In [7]:
data = requests.get('https://data.gov.au/geoserver/vic-suburb-locality-boundaries-psma-administrative-boundaries/wfs?request=GetFeature&typeName=ckan_af33dd8c_0534_4e18_9245_fc64440f742e&outputFormat=json', headers={"Content-Type":"application/json"})
geodata = data.json()

with open('../data/raw/geodata.json', 'w', encoding='utf-8') as f:
    json.dump(geodata, f, indent=4)

In [29]:
suburb_gdf = gpd.read_file("../data/raw/Suburb Shapes/vic_localities.shp")[["LOC_NAME", "geometry"]]
suburb_gdf

Unnamed: 0,LOC_NAME,geometry
0,Abbeyard,"POLYGON ((146.81722 -37.09734, 146.81729 -37.0..."
1,Abbotsford,"POLYGON ((145.00235 -37.80722, 145.00350 -37.8..."
2,Aberfeldie,"POLYGON ((144.89830 -37.76464, 144.89790 -37.7..."
3,Aberfeldy,"POLYGON ((146.39448 -37.71006, 146.39405 -37.7..."
4,Acheron,"POLYGON ((145.75030 -37.24312, 145.75037 -37.2..."
...,...,...
2968,Yundool,"POLYGON ((145.85808 -36.26994, 145.85813 -36.2..."
2969,Yuroke,"POLYGON ((144.87771 -37.58470, 144.87867 -37.5..."
2970,Yuulong,"POLYGON ((143.29976 -38.75114, 143.29895 -38.7..."
2971,Zeerust,"POLYGON ((145.38005 -36.26997, 145.37897 -36.2..."


In [30]:
# split each suburb into its own row instead of being string concatenated, 
# so that it is easy to join with the suburb gdf
df = df.assign(temp=df['suburb'].str.split(', ')).explode('temp').drop(['suburb'], axis=1).rename(columns={'temp':'suburb'}).reset_index(drop=True)
df.shape

(285, 7)

In [31]:
# there is around 85 suburbs that get lost unfortunately

suburb_gdf = suburb_gdf.join(df.set_index('suburb'), on='LOC_NAME').dropna(how='any').rename(columns={'LOC_NAME':'suburb'})
# in case the thing is to heavy, dumb the shapes down
# suburb_gdf['geometry'] = suburb_gdf['geometry'].simplify(0.05, preserve_topology=True) # if we want to reduce precision of each polygon to make it quicker to display, 0.05 is the reducing coeficitient or whateve


In [32]:
#suburb_gdf.crs
#suburb_gdf.set_crs('epsg:4283')
suburb_gdf['geometry'] = suburb_gdf['geometry'].to_crs(epsg=3857)

In [33]:
suburb_gdf

Unnamed: 0,suburb,geometry,1_flat,2_flat,3_flat,2_house,3_house,4_house
2,Aberfeldie,"POLYGON ((16130005.456 -4546230.078, 16129960....",535.523477,622.561584,762.428562,695.236814,710.218297,1228.047212
16,Albanvale,"POLYGON ((16115004.680 -4544270.498, 16114986....",609.463223,721.114205,783.804184,783.142359,726.164035,952.012933
20,Albion,"POLYGON ((16122170.973 -4548646.863, 16122157....",516.149794,586.149794,636.149794,644.396234,626.149794,798.840657
33,Alphington,"POLYGON ((16143655.475 -4549374.647, 16143625....",547.514881,822.208627,943.971852,1016.225464,1083.552571,1308.159467
34,Altona,"POLYGON ((16118043.731 -4561113.120, 16118072....",658.782387,827.242931,776.657682,758.714731,670.445650,710.445650
...,...,...,...,...,...,...,...,...
2819,Williamstown,"POLYGON ((16130594.194 -4559808.350, 16130595....",829.644948,812.615722,733.137611,785.278124,1061.590611,1033.333298
2820,Williamstown North,"POLYGON ((16126041.992 -4558975.748, 16125494....",829.644948,812.615722,733.137611,785.278124,1061.590611,1033.333298
2832,Windsor,"POLYGON ((16141560.036 -4558956.714, 16141548....",552.742192,603.981298,847.890751,823.729411,947.890751,1119.003450
2885,Woori Yallock,"POLYGON ((16204114.208 -4550901.618, 16204124....",759.501781,786.339200,675.869806,773.176620,817.840032,1039.227905


In [77]:
suburb_gdf[suburb_gdf['2_house'].gt(-1)]['2_house'].min()

484.38199730659636

In [113]:
# Set bokeh to save file
def create_map(gdf, plotted_price='1_flat', filename="../plots/actual_data.html"):
    # todo, add year and the preproc fn call to here
    output_file(filename, title="Actual data plotting suburb")

    # convert to int, float precision not needed
    cols = ['1_flat', '2_flat', '3_flat', '2_house', '3_house', '4_house']
    for col in cols:
        gdf[col] = gdf[col].astype('int')

    geo_source = GeoJSONDataSource(geojson=gdf.to_json())

    # giving the basic view frame of the map
    tile_provider = get_provider(CARTODBPOSITRON)

    # range bounds supplied in web mercator coordinates
    x_range = (16075000.0, 16225000.0)
    y_range = (-4635000.0, -4485000.0)
    p = figure(x_range=x_range, y_range=y_range,
               x_axis_type="mercator", y_axis_type="mercator",
               plot_height=600, plot_width=600,
               match_aspect=True,)
    p.add_tile(tile_provider)
    p.title.text = f"Predicted median price per suburb for {plotted_price.split('_')[0]} bed {plotted_price.split('_')[1]} (AUD)"
    p.title.align = "center"
    p.title.text_font_size = "18px"

    # Instantiate LinearColorMapper that linearly maps numbers in a range, into a sequence of colors.
    color_mapper = LinearColorMapper(
        palette='Plasma256', low=gdf[gdf[plotted_price].gt(-1)][plotted_price].min(), high=gdf[plotted_price].max())

    #print(f"used min max is {gdf[gdf['2_house'].gt(-1)]['2_house'].min()}, {gdf['2_house'].max()}")

    color_bar = ColorBar(color_mapper=color_mapper,
                         label_standoff=8,
                         width=20, height=600,
                         border_line_color=None,
                         location=(0, 0),
                         orientation='vertical')
    # major_label_overrides = tick_labels, could be used to set custom tickers, tick_labels is a dict value->display value

    # Add patch renderer to figure.
    suburbs = p.patches('xs', 'ys', source=geo_source,
                        # here comes the transform function
                        fill_color={'field': plotted_price,
                                    'transform': color_mapper},
                        line_color='gray',
                        line_width=0.25,
                        fill_alpha=0.75)

    # add tooltips as needed for other prediction values
    tooltips = [('Suburb', '@suburb')]
    tooltips.extend([(f"2027 prediction {x.split('_')[0]} bed {x.split('_')[1]}", f'@{x}')
                    for x in filter(lambda x: '_' in x, [x for x in suburb_gdf.columns])])
    # ('2027 prediction 1flat', '@1_flat')]
    p.add_tools(HoverTool(renderers=[suburbs],
                          tooltips=tooltips))

    p.add_layout(color_bar, 'right')

    show(p)  # this ; prevents vscode of showing output, output breaks things, todo to be fixef


create_map(suburb_gdf)


Start : This command cannot be run due to the error: The system cannot find the file specified.
At line:1 char:1
+ Start "file:///home/toomas/ADS/generic-real-estate-consulting-project ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    + CategoryInfo          : InvalidOperation: (:) [Start-Process], InvalidOperationException
    + FullyQualifiedErrorId : InvalidOperationException,Microsoft.PowerShell.Commands.StartProcessCommand
 


In [114]:
import sys 
sys.path.append('../scripts/')
import create_map


In [115]:
create_map.create_map()

Start : This command cannot be run due to the error: The system cannot find the file specified.
At line:1 char:1
+ Start "file:///home/toomas/ADS/generic-real-estate-consulting-project ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    + CategoryInfo          : InvalidOperation: (:) [Start-Process], InvalidOperationException
    + FullyQualifiedErrorId : InvalidOperationException,Microsoft.PowerShell.Commands.StartProcessCommand
 


## BELOW HERE IS NOT ACTUAL ANYMORE

In [171]:
gdf = gpd.read_file('../data/raw/geodata.json')
gdf.set_crs('epsg:4283')
# map tile providers provide only a few projections, so casting to that specific projection
# WGS84 (EPSG:4326) or Web Mercator (EPSG:3857) are usually available 
# source: https://kodu.ut.ee/~kmoch/geopython2020/L6/interactive-map-bokeh.html
gdf['geometry'] = gdf['geometry'].to_crs(epsg=3857) 
gdf.head()


Unnamed: 0,id,lc_ply_pid,dt_create,dt_retire,loc_pid,vic_locali,vic_loca_1,vic_loca_2,vic_loca_3,vic_loca_4,vic_loca_5,vic_loca_6,vic_loca_7,geometry
0,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.1,6670,2011-08-31,,VIC2615,2012-04-27,,UNDERBOOL,,,G,,2,"MULTIPOLYGON (((15779040.302 -4173707.184, 157..."
1,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.2,6671,2011-08-31,,VIC1986,2012-04-27,,NURRAN,,,G,,2,"MULTIPOLYGON (((16549732.035 -4494406.666, 165..."
2,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.3,6672,2011-08-31,,VIC2862,2012-04-27,,WOORNDOO,,,G,,2,"MULTIPOLYGON (((15910102.922 -4576437.989, 159..."
3,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.4,6673,2011-08-31,,VIC734,2018-08-03,,DEPTFORD,,,G,,2,"MULTIPOLYGON (((16455621.450 -4531505.374, 164..."
4,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.5,6674,2011-08-31,,VIC2900,2012-04-27,,YANAC,,,G,,2,"MULTIPOLYGON (((15727194.252 -4300425.394, 157..."


In [30]:
gdf['base'] = 100
gdf['1bedroom_pred'] = np.random.randint(75, 125, gdf.shape[0])
gdf['2bedroom_pred'] = np.random.randint(100, 150, gdf.shape[0])

gdf.head()

Unnamed: 0,id,lc_ply_pid,dt_create,dt_retire,loc_pid,vic_locali,vic_loca_1,vic_loca_2,vic_loca_3,vic_loca_4,vic_loca_5,vic_loca_6,vic_loca_7,geometry,base,1bedroom_pred,2bedroom_pred
0,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.1,6670,2011-08-31,,VIC2615,2012-04-27,,UNDERBOOL,,,G,,2,"MULTIPOLYGON (((15779040.302 -4173707.184, 157...",100,80,124
1,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.2,6671,2011-08-31,,VIC1986,2012-04-27,,NURRAN,,,G,,2,"MULTIPOLYGON (((16549732.035 -4494406.666, 165...",100,108,124
2,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.3,6672,2011-08-31,,VIC2862,2012-04-27,,WOORNDOO,,,G,,2,"MULTIPOLYGON (((15910102.922 -4576437.989, 159...",100,120,109
3,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.4,6673,2011-08-31,,VIC734,2018-08-03,,DEPTFORD,,,G,,2,"MULTIPOLYGON (((16455621.450 -4531505.374, 164...",100,112,110
4,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.5,6674,2011-08-31,,VIC2900,2012-04-27,,YANAC,,,G,,2,"MULTIPOLYGON (((15727194.252 -4300425.394, 157...",100,81,115


In [31]:
#gdf['geometry'] = gdf['geometry'].simplify(0.05, preserve_topology=True) # if we want to reduce precision of each polygon to make it quicker to display, 0.05 is the reducing coeficitient or whateve

In [32]:
gdf['postal_code'] = gdf['loc_pid'].apply(lambda x: int(str(x).split('VIC')[1]))
gdf = gdf[gdf['postal_code'].isin(range(3000, 3792))]   # keep only greater Melbourne post indexes

In [34]:
# Set bokeh to save file
output_file("../plots/test.html", title="Reduced suburb plotting")

geo_source = GeoJSONDataSource(geojson=gdf.to_json())

# giving the basic view frame of the map
tile_provider = get_provider(CARTODBPOSITRON)
# range bounds supplied in web mercator coordinates
p = figure(x_range=(15600000, 16700000), y_range=(-5000000, -3900000),
           x_axis_type="mercator", y_axis_type="mercator",
           plot_height=600, plot_width=600)
p.add_tile(tile_provider)

# Define color palettes
palette = brewer['GnBu'][8]
# reverse order of colors so higher values have darker colors
palette = palette[::-1]

# Instantiate LinearColorMapper that linearly maps numbers in a range, into a sequence of colors.
# TODO, use those as min and max from data we plot
color_mapper = LinearColorMapper(palette=palette, low=75, high=125)

color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8, width=500, height=20,
                     border_line_color=None, location=(0, 0), orientation='horizontal')
# major_label_overrides = tick_labels, could be used to set custom tickers, tick_labels is a dict value->display value


# Add patch renderer to figure.
suburbs = p.patches('xs', 'ys', source=geo_source,
                    # here comes the transform function
                    fill_color={'field': '1bedroom_pred',
                                'transform': color_mapper},
                    line_color='gray',
                    line_width=0.25,
                    fill_alpha=0.2)

# add tooltips as needed for other prediction values
tooltips = [('Suburb', '@vic_loca_2'),
            ('2025 prediction', '@1bedroom_pred')]
p.add_tools(HoverTool(renderers=[suburbs],
                      tooltips=tooltips))

show(p);  # this ; prevents vscode of showing output, output breaks things, todo to be fixef


Start : This command cannot be run due to the error: The system cannot find the file specified.
At line:1 char:1
+ Start "file:///home/toomas/ADS/generic-real-estate-consulting-project ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    + CategoryInfo          : InvalidOperation: (:) [Start-Process], InvalidOperationException
    + FullyQualifiedErrorId : InvalidOperationException,Microsoft.PowerShell.Commands.StartProcessCommand
 


In [1]:
gdf.color = 'blue'
show(p);

NameError: name 'gdf' is not defined

In [9]:
from bokeh.io import show
from bokeh.models import CustomJS, RadioGroup

LABELS = ["Option 1", "Option 2", "Option 3"]

radio_group = RadioGroup(labels=LABELS, active=0)
radio_group.js_on_click(CustomJS(code="""
    console.log('radio_group: active=' + this.active, this.toString())
"""))

show(radio_group)


Start : This command cannot be run due to the error: The system cannot find the file specified.
At line:1 char:1
+ Start "file:///home/toomas/ADS/generic-real-estate-consulting-project ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    + CategoryInfo          : InvalidOperation: (:) [Start-Process], InvalidOperationException
    + FullyQualifiedErrorId : InvalidOperationException,Microsoft.PowerShell.Commands.StartProcessCommand
 


In [10]:
# might be needed
output_file("../plots/test2.html", title="Testing suburb plotting")

palette = brewer['RdBu'][8]
palette = palette[::-1]

# Define the callback function: update_plot


def update_plot(attr, old, new):
    # The input cr is the criteria selected from the select box
    appartment_type = select.value

    # Update the plot based on the changed inputs
    p = make_plot(appartment_type)

    # Update the layout, clear the old document and display the new document
    layout = column(p, widgetbox(select))
    curdoc().clear()
    curdoc().add_root(layout)

    # Create a plotting function


def make_plot(field_name):
    # colorbar, todo make actual minmax
    min_range = 75
    max_range = 150

    color_mapper = LinearColorMapper(
        palette=palette, low=min_range, high=max_range)

    # Create color bar.
    color_bar = ColorBar(color_mapper=color_mapper,
                         border_line_color=None, location=(0, 0))

    # Create figure object.
    apt_type = gdf[field_name]

    p = figure(x_range=(15600000, 16700000), y_range=(-5000000, -3900000),
               x_axis_type="mercator", y_axis_type="mercator",
               plot_height=600, plot_width=600,
               title=field_name+' prediction for 2025')

    # Add patch renderer to figure.
    geo_source = GeoJSONDataSource(geojson=gdf.to_json())
    suburbs = p.patches('xs', 'ys', source=geo_source,
                        fill_color={'field': field_name,
                                    'transform': color_mapper},
                        line_color='black', line_width=0.25, fill_alpha=0.5)

    # Specify color bar layout.
    p.add_layout(color_bar, 'right')

    # Add the hover tool to the graph
    # add tooltips as needed for other prediction values
    tooltips = [('Suburb', '@vic_loca_2'),
                ('1 bedroom 2025 prediction', '@1bedroom_pred'),
                ('2 bedroom 2025 prediction', '@2bedroom_pred')]
    hover = HoverTool(renderers=[suburbs], tooltips=tooltips)

    p.add_tools(hover)
    return p


# Call the plotting function
input_field = '1bedroom_pred'
p = make_plot(input_field)


# Make a selection object: select
# add here other column names you want to plot
select = Select(title='Select apt type:', value='1bedroom_pred',
                options=['1bedroom_pred', '2bedroom_pred'])
select.on_change('value', update_plot)

# Make a column layout of select and plot, and add it to the current document
# Display the current document
layout = column(p, select)
curdoc().add_root(layout)


# output_notebook()
show(p)


Start : This command cannot be run due to the error: The system cannot find the file specified.
At line:1 char:1
+ Start "file:///home/toomas/ADS/generic-real-estate-consulting-project ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    + CategoryInfo          : InvalidOperation: (:) [Start-Process], InvalidOperationException
    + FullyQualifiedErrorId : InvalidOperationException,Microsoft.PowerShell.Commands.StartProcessCommand
 
