In [525]:
from bs4 import BeautifulSoup
import requests

import unicodedata
from csv import writer
import re

import pandas as pd
import numpy as np

import json


#spark session
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


import os
import re 

#folium
import geopandas as gpd
import re
import folium

In [526]:
# Create a spark session

spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

#read in the parquet
domain = spark.read.parquet('../data/raw/domain-website-data')


In [527]:
#drop NSW
domain = domain.filter(F.col('state') == 'VIC')

#drop column with too many missing values
domain = domain.drop('land_size','land_unit','is_rural','is_retirement','url').toPandas()

#filter property type, remove 'cat spcae' and 'Acreage / Semi-Rural'
selected_property_type = ['Apartment / Unit / Flat', 'Studio', 'Townhouse',
       'House', 'New House & Land', 'Semi-Detached', 'Villa', 'Terrace',
       'Acreage / Semi-Rural', 'New Apartments / Off the Plan', 'Duplex',
       'Unknown', 'Farm', 'Penthouse', 'Rural']

domain= domain.loc[domain['property_type'].isin(selected_property_type)]

In [528]:
#remove instance with 'carpark' in street name
def carpark_flag(s):
    pattern = re.compile('carpark|CP|car park|carspace')
    match = re.match(pattern,s)
    #o for non carpark, 1 for carpark
    if match is None:
        return 0
    else:
        return 1

In [529]:
def get_rent(s):
    pattern = re.compile('^( )*([0-9]*)( )*')
    num = re.sub('(,)?', '',s)
    num = re.sub('[^0-9]+', ' ',num)
    match = re.match(pattern,num)[0].replace(" ", "")

    if match !='':
        return match
    else:
        return 0

In [534]:
domain['street'] = domain['street'].str.lower()
domain['carpark_flag'] = [carpark_flag(i) for i in domain['street']]
domain['price_rent'] = [float(get_rent(i)) for i in domain['price']]
domain['coordinate'] = domain[['longitude','latitude']].values.tolist()

In [535]:
domain = domain[(domain.carpark_flag != 1) &
               (domain.price_rent > 40)]


In [653]:

domain_test = domain[domain['price_rent']<2000]
domain_test = spark.createDataFrame(domain_test) 

In [654]:
import openrouteservice

In [655]:
#sample input [[13.42731, 52.51088], [13.384116, 52.533558]]
# return dictionary of distance (m) and duration (s)
def foot_dict(two_coordinates_list):
    
    route = client.directions(
    coordinates=two_coordinates_list,
    profile='foot-walking',
    format='geojson',
    options={"avoid_features": ["steps"]},
    validate=False,
)
    distance_duration = route['features'][0]['properties']['summary']
    return distance_duration


def car_dict(two_coordinates_list):
    
    route = client.directions(
    coordinates=two_coordinates_list,
    profile='driving-car',
    format='geojson',
    validate=False,
)
    distance_duration = route['features'][0]['properties']['summary']
    return distance_duration






In [661]:
#get centroid for CBD (postcode 3000)

postcode_df = gpd.read_file("../data/raw/POSTCODE/POSTCODE_POLYGON.shp")
postcode_df['centroid'] = postcode_df['geometry'].centroid
CBD_centroid = postcode_df.loc[postcode_df['POSTCODE'] == '3000']['centroid']
CBD_centroid =[CBD_centroid.values.x[0],CBD_centroid.values.y[0]]


  postcode_df['centroid'] = postcode_df['geometry'].centroid


In [664]:
def car_CBD_dict(coordinate):
    
    route = client.directions(
    coordinates=[CBD_centroid,coordinate],
    profile='driving-car',
    validate=False,
)
    distance_duration = route['features'][0]['properties']['summary']
    return distance_duration



In [665]:
domain_test

property_id,street,suburb,state,postcode,latitude,longitude,price,bedrooms,bathrooms,parking,property_type,carpark_flag,price_rent,coordinate
13671355,104/300 swanston ...,MELBOURNE,VIC,3000,-37.8106079,144.964584,$550,2,1,0,Apartment / Unit ...,0,550.0,"[144.964584, -37...."
14868152,101/25-33 wills s...,MELBOURNE,VIC,3000,-37.8107758,144.956924,$400,1,1,0,Apartment / Unit ...,0,400.0,"[144.956924, -37...."
15572564,2108/288 spencer ...,MELBOURNE,VIC,3000,-37.8137741,144.952087,$350,1,1,0,Apartment / Unit ...,0,350.0,"[144.952087, -37...."
15831027,2201/288 spencer ...,MELBOURNE,VIC,3000,-37.8137741,144.952087,$400,1,1,0,Apartment / Unit ...,0,400.0,"[144.952087, -37...."
15996004,502/5 sutherland ...,MELBOURNE,VIC,3000,-37.8114166,144.960846,$430 per week,1,1,0,Apartment / Unit ...,0,430.0,"[144.960846, -37...."
16033929,4706/500 elizabet...,MELBOURNE,VIC,3000,-37.8072433,144.960281,$570 per week,2,2,0,Apartment / Unit ...,0,570.0,"[144.960281, -37...."
16046123,2413/220 spencer ...,MELBOURNE,VIC,3000,-37.8157158,144.952957,$450 per week,2,2,1,Apartment / Unit ...,0,450.0,"[144.952957, -37...."
16063063,3601/8 sutherland...,MELBOURNE,VIC,3000,-37.8112869,144.96106,$720 per week,3,2,0,Apartment / Unit ...,0,720.0,"[144.96106, -37.8..."
16069309,unit 1713/200 spe...,MELBOURNE,VIC,3000,-37.816227,144.953247,$440 per week,1,1,1,Apartment / Unit ...,0,440.0,"[144.953247, -37...."
16071670,1715/220 spencer ...,MELBOURNE,VIC,3000,-37.8157158,144.952957,$600,2,2,1,Apartment / Unit ...,0,600.0,"[144.952957, -37...."
