In [525]:
from bs4 import BeautifulSoup
import requests

import unicodedata
from csv import writer
import re

import pandas as pd
import numpy as np

import json


#spark session
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


import os
import re 

#folium
import geopandas as gpd
import re
import folium

In [526]:
# Build (or reuse, via getOrCreate) the Spark session used by this notebook.
# The two config entries are passed to the builder exactly as written.
spark = (
    SparkSession.builder
    .appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

# Load the raw Domain listings from parquet as a Spark DataFrame.
# The path is relative to the notebook's working directory.
domain = spark.read.parquet('../data/raw/domain-website-data')


In [527]:
# Keep only Victorian listings (this drops every other state, e.g. NSW).
domain = domain.filter(F.col('state') == 'VIC')

# Drop columns that are not used downstream (original note: too many missing
# values), then collect the Spark DataFrame into a pandas DataFrame.
domain = domain.drop('land_size','land_unit','is_rural','is_retirement','url').toPandas()

# Property types to keep; rows with any other property_type are removed
# (the original note mentions car spaces as the target).
# NOTE(review): the original comment also said 'Acreage / Semi-Rural' should be
# removed, but it is present in this keep-list — confirm which is intended.
selected_property_type = ['Apartment / Unit / Flat', 'Studio', 'Townhouse',
       'House', 'New House & Land', 'Semi-Detached', 'Villa', 'Terrace',
       'Acreage / Semi-Rural', 'New Apartments / Off the Plan', 'Duplex',
       'Unknown', 'Farm', 'Penthouse', 'Rural']

# .copy() gives an independent frame, so the column assignments made in later
# cells do not act on a slice of the unfiltered frame (SettingWithCopyWarning).
domain = domain.loc[domain['property_type'].isin(selected_property_type)].copy()

In [528]:
#remove instance with 'carpark' in street name
def carpark_flag(s):
    """Flag whether a street string looks like a car park listing.

    Parameters
    ----------
    s : str
        Street text to inspect. Non-string values (e.g. None or NaN from a
        missing street) are treated as "not a car park".

    Returns
    -------
    int
        1 if any of 'carpark', 'CP', 'car park' or 'carspace' occurs anywhere
        in ``s``, otherwise 0.

    Notes
    -----
    The pattern is case-sensitive, so 'CP' only matches upper-case input.
    """
    if not isinstance(s, str):
        return 0

    pattern = re.compile('carpark|CP|car park|carspace')
    # search() scans the whole string; match() would only test the very start,
    # missing values such as "12 smith st carpark".
    return 1 if pattern.search(s) else 0

In [529]:
def get_rent(s):
    """Extract the first number from a price string.

    Commas are stripped first (so "1,200" reads as "1200"), then the first
    run of ASCII digits is taken.

    Parameters
    ----------
    s : str
        Raw price text, e.g. "$450 per week".

    Returns
    -------
    str or int
        The first digit run as a string, or the integer 0 when the text
        contains no digits.
    """
    without_commas = re.sub(',', '', s)
    first_number = re.search('[0-9]+', without_commas)
    if first_number is None:
        return 0
    return first_number.group(0)

In [534]:
# Lower-case the street text before it is passed to carpark_flag.
domain['street'] = domain['street'].str.lower()

# 1 where the street text looks like a car park listing, else 0.
domain['carpark_flag'] = domain['street'].map(carpark_flag)

# Numeric rent parsed from the free-text price column.
domain['price_rent'] = domain['price'].map(lambda raw_price: float(get_rent(raw_price)))

# One [longitude, latitude] pair per listing.
domain['coordinate'] = domain[['longitude', 'latitude']].to_numpy().tolist()

In [535]:
# Keep rows that are not flagged as car parks and whose parsed rent is above 40.
domain = domain.loc[domain['carpark_flag'].ne(1) & domain['price_rent'].gt(40)]


In [537]:
# Subset of listings whose parsed rent is below 2000; display their coordinates.
domain_test = domain.loc[domain['price_rent'].lt(2000)]
domain_test.loc[:, 'coordinate']

0        [144.964584, -37.8106079]
1        [144.956924, -37.8107758]
2        [144.952087, -37.8137741]
3        [144.952087, -37.8137741]
4        [144.960846, -37.8114166]
                   ...            
15252     [145.127548, -37.052784]
15254     [146.327637, -38.324482]
15255    [145.363983, -37.8530846]
15256       [146.6155, -38.068634]
15257       [145.411179, -37.9051]
Name: coordinate, Length: 14733, dtype: object

In [498]:
import openrouteservice

In [517]:
def foot_dict(two_coordinates_list):
    """Request a walking route and return its summary.

    Parameters
    ----------
    two_coordinates_list : list
        Coordinates forwarded unchanged to ``client.directions``, e.g.
        ``[[13.42731, 52.51088], [13.384116, 52.533558]]``.

    Returns
    -------
    The ``summary`` entry of the first feature in the response. In this
    notebook it displays as a dict with 'distance' and 'duration' keys.

    Notes
    -----
    Uses the module-level ``client``, which is not defined in this function —
    presumably an openrouteservice client created in another cell; confirm it
    exists before calling.
    """
    request_kwargs = dict(
        coordinates=two_coordinates_list,
        profile='foot-walking',
        format='geojson',
        options={"avoid_features": ["steps"]},
        validate=False,
    )
    response = client.directions(**request_kwargs)

    # The summary sits on the first feature's properties.
    first_feature = response['features'][0]
    return first_feature['properties']['summary']

In [538]:
# Walking summary between the first two listings in domain_test
# (original note: duration in sec, distance in m).
# .iloc selects by position: domain_test is a filtered frame, so index labels
# 0 and 1 are not guaranteed to exist and label lookup could raise KeyError.
foot_dict(domain_test['coordinate'].iloc[:2].tolist())

{'distance': 833.1, 'duration': 599.8}

In [539]:
# Distance/duration matrix between the first two listings in domain_test.
# .iloc selects by position: domain_test is a filtered frame, so index labels
# 0 and 1 are not guaranteed to exist and label lookup could raise KeyError.
coordinates = domain_test['coordinate'].iloc[:2].tolist()

# NOTE(review): `client` is not defined in this cell — presumably an
# openrouteservice client created elsewhere; confirm before running.
matrix = client.distance_matrix(
    locations=coordinates,
    profile='foot-walking',
    metrics=['distance', 'duration'],
    validate=False,
)
matrix

{'durations': [[0.0, 599.82], [599.82, 0.0]],
 'distances': [[0.0, 833.1], [833.1, 0.0]],
 'destinations': [{'location': [144.964549, -37.810687],
   'snapped_distance': 9.34},
  {'location': [144.957112, -37.810721], 'snapped_distance': 17.61}],
 'sources': [{'location': [144.964549, -37.810687], 'snapped_distance': 9.34},
  {'location': [144.957112, -37.810721], 'snapped_distance': 17.61}],
 'metadata': {'attribution': 'openrouteservice.org | OpenStreetMap contributors',
  'service': 'matrix',
  'timestamp': 1662789191531,
  'query': {'locations': [[144.964584, -37.8106079],
    [144.956924, -37.8107758]],
   'profile': 'foot-walking',
   'responseType': 'json',
   'metricsStrings': ['DISTANCE', 'DURATION'],
   'metrics': ['distance', 'duration']},
  'engine': {'version': '6.7.0',
   'build_date': '2022-02-18T19:37:41Z',
   'graph_date': '2022-08-30T10:07:49Z'}}}

In [495]:

# Demo on four sample points in Berlin: a walking distance/duration matrix
# plus one map marker per point.
m = folium.Map(
    location=[52.521861, 13.40744],
    tiles='cartodbpositron',
    zoom_start=13,
)

# Sample points in Berlin, in the opposite component order to the map centre above.
coordinates = [
    [13.384116, 52.533558],
    [13.428726, 52.519355],
    [13.41774, 52.498929],
    [13.374825, 52.496369],
]

# NOTE(review): `client` is not defined in this cell — presumably an
# openrouteservice client created elsewhere; confirm before running.
matrix = client.distance_matrix(
    locations=coordinates,
    profile='foot-walking',
    metrics=['distance', 'duration'],
    validate=False,
)

# Each pair is reversed so the marker location uses the same component order
# as the map centre.
for marker in coordinates:
    folium.Marker(location=marker[::-1]).add_to(m)

print("Durations in secs: {}\n".format(matrix['durations']))
print("Distances in m: {}".format(matrix['distances']))

m

Durations in secs: [[0.0, 2758.8, 3692.7, 3732.83], [2758.8, 0.0, 1986.95, 3954.29], [3692.7, 1986.95, 0.0, 2510.81], [3732.83, 3954.29, 2510.81, 0.0]]

Distances in m: [[0.0, 3831.75, 5128.85, 5184.59], [3831.75, 0.0, 2759.72, 5492.19], [5128.85, 2759.72, 0.0, 3487.3], [5184.59, 5492.19, 3487.3, 0.0]]
