In [1]:
import pandas as pd
from pyspark.sql import functions as F
import numpy as np
from math import cos, asin, sqrt, pi

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/09/17 12:23:56 WARN Utils: Your hostname, DESKTOP-1D7SN6N resolves to a loopback address: 127.0.1.1; using 172.24.50.100 instead (on interface eth0)
22/09/17 12:23:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/17 12:23:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# The goal is to find the number of each type of facilities near each property
# and the distance to the nearest facilities
# Therefore we need an individual index for each property
# Furthermore, we only need properties' coordinate to calculate distances

# THIS NOTEBOOK CALCULATE DISTANCE ACCORDING TO STRAIGHT LINE DISTANCE
# THE RESULT WILL BE FURTHER PROCESSED TO GET A MORE ACCURACTE APPROXIMATION VIA API
properties = pd.read_csv('../data/curated/properties.csv') 
properties = properties.reset_index()
properties = properties[['index', 'prop_lat', 'prop_long']]
sparkProperty=spark.createDataFrame(properties) 

In [8]:
"""
Calculate distance between two points, the calculation is based on haversine formula
Reference of implementation:
https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula

param: latitude, longitude of the two positions
return: the distance in km between the two positions
"""
EQUATOR_DIAMETER = 12742
def distance(lat1, lon1, lat2, lon2):
    p = pi/180
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    return EQUATOR_DIAMETER * asin(sqrt(a)) 
distanceUDF = F.udf(lambda a,b,c,d: distance(a,b,c,d))

### Calculate Distance to School

In [9]:
schools = pd.read_csv('../data/curated/schools.csv')
sparkSchool = spark.createDataFrame(schools)

In [12]:
# For each property, calculate its distance to every school
school_distance = sparkProperty \
            .crossJoin(sparkSchool) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("school_lat"), F.col("school_long"))
                       )

In [13]:
# Count for number of schools within 3km of each property
school_count = school_distance \
            .filter(F.col("distance") <= 3) \
            .groupBy(F.col("index")) \
            .count() \
            .join(sparkProperty, 'index', 'right') \
            .orderBy('index') \
            .toPandas()

22/09/17 12:24:53 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cast(<lambda>(prop_lat#1, prop_long#2, school_lat#7, school_long#6)#15 as int) <= 3) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [14]:
# Find the nearest distance to school for each property
nearestSchool = school_distance \
            .groupBy(F.col('index')) \
            .agg({'distance': 'min'})

In [15]:
# Find the nearest school for each property according to distance
nearestSchoolCoord = school_distance \
            .join(nearestSchool, 'index') \
            .filter(F.col("distance") == F.col("min(distance)")) \
            .select('index','school_lat', 'school_long', 'distance') \
            .toPandas()

22/09/17 12:25:17 WARN ExtractPythonUDFFromJoinCondition: The join condition:isnotnull(<lambda>(prop_lat#1, prop_long#2, school_lat#7, school_long#6)#15) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [16]:
# Fill propertys having no schools within 3km as 0
school_count[['count']] = school_count[['count']].fillna(0)

In [17]:
# Get the number of school, and the position of nearest school
distanceSchool = pd.merge(school_count, nearestSchoolCoord, how='inner', on = 'index')

In [18]:
distanceSchool = distanceSchool.rename(columns = {'count':'numSchool_3km', 'distance': 'distance_school'})

In [19]:
distanceSchool = distanceSchool[['index', 'numSchool_3km', 'school_lat', 'school_long', 'distance_school']]

In [20]:
# Combine the distance to each property
properties = pd.merge(properties, distanceSchool, how='inner', on='index')

### Calculate Distance to Hospital

In [23]:
hospitals = pd.read_csv("../data/curated/hospitals_vic.csv")
sparkHospital = spark.createDataFrame(hospitals).select('Latitude', 'Longitude')

In [26]:
# For each property, calculate its distance to every hospital
hospital_distance = sparkProperty \
            .crossJoin(sparkHospital) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("Latitude"), F.col("Longitude"))
                       )

In [27]:
# Count for number of hospitals within 1km of each property
hospital_count = hospital_distance \
            .filter(F.col("distance") <= 1) \
            .groupBy(F.col("index")) \
            .count() \
            .join(sparkProperty, 'index', 'right') \
            .orderBy('index') \
            .toPandas()

22/09/17 12:27:39 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cast(<lambda>(prop_lat#1, prop_long#2, Latitude#94, Longitude#95)#127 as int) <= 1) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [28]:
# Find the nearest distance to hospital for each property
nearestHospital = hospital_distance \
            .groupBy(F.col('index')) \
            .agg({'distance': 'min'})

In [29]:
# Find the nearest hospital for each property according to distance
nearestHospitalCoord = hospital_distance \
            .join(nearestHospital, 'index') \
            .filter(F.col("distance") == F.col("min(distance)")) \
            .select('index','Latitude', 'Longitude', 'distance') \
            .toPandas()

22/09/17 12:27:47 WARN ExtractPythonUDFFromJoinCondition: The join condition:isnotnull(<lambda>(prop_lat#1, prop_long#2, Latitude#94, Longitude#95)#127) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [30]:
# Fill propertys having no hospitals within 1km as 0
hospital_count[['count']] = hospital_count[['count']].fillna(0)

In [31]:
# Get the number of Hospital, and the position of nearest hospital
distanceHospital = pd.merge(hospital_count, nearestHospitalCoord, how='inner', on = 'index')

In [32]:
distanceHospital = distanceHospital.rename(columns = {'count':'numHospital_1km', 
                                                      'Latitude': 'hospital_lat', 
                                                      'Longitude':'hospital_long',
                                                      'distance': 'distance_hospital'})

In [33]:
distanceHospital = distanceHospital[['index', 'numHospital_1km', 'hospital_lat', 'hospital_long', 'distance_hospital']]

In [34]:
# Combine the distance to each property
properties = pd.merge(properties, distanceHospital, how='inner', on='index')

In [38]:
properties

Unnamed: 0,index,prop_lat,prop_long,numSchool_3km,school_lat,school_long,distance_school,numHospital_1km,hospital_lat,hospital_long,distance_hospital
0,0,-37.780782,144.813020,33.0,-37.78219,144.82020,0.6501404158195666,0.0,-37.693365,144.758677,10.83147814710251
1,1,-37.869028,144.807250,12.0,-37.86390,144.81167,0.6896526476051238,1.0,-37.869448,144.828494,1.8653500382034183
2,2,-37.276411,142.920079,6.0,-37.27736,142.92100,0.1333351425115027,1.0,-37.278816,142.932858,1.1618697631779715
3,3,-37.292905,142.921881,6.0,-37.28600,142.92196,0.7678326903981126,1.0,-37.278816,142.932858,1.843199687372491
4,4,-37.782361,144.808833,30.0,-37.78753,144.80752,0.5862197076000492,0.0,-37.693365,144.758677,10.834261807364065
...,...,...,...,...,...,...,...,...,...,...,...
9115,9077,-38.105231,145.134350,15.0,-38.10444,145.13050,0.34816382989589356,0.0,-38.243369,143.985357,101.60528521826753
9116,9078,-38.354365,144.758948,2.0,-37.46289,144.59658,100.14569569048717,0.0,-38.361748,144.883554,10.895777765396797
9117,9079,-37.833083,144.966977,50.0,-37.83589,144.97186,0.5303940155951052,2.0,-37.834246,144.972151,0.47241226296551453
9118,9080,-37.840129,144.996587,58.0,-37.83762,144.99768,0.29501884947033125,6.0,-37.847714,144.997225,0.8452948276125342


### Calculate Distance to Train Stations

In [40]:
stations = pd.read_csv('../data/curated/traffic_dataset.csv')
sparkStation = spark.createDataFrame(stations).select('LATITUDE', 'LONGITUDE')

In [41]:
# For each property, calculate its distance to every train station
station_distance = sparkProperty \
            .crossJoin(sparkStation) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("LATITUDE"), F.col("LONGITUDE"))
                       )

In [42]:
# Count for number of stations within 1km of each property
station_count = station_distance \
            .filter(F.col("distance") <= 1) \
            .groupBy(F.col("index")) \
            .count() \
            .join(sparkProperty, 'index', 'right') \
            .orderBy('index') \
            .toPandas()

22/09/17 12:28:59 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cast(<lambda>(prop_lat#1, prop_long#2, LATITUDE#218, LONGITUDE#219)#240 as int) <= 1) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [43]:
# Find the nearest distance to station for each property
nearestStation = station_distance \
            .groupBy(F.col('index')) \
            .agg({'distance': 'min'})

In [44]:
# Find the nearest station for each property according to distance
nearestStationCoord = station_distance \
            .join(nearestStation, 'index') \
            .filter(F.col("distance") == F.col("min(distance)")) \
            .select('index','LATITUDE', 'LONGITUDE', 'distance') \
            .toPandas()

22/09/17 12:29:07 WARN ExtractPythonUDFFromJoinCondition: The join condition:isnotnull(<lambda>(prop_lat#1, prop_long#2, LATITUDE#218, LONGITUDE#219)#240) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [45]:
# Fill propertys having no train stations within 1km as 0
station_count[['count']] = station_count[['count']].fillna(0)

In [46]:
# Get the number of stations, and the position of nearest station
distanceStation = pd.merge(station_count, nearestStationCoord, how='inner', on = 'index')


In [47]:
distanceStation = distanceStation.rename(columns = {'count':'numStation_1km', 
                                                      'LATITUDE': 'station_lat', 
                                                      'LONGITUDE':'station_long',
                                                      'distance': 'distance_station'})


In [48]:
distanceStation = distanceStation[['index', 'numStation_1km', 'station_lat', 'station_long', 'distance_station']]

In [49]:
# Combine the distance to each property
properties = pd.merge(properties, distanceStation, how='inner', on='index')

### Calculate Distance to Shopping Centres

In [51]:
shopping = pd.read_csv('../data/curated/shopping_centre.csv')
sparkShopping = spark.createDataFrame(shopping).select('latitude','longitude')

In [52]:
# For each property, calculate its distance to every shopping centre
shopping_distance = sparkProperty \
            .crossJoin(sparkShopping) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("latitude"), F.col("longitude"))
                       )

In [53]:
# Count for number of shopping centre within 3km of each property
shopping_count = shopping_distance \
            .filter(F.col("distance") <= 3) \
            .groupBy(F.col("index")) \
            .count() \
            .join(sparkProperty, 'index', 'right') \
            .orderBy('index') \
            .toPandas()

22/09/17 12:29:44 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cast(<lambda>(prop_lat#1, prop_long#2, latitude#312, longitude#313)#327 as int) <= 3) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [54]:
# Find the nearest distance to shopping centre for each property
nearestShopping = shopping_distance \
            .groupBy(F.col('index')) \
            .agg({'distance': 'min'})

In [55]:
# Find the nearest shopping centre for each property according to distance
nearestShoppingCoord = shopping_distance \
            .join(nearestShopping, 'index') \
            .filter(F.col("distance") == F.col("min(distance)")) \
            .select('index','latitude', 'longitude', 'distance') \
            .toPandas()

22/09/17 12:29:51 WARN ExtractPythonUDFFromJoinCondition: The join condition:isnotnull(<lambda>(prop_lat#1, prop_long#2, latitude#312, longitude#313)#327) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [56]:
# Fill propertys having no shopping centre within 3km as 0
shopping_count[['count']] = shopping_count[['count']].fillna(0)

In [57]:
# Get the number of shopping centres, and the position of nearest shopping centre
distanceShopping = pd.merge(shopping_count, nearestShoppingCoord, how='inner', on = 'index')

In [58]:
distanceShopping = distanceShopping.rename(columns = {'count':'numShopping_3km', 
                                                      'latitude': 'shopping_lat', 
                                                      'longitude':'shopping_long',
                                                      'distance': 'distance_shopping'})


In [59]:
distanceShopping = distanceShopping[['index', 'numShopping_3km', 'shopping_lat', 'shopping_long', 'distance_shopping']]

In [60]:
# Combine the distance to each property
properties = pd.merge(properties, distanceShopping, how='inner', on='index')

### Calculate Distance to Entertainment Facilities

In [61]:
facility = pd.read_csv('../data/curated/Facilites_list.csv')
facility = facility[['Latitude', 'Longitude']]

# some different facility shares same position, we decide to treat them as one facility
# This is because we need to ensure each property only has one nearest facility
# Otherwise inconsistency will occur in dataframes, resulting in more records
facility = facility.dropna().drop_duplicates()
sparkFacility = spark.createDataFrame(facility)

In [63]:
# For each property, calculate its distance to every entertainment facility
facility_distance = sparkProperty \
            .crossJoin(sparkFacility) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("Latitude"), F.col("Longitude"))
                       )

In [64]:
# Count for number of facilities within 1km of each property
facility_count = facility_distance \
            .filter(F.col("distance") <= 3) \
            .groupBy(F.col("index")) \
            .count() \
            .join(sparkProperty, 'index', 'right') \
            .orderBy('index') \
            .toPandas()

22/09/17 12:30:27 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cast(<lambda>(prop_lat#1, prop_long#2, Latitude#393, Longitude#394)#402 as int) <= 3) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [65]:
# Find the nearest distance to facility for each property
nearestFacility = facility_distance \
            .groupBy(F.col('index')) \
            .agg({'distance': 'min'})

In [66]:
# Find the nearest facility for each property according to distance
nearestFacilityCoord = facility_distance \
            .join(nearestFacility, 'index', 'inner') \
            .filter(F.col("distance") == F.col("min(distance)")) \
            .select('index','Latitude', 'Longitude', 'distance') \
            .orderBy('index') \
            .toPandas()

22/09/17 12:30:55 WARN ExtractPythonUDFFromJoinCondition: The join condition:isnotnull(<lambda>(prop_lat#1, prop_long#2, Latitude#393, Longitude#394)#402) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

In [67]:
# Fill propertys having no facilities within 3km as 0
facility_count[['count']] = facility_count[['count']].fillna(0)

In [68]:
# Get the number of facilities, and the position of nearest facility
distanceFacility = pd.merge(facility_count, nearestFacilityCoord, how='inner', on = 'index')

In [69]:
distanceFacility = distanceFacility.rename(columns = {'count':'numFacility_3km', 
                                                      'Latitude': 'facility_lat', 
                                                      'Longitude':'facility_long',
                                                      'distance': 'distance_facility'})


In [70]:
distanceFacility = distanceFacility[['index', 'numFacility_3km', 'facility_lat', 'facility_long', 'distance_facility']]

In [71]:
# Combine the distance to each property
properties = pd.merge(properties, distanceFacility, how='inner', on='index')

### Calculate Distance to Melbourne CBD

In [72]:
# We assume the coordinate of Melbourne Central is the position of CBD
CBD_LAT, CBD_LONG = -37.810454, 144.962379

In [73]:
properties['cbd_lat'] = CBD_LAT
properties['cbd_long'] = CBD_LONG

In [74]:
properties = spark \
        .createDataFrame(properties) \
        .withColumn('distance_CBD',
                    distanceUDF(F.col('prop_lat'), F.col('prop_long'), F.col('cbd_lat'), F.col('cbd_long'))) \
        .toPandas()

22/09/17 12:32:25 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [76]:
properties = properties.drop_duplicates(subset = 'index').set_index('index')

In [86]:
properties = properties.astype(float)

In [93]:
# Convert distance from m to km, this is for consistency with API output later
properties['distance_school'] *= 1000
properties['distance_hospital'] *= 1000
properties['distance_shopping'] *= 1000
properties['distance_station'] *= 1000
properties['distance_facility'] *= 1000
properties['distance_CBD'] *= 1000

In [96]:
properties.to_csv('../data/curated/distance_to_property.csv', index = False)

In [97]:
# Here is the result dataframe, it contains each property's index and its location
# also the number of different buildings near them, as well as the distance to nearest buildings
properties

Unnamed: 0_level_0,prop_lat,prop_long,numSchool_3km,school_lat,school_long,distance_school,numHospital_1km,hospital_lat,hospital_long,distance_hospital,...,shopping_lat,shopping_long,distance_shopping,numFacility_3km,facility_lat,facility_long,distance_facility,cbd_lat,cbd_long,distance_CBD
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-37.780782,144.813020,33.0,-37.78219,144.82020,650.140416,0.0,-37.693365,144.758677,10831.478147,...,-37.781870,144.831506,1629.119423,58.0,-37.781765,144.814164,148.508823,-37.810454,144.962379,13532.036617
1,-37.869028,144.807250,12.0,-37.86390,144.81167,689.652648,1.0,-37.869448,144.828494,1865.350038,...,-37.861908,144.687260,10562.834620,21.0,-37.873873,144.811386,649.677910,-37.810454,144.962379,15099.409296
2,-37.276411,142.920079,6.0,-37.27736,142.92100,133.335143,1.0,-37.278816,142.932858,1161.869763,...,-38.381549,142.520583,127796.151362,20.0,-37.279115,142.914920,546.596079,-37.810454,144.962379,189594.752847
3,-37.292905,142.921881,6.0,-37.28600,142.92196,767.832690,1.0,-37.278816,142.932858,1843.199687,...,-38.381549,142.520583,126076.628837,21.0,-37.292104,142.935676,1223.562625,-37.810454,144.962379,188858.155226
4,-37.782361,144.808833,30.0,-37.78753,144.80752,586.219708,0.0,-37.693365,144.758677,10834.261807,...,-37.781870,144.831506,1993.325580,55.0,-37.782572,144.805283,312.843990,-37.810454,144.962379,13848.330174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9077,-38.105231,145.134350,15.0,-38.10444,145.13050,348.163830,0.0,-38.243369,143.985357,101605.285218,...,-38.008072,145.086614,11583.859093,31.0,-38.101721,145.129820,556.271010,-37.810454,144.962379,36079.069306
9078,-38.354365,144.758948,2.0,-37.46289,144.59658,100145.695690,0.0,-38.361748,144.883554,10895.777765,...,-37.652833,145.518043,102511.257440,7.0,-38.362027,144.764383,974.923702,-37.810454,144.962379,63046.501597
9079,-37.833083,144.966977,50.0,-37.83589,144.97186,530.394016,2.0,-37.834246,144.972151,472.412263,...,-37.817970,144.969024,1690.020089,70.0,-37.839805,144.967048,747.533606,-37.810454,144.962379,2548.385841
9080,-37.840129,144.996587,58.0,-37.83762,144.99768,295.018849,6.0,-37.847714,144.997225,845.294828,...,-37.838366,144.996065,201.300973,85.0,-37.836983,144.995195,370.537663,-37.810454,144.962379,4462.630147
