In [1]:
import pandas as pd
from pyspark.sql import functions as F
import numpy as np

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/09/12 15:38:50 WARN Utils: Your hostname, DESKTOP-1D7SN6N resolves to a loopback address: 127.0.1.1; using 172.23.1.215 instead (on interface eth0)
22/09/12 15:38:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/12 15:38:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
properties = pd.read_csv('../data/curated/properties.csv') 
properties = properties.reset_index()
sparkProperty=spark.createDataFrame(properties) 

### Calculate Distance to School

In [11]:
schools = pd.read_csv('../data/curated/schools.csv')
sparkSchool = spark.createDataFrame(schools)

In [13]:
#https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula

from math import cos, asin, sqrt, pi

def distance(lat1, lon1, lat2, lon2):
    p = pi/180
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    return 12742 * asin(sqrt(a)) #2*R*asin...
distanceUDF = F.udf(lambda a,b,c,d: distance(a,b,c,d))

In [44]:
school_distance = sparkProperty \
            .crossJoin(sparkSchool) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("school_lat"), F.col("school_long"))
                       )

In [47]:
# For each property, calculate its distance to every school
# then count for number of schools within 3km of each property
school_count = school_distance \
            .filter(F.col("distance") <= 3) \
            .groupBy(F.col("index")) \
            .count() \
            .join(sparkProperty, 'index', 'right') \
            .orderBy('index')

In [48]:
nearestSchool = school_distance \
            .groupBy(F.col('index')) \
            .agg({'distance': 'min'})

In [54]:
school_distance \
            .join(nearestSchool, 'index') \
            .filter(F.col("distance") == F.col("min(distance)")) \
            .select('index','school_lat', 'school_long', 'distance') \
            .toPandas()

22/09/12 16:05:09 WARN ExtractPythonUDFFromJoinCondition: The join condition:isnotnull(<lambda>(prop_lat#49, prop_long#50, school_lat#67, school_long#66)#798) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

Unnamed: 0,index,school_lat,school_long,distance
0,163,-37.68227,145.01150,0.41993489911442805
1,357,-38.17551,144.38277,1.503205163147383
2,509,-36.72058,144.25426,0.19287648807960583
3,216,-37.79838,144.89267,0.6771463834716006
4,472,-37.75310,144.92104,0.4410581494860552
...,...,...,...,...
9389,9358,-37.93846,145.02350,0.5555827925406773
9390,9326,-37.80519,144.95509,0.607354674226439
9391,9265,-37.81154,144.97056,0.7384529716809485
9392,8977,-37.81154,144.97056,0.8273281932123162


school_long,school_lat
146.6666,-38.61771
142.59039,-38.38628
143.47565,-37.0845
145.23472,-36.90137
145.21398,-37.74268
144.3421,-38.17067
144.44312,-37.67303
143.85394,-37.56401
146.88982,-36.12921
145.35442,-38.04409


In [16]:
# Fill propertys having no schools within 3km as 0
school_count[['count']] = school_count[['count']].fillna(0)

In [17]:
distanceSchool = pd.merge(school_count, nearestSchool, how='inner', on = 'index')

In [18]:
distanceSchool = distanceSchool.rename(columns = {'count':'numSchool_3km', 'min(distance)': 'distance_to_nearest_school'})

In [19]:
distanceSchool = distanceSchool[['index', 'numSchool_3km', 'distance_to_nearest_school']]

In [20]:
properties = pd.merge(properties, distanceSchool, how='inner', on='index')

### Calculate Distance to Hospital

In [22]:
hospitals = pd.read_csv("../data/curated/hospitals_vic.csv")
sparkHospital = spark.createDataFrame(hospitals)

In [36]:
hospital_count = sparkProperty \
            .crossJoin(sparkHospital) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("Latitude"), F.col("Longitude"))
                       ) \
            .filter(F.col("distance") <= 3) \
            .groupBy(F.col("index")) \
            .count() \
            .join(sparkProperty, 'index', 'right') \
            .orderBy('index')

In [37]:
nearestHospital = sparkProperty \
            .crossJoin(sparkHospital) \
            .withColumn("distance", 
                        distanceUDF(F.col("prop_lat"), F.col("prop_long"), F.col("Latitude"), F.col("Longitude"))
                       ) \
            .groupBy(F.col('index')) \
            .agg({'distance': 'min'})

In [38]:
hospital_count

22/09/12 15:49:59 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cast(<lambda>(prop_lat#49, prop_long#50, Latitude#190, Longitude#191)#347 as int) <= 3) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

22/09/12 15:50:07 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cast(<lambda>(prop_lat#49, prop_long#50, Latitude#190, Longitude#191)#347 as int) <= 3) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


                                                                                

index,count,name,type,prop_lat,prop_long,Beds,Baths,Parking,weekly_cost,suburb
0,1,56 Selwyn Street ...,House,-37.7807821,144.8130199,3,2,2,300.0,3020
1,1,1/3 Robin Street ...,Apartment / Unit ...,-37.8690276,144.8072501,2,1,1,300.0,3018
2,1,34 Hucker Street ...,House,-37.27641089999999,142.920079,3,1,1,300.0,3377
3,1,3 Neylan Street A...,House,-37.292905,142.9218811,2,1,2,300.0,3377
4,1,8 Suspension Stre...,House,-37.7823611,144.8088328,2,1,2,300.0,3022
5,14,14/27 St Georges ...,Apartment / Unit ...,-37.860329,145.0220163,1,1,1,300.0,3143
6,8,1/28 Ormond Road ...,Apartment / Unit ...,-37.7730663,144.9303035,1,1,1,300.0,3032
7,1,30B San Remo Driv...,Apartment / Unit ...,-37.7573875,144.8691038,1,1,0,300.0,3034
8,3,205 Johns Street ...,House,-37.5570369,143.8794746,3,1,1,300.0,3350
9,1,14B Purcell St Be...,House,-36.5566188,145.973497,1,1,0,300.0,3672


In [25]:
# Fill propertys having no hospitals within 3km as 0
hospital_count[['count']] = hospital_count[['count']].fillna(0)

In [26]:
distanceHospital = pd.merge(hospital_count, nearestHospital, how='inner', on = 'index')

In [27]:
distanceHospital = distanceHospital.rename(columns = {'count':'numHospital_3km', 'min(distance)': 'distance_to_nearest_hospital'})

In [28]:
distanceHospital = distanceHospital[['index', 'numHospital_3km', 'distance_to_nearest_hospital']]

In [29]:
properties = pd.merge(properties, distanceHospital, how='inner', on='index')

In [30]:
properties

Unnamed: 0,index,name,type,prop_lat,prop_long,Beds,Baths,Parking,weekly_cost,suburb,numSchool_3km,distance_to_nearest_school,numHospital_3km,distance_to_nearest_hospital
0,0,56 Selwyn Street Albion VIC 3020,House,-37.780782,144.813020,3,2,2,300.0,3020,33.0,0.6501404158195666,1.0,10.83147814710251
1,1,1/3 Robin Street Altona VIC 3018,Apartment / Unit / Flat,-37.869028,144.807250,2,1,1,300.0,3018,12.0,0.6896526476051238,1.0,1.8653500382034183
2,2,34 Hucker Street Ararat VIC 3377,House,-37.276411,142.920079,3,1,1,300.0,3377,6.0,0.1333351425115027,1.0,1.1618697631779715
3,3,3 Neylan Street Ararat VIC 3377,House,-37.292905,142.921881,2,1,2,300.0,3377,6.0,0.7678326903981126,1.0,1.843199687372491
4,4,8 Suspension Street Ardeer VIC 3022,House,-37.782361,144.808833,2,1,2,300.0,3022,30.0,0.5862197076000492,1.0,10.834261807364065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9389,9389,31 Austin Road Seaford VIC 3198,House,-38.105231,145.134350,4,2,2,850.0,3198,15.0,0.34816382989589356,0.0,101.60528521826753
9390,9390,12 Morobe Street Sorrento VIC 3943,House,-38.354365,144.758948,5,2,0,850.0,3943,2.0,100.14569569048717,0.0,10.895777765396797
9391,9391,7/278 Kings Way South Melbourne VIC 3205,Townhouse,-37.833083,144.966977,3,2,1,850.0,3205,50.0,0.5303940155951052,26.0,0.47241226296551453
9392,9392,37 Cunningham Street South Yarra VIC 3141,House,-37.840129,144.996587,3,1,1,850.0,3141,58.0,0.29501884947033125,21.0,0.8452948276125342
