In [1]:
# findspark is a separate package: install it with `pip install findspark`.
import findspark
# Runs before any pyspark import in this notebook; per the findspark docs this
# is what makes the local Spark installation importable.
findspark.init()

In [2]:
from pyspark.sql import SQLContext
from pyspark import SparkContext

In [3]:
# Single local Spark context for this notebook, named "MyShell".
# While it is running, the Spark UI is served at http://localhost:4040/jobs/
sc = SparkContext(master="local", appName="MyShell")
# SQL entry point bound to that context; used below to read JSON and run queries.
sqlContext = SQLContext(sparkContext=sc)

In [25]:
# Load the raw sensor reads from the JSON source 'HAUGHWOUT_MAC_BELLFOREST'
# (path is relative to the notebook's working directory).
df0 = sqlContext.read.json('HAUGHWOUT_MAC_BELLFOREST')
# Expose the frame to Spark SQL under the table name 'df0'.
# NOTE(review): registerTempTable is deprecated as of Spark 2.0 in favour of
# createOrReplaceTempView - switch once the minimum Spark version is confirmed.
df0.registerTempTable('df0')
# Print the inferred schema so the column names used in the query below are visible.
df0.printSchema()

root
 |-- accuracy: double (nullable = true)
 |-- asOfTimestamp: long (nullable = true)
 |-- event: string (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- lat: double (nullable = true)
 |    |-- lng: double (nullable = true)
 |-- sensorId: string (nullable = true)
 |-- wifiAccessPoints: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- channel: string (nullable = true)
 |    |    |-- macAddress: string (nullable = true)
 |    |    |-- signalStrength: long (nullable = true)



In [26]:
# Flatten the raw reads into a pandas DataFrame with one row per event:
# sensorId -> deviceId, event -> eventType, location.lat/lng -> latitude/longitude,
# and asOfTimestamp (interpreted by from_unixtime as seconds since the epoch)
# rendered as an ISO-8601 style string.
# df0 has no column called `timestamp`, so ORDER BY sorts on the formatted
# string alias; DESC puts the newest read in the first row.
# NOTE(review): from_unixtime formats in the Spark session/system time zone,
# while the pattern appends a literal 'Z' (UTC) - confirm the session runs in UTC.
reads = sqlContext.sql('''
SELECT
sensorId as deviceId,
event as eventType, 
from_unixtime(asOfTimestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'") as timestamp,
location.lat as latitude,
location.lng as longitude,
accuracy
FROM df0
ORDER BY timestamp DESC
''').toPandas()
# Show the ten most recent reads.
reads.head(10)

Unnamed: 0,deviceId,eventType,timestamp,latitude,longitude,accuracy
0,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T12:50:21Z,38.881007,-77.226957,59.0
1,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T12:35:17Z,38.88104,-77.226939,58.0
2,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T12:20:13Z,38.880992,-77.226939,57.0
3,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T12:05:09Z,38.880979,-77.226959,56.0
4,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T11:50:05Z,38.881029,-77.226933,64.0
5,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T11:35:01Z,38.881029,-77.226923,56.0
6,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T11:19:57Z,38.881035,-77.226919,63.0
7,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T11:04:53Z,38.881024,-77.226924,85.0
8,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T10:49:48Z,38.880998,-77.226941,45.0
9,32:00:16:53:d8:80,GeolocationUpdate,2017-08-17T10:34:44Z,38.881003,-77.226928,60.0


In [27]:
# Reference point as [latitude, longitude]; it is drawn on the map by geomap()
# and used as the point that calc_err_m() measures each read against.
MARKER = [38.881029, -77.227023]

In [28]:
import folium

def geomap(reads_df, marker=None):
    """Draw the reads on a folium map as a path plus one diamond marker.

    Args:
        reads_df: pandas DataFrame with 'latitude' and 'longitude' columns.
            Rows containing any missing value are dropped before plotting.
            When ``marker`` is omitted, the first remaining row is treated as
            the latest read, so the frame must be sorted newest-first.
        marker: optional ``[lat, lng]`` pair to highlight. When None (or
            empty), the latest read is highlighted instead.

    Returns:
        folium.Map centred on the median position and zoomed to fit all reads.

    Raises:
        ValueError: if no row of ``reads_df`` is free of missing values.
    """
    df = reads_df.dropna()
    if df.empty:
        # Without this guard the median/min/max below are NaN and the map is unusable.
        raise ValueError('reads_df has no complete rows to plot')

    mapa = folium.Map(location=[df['latitude'].median(), df['longitude'].median()])
    mapa.fit_bounds([[df['latitude'].min(), df['longitude'].min()],
                     [df['latitude'].max(), df['longitude'].max()]])
    mapa.add_child(folium.PolyLine(list(zip(df.latitude, df.longitude))))

    # `is None` / len() instead of truthiness so tuples and numpy arrays work too.
    if marker is not None and len(marker) > 0:
        marker_location = marker
        label = 'MARKER: (%.4f, %.4f)' % (marker_location[0], marker_location[1])
    else:
        # Row 0 is the newest read (the frame is sorted newest-first); the
        # previous iloc[1] picked the second-newest and failed on one-row frames.
        latest = df.iloc[0]
        marker_location = [float(latest['latitude']), float(latest['longitude'])]
        label = 'LATEST LOCATION: (%.4f, %.4f)' % (marker_location[0], marker_location[1])

    folium.RegularPolygonMarker(location=marker_location, popup=label, fill_color='blue',
                                number_of_sides=4, radius=8).add_to(mapa)
    return mapa

geomap(reads, MARKER)  # If you exclude MARKER, the diamond is the latest location

In [29]:
# Calculate the Standard Error of Prediction (root-mean-square distance from MARKER) - a good measure of sensor accuracy
try:
    from geopy.distance import vincenty
except ImportError:
    # geopy 2.0 removed `vincenty`; `geodesic` is its documented replacement.
    # The old name is kept bound so any other cell using it keeps working.
    from geopy.distance import geodesic as vincenty


def calc_err_m(lat, lng, marker=None):
    """Return the ellipsoidal distance in metres from a read to a reference point.

    Args:
        lat: latitude of the read.
        lng: longitude of the read.
        marker: optional ``(lat, lng)`` reference point. Defaults to the
            notebook-level ``MARKER`` when omitted.

    Returns:
        Distance between ``(lat, lng)`` and the reference point, in metres.
    """
    reference = MARKER if marker is None else marker
    return vincenty((lat, lng), reference).m


from math import sqrt
def calc_sep(df_col):
    """Root-mean-square of the errors: sqrt(sum of squared errors / number of reads)."""
    squared_errors = df_col * df_col
    mean_squared_error = squared_errors.sum() / len(df_col)
    return sqrt(mean_squared_error)


def compute_error(reads_df):
    """Summarise how far the reads fall from MARKER.

    Rows missing a latitude or longitude are ignored. The result is the
    ``describe()`` table of the per-read distance in metres, with one extra
    'accuracy' row holding the root-mean-square distance from ``calc_sep``.
    """
    coords = reads_df[['latitude', 'longitude']].dropna()

    def _distance_for(row):
        # Distance in metres from this read to MARKER.
        return calc_err_m(row['latitude'], row['longitude'])

    coords['err_meters'] = coords.apply(_distance_for, axis=1)
    stats = coords[['err_meters']].describe()
    # Expected error in meters
    stats.loc['accuracy'] = calc_sep(coords['err_meters'])
    return stats

compute_error(reads)

Unnamed: 0,err_meters
count,12.0
mean,8.0987
std,0.925298
min,6.207206
25%,7.683298
50%,8.134828
75%,8.698316
max,9.57635
accuracy,8.14701
