#### Summary
*Using WiFi-only geolocation in a congested area gives an indoor accuracy of +/- 8 m. This is far better than the commercial GPS sensor standard. It is interesting to see that, as with GPS, the error correlates with a higher level of WiFi signal received through windows vs. walls.*

In [1]:
# Bootstrap step for the Spark cells that follow.
# NOTE(review): findspark.init() is expected to locate the local Spark install and make
# `pyspark` importable, so this cell presumably has to run before the pyspark imports
# in the next cell - confirm against the findspark documentation.
import findspark # You will need to pip install findspark
findspark.init()

In [2]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [3]:
# Create the local Spark entry points used by the rest of the notebook.
# getOrCreate() returns the already-running SparkContext when this cell is executed
# again in the same kernel; calling the SparkContext constructor a second time would
# raise "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setMaster("local").setAppName("MyShell")
sc = SparkContext.getOrCreate(spark_conf)  # Spark UI at http://localhost:4040/jobs/
sqlContext = SQLContext(sc)

In [4]:
# Load the raw JSON sensor reads into a Spark DataFrame.
# NOTE(review): 'HAUGHWOUT_MAC_BELLFOREST' is a relative path - presumably a file or
# directory of JSON records sitting next to this notebook; confirm.
df0 = sqlContext.read.json('HAUGHWOUT_MAC_BELLFOREST')
# Register the frame under the name 'df0' so the SQL query further down can use FROM df0.
df0.registerTempTable('df0')
# Show the schema Spark produced, including the nested `location` struct.
df0.printSchema()

root
 |-- accuracy: double (nullable = true)
 |-- asOfTimestamp: long (nullable = true)
 |-- event: string (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- lat: double (nullable = true)
 |    |-- lng: double (nullable = true)
 |-- sensorId: string (nullable = true)
 |-- wifiAccessPoints: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- channel: string (nullable = true)
 |    |    |-- macAddress: string (nullable = true)
 |    |    |-- signalStrength: long (nullable = true)



In [5]:
# Flatten the nested records into one row per read, rename the columns, and collect the
# result into a pandas DataFrame named `reads`.
# Rows come back newest-first: the query orders by the formatted timestamp string in
# descending order, and that string sorts chronologically because its fields run from
# year down to second.
# NOTE(review): the format pattern appends a literal 'Z' (the UTC designator), but
# from_unixtime presumably renders the time in the Spark session time zone - confirm the
# session runs in UTC, otherwise the 'Z' suffix is misleading.
# NOTE(review): asOfTimestamp is assumed to be epoch seconds (what from_unixtime
# expects) - confirm against the data source.
reads = sqlContext.sql('''
SELECT
sensorId as deviceId,
event as eventType, 
from_unixtime(asOfTimestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'") as timestamp,
location.lat as latitude,
location.lng as longitude,
accuracy
FROM df0
ORDER BY timestamp DESC
''').toPandas()
# Preview the ten most recent reads.
reads.head(10)

Unnamed: 0,deviceId,eventType,timestamp,latitude,longitude,accuracy
0,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T04:35:24Z,38.881004,-77.226941,61.0
1,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T04:20:19Z,38.880996,-77.226938,44.0
2,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T04:05:16Z,38.880999,-77.226985,35.0
3,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T03:50:12Z,38.880994,-77.226897,61.0
4,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T03:35:07Z,38.880989,-77.226937,38.0
5,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T03:20:03Z,38.88102,-77.226916,65.0
6,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T03:04:59Z,38.880992,-77.226928,43.0
7,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T02:49:55Z,38.880977,-77.226957,40.0
8,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T02:34:51Z,38.881013,-77.226945,61.0
9,32:00:16:53:d8:80,GeolocationUpdate,2017-08-18T02:19:47Z,38.880989,-77.226962,34.0


In [6]:
# Reference point as [latitude, longitude]; the distance-error calculation measures
# every read against it and geomap() can draw it on the map.
# NOTE(review): presumably the surveyed true position of the sensor - confirm.
MARKER = [38.881029, -77.227023]

In [7]:
import folium

def geomap(reads_df, marker=None):
    """Draw the reads as a connected path on a folium map and flag one point.

    Parameters
    ----------
    reads_df : pandas.DataFrame
        Must contain 'latitude' and 'longitude' columns. Rows with any missing
        value are dropped before plotting. When no ``marker`` is given the frame
        is expected to be ordered newest-first (as the query that builds
        ``reads`` does), because the first remaining row is labelled as the
        latest location.
    marker : sequence of two floats, optional
        ``[latitude, longitude]`` of a point to highlight. If omitted (or empty),
        the latest read is highlighted instead.

    Returns
    -------
    folium.Map
        Map centred on the median position and zoomed to fit all reads.
    """
    df = reads_df.dropna()
    mapa = folium.Map(location=[df['latitude'].median(), df['longitude'].median()])
    mapa.fit_bounds([[df['latitude'].min(), df['longitude'].min()],
                     [df['latitude'].max(), df['longitude'].max()]])
    mapa.add_child(folium.PolyLine(list(zip(df.latitude, df.longitude))))
    if marker:
        marker_location = marker
        label = 'MARKER: (%.4f, %.4f)' % (marker_location[0], marker_location[1])
    else:
        # The newest read is the FIRST row (position 0) of a newest-first frame.
        # Position 1 would be the second-newest read and would fail on a one-row frame.
        marker_location = list(df.iloc[0][['latitude', 'longitude']].values)
        label = 'LATEST LOCATION: (%.4f, %.4f)' % (marker_location[0], marker_location[1])
    # A four-sided regular polygon renders as the blue diamond on the map.
    folium.RegularPolygonMarker(location=marker_location, popup=label, fill_color='blue',
                                number_of_sides=4, radius=8).add_to(mapa)
    return mapa

geomap(reads, MARKER)  # If you exclude MARKER, the diamond is the latest location

In [8]:
# Calculate the Standard Error of Prediction - a good way to quantify sensor accuracy
try:
    from geopy.distance import vincenty
except ImportError:
    # geopy 2.0 removed `vincenty`; `geodesic` is its replacement. Binding it to the old
    # name keeps this notebook running on both old and new geopy releases.
    from geopy.distance import geodesic as vincenty


def calc_err_m(lat, lng, marker=None):
    """Return the distance in meters between a read and a reference point.

    Parameters
    ----------
    lat, lng : float
        Latitude and longitude of the read.
    marker : sequence of two floats, optional
        ``[latitude, longitude]`` of the reference point. Defaults to the
        module-level ``MARKER``.

    Returns
    -------
    float
        Ellipsoidal distance in meters (Vincenty's formula on geopy releases
        that still ship it, geodesic otherwise).
    """
    reference = MARKER if marker is None else marker
    return vincenty((lat, lng), reference).m


from math import sqrt
def calc_sep(df_col):
    """Root-mean-square of a column of errors.

    Squares every value, averages the squares over the number of reads,
    and returns the square root of that average.
    """
    squared = df_col * df_col
    mean_of_squares = squared.sum() / len(df_col)
    return sqrt(mean_of_squares)


def compute_error(reads_df):
    """Summarise how far the reported positions fall from the reference point.

    Takes the 'latitude' and 'longitude' columns of ``reads_df``, drops rows with
    a missing coordinate, computes each remaining row's error in meters via
    ``calc_err_m``, and returns the ``describe()`` table for those errors with
    one extra row, 'accuracy', holding the value returned by ``calc_sep``.
    """
    coords = reads_df[['latitude', 'longitude']].dropna()

    def _row_error(row):
        # Distance of one read from the reference point, in meters.
        return calc_err_m(row['latitude'], row['longitude'])

    coords['err_meters'] = coords.apply(_row_error, axis=1)
    stats = coords[['err_meters']].describe()
    # Expected error in meters
    stats.loc['accuracy'] = calc_sep(coords['err_meters'])
    return stats

compute_error(reads)

Unnamed: 0,err_meters
count,54.0
mean,8.035156
std,1.352839
min,4.742524
25%,7.113893
50%,8.0508
75%,8.71011
max,11.574066
accuracy,8.146165
