In [1]:
import gpxpy, glob, geohash2
import pandas as pd

In [2]:
def get_df_of_gpx_file(val_filename):
    df_rtn_val = []
    try:
        gpx_file = open(val_filename, 'r')
        gpx = gpxpy.parse(gpx_file)
        for i in range(len(gpx.tracks)):
            for j in range(len(gpx.tracks[i].segments)):
                data = gpx.tracks[i].segments[j].points
                df = pd.DataFrame(columns=['Lon', 'Lat', 'Alt', 'Time'])
                for point in data:
                    df = df.append({'Lon': point.longitude, 'Lat': point.latitude, 'Alt': point.elevation, 'Time': str(point.time)},ignore_index=True)
                df['Track'] = i
                df['Segment'] = j
                df['Src'] = val_filename
                df_rtn_val.append(df)
        df_rtn_val = pd.concat(df_rtn_val)
    except:
        print('ERROR: Could not process file named = '+val_filename)
    return(df_rtn_val)

def get_hashes(df_src,val_precision=8,col_lat='Lat',col_lon='Lon'):
    df_src['Geohash'] = df_src.apply(lambda x: geohash2.encode(x[col_lat],x[col_lon],precision=val_precision),axis=1)
    return(df_src)

def get_df_unique_hashes(df_src,col_identifier='Src',col_geohash='Geohash'):
    vec_unique_runs = df_src[col_identifier].unique()
    df_geohashes = pd.DataFrame(columns=[col_identifier,'Unique'+col_geohash])
    for i in vec_unique_runs:
        df_tmp = df_src[df_src[col_identifier]==i].copy()
        tmp_unique_geohashes = df_tmp[col_geohash].unique()
        df_geohashes = df_geohashes.append({col_identifier:i,'Unique'+col_geohash:tmp_unique_geohashes},ignore_index=True)
    return(df_geohashes)

def compare_geohashes(str_src_file,df_src,col_identifier='Src',col_unique_geohash='UniqueGeohash'):
    df_geohashes_src = df_src.loc[df_src[col_identifier]==str_src_file].copy()
    df_geohashes_others = df_src.loc[~(df_src[col_identifier]==str_src_file)].copy()
    set_src_geohashes = []
    for i,row in df_geohashes_src.iterrows():
        tmp_geohashes = row[col_unique_geohash].tolist()
        set_src_geohashes.extend(tmp_geohashes)
    set_other_geohashes = []
    for i,row in df_geohashes_others.iterrows():
        tmp_geohashes = row[col_unique_geohash].tolist()
        set_other_geohashes.extend(tmp_geohashes)
    print('Source Geohashes = '+str(len(set_src_geohashes)))
    print('Sink Geohashes = '+str(len(set_other_geohashes)))
    rtn_val = set(set_src_geohashes)-set(set_other_geohashes)
    return(rtn_val)

In [3]:
%%time
val_files = glob.glob('./Data/*.gpx')
df_gpx_rtn_val = []
for i in val_files:
    df_gpx_rtn_val.append(get_df_of_gpx_file(i))
df_gpx = pd.concat(df_gpx_rtn_val)
df_runner = get_hashes(df_gpx)
df_geohashes = get_df_unique_hashes(df_runner)
print(df_runner.shape)

(2584, 8)
CPU times: user 4.15 s, sys: 36.2 ms, total: 4.18 s
Wall time: 4.23 s


In [4]:
%%time
unique_hashes = compare_geohashes(str_src_file='./2019-03-16_Run.gpx',df_src=df_geohashes)

Source Geohashes = 0
Sink Geohashes = 461
CPU times: user 2.35 ms, sys: 156 µs, total: 2.5 ms
Wall time: 2.45 ms


In [5]:
len(unique_hashes)

0

In [6]:
#df_runner[df_runner['Geohash'].isin(list(unique_hashes))].copy().to_excel('./Data/Output.xlsx',index=False)