This is our data to go through and extract similarities between routes

In [4]:
import gpxpy
import pandas as pd
import numpy as np

#all of this is the same as it was in the minhash code
def get_loc_set(file_path):
    with open(file_path, 'r') as f:
        gpx = gpxpy.parse(f)

    lats = []
    longs = []
    
    for track in gpx.tracks:
        for segment in track.segments:
            # print(f"Track Name: {track.name}")
            # print(f"Track Type: {track.type}")

            for point in segment.points:
                lats.append(point.latitude)
                longs.append(point.longitude)
    
    lats, longs = np.around(lats, decimals=4), np.around(longs, decimals=4)
    return set(zip(lats, longs))

import hashlib

def hash_function(seed, element):
    return int(hashlib.md5((str(seed) + str(element)).encode('utf8')).hexdigest(), 16) % 1000000


def minhash(coords_list, num_hashes=100):
    num_sets = len(coords_list)
    
    minhash_sig = np.full((num_hashes, num_sets), np.inf)

    for i, coord_set in enumerate(coords_list):
        for seed in range(num_hashes):
            for element in coord_set:
                hash_value = hash_function(seed, element)
                minhash_sig[seed, i] = min(minhash_sig[seed, i], hash_value)
    
    return minhash_sig

def jac_sim(minhash_sig, num_sets):
    similarities = np.zeros((num_sets, num_sets))
    
    for i in range(num_sets):
        for j in range(i, num_sets):
            matching_hashes = np.sum(minhash_sig[:, i] == minhash_sig[:, j])
            sim = matching_hashes / minhash_sig.shape[0]
            similarities[i, j] = sim
            similarities[j, i] = sim
    
    return similarities

In [5]:
#similar to the bulk test, this goes through and gets all theh data 

coords_list = []

activities = pd.read_csv('Data/activities.csv', delimiter = ',')
files = activities['Filename'].tolist()[:-1]
filenames = [] 

for f in files:
    if not isinstance(f, float):
        if f[-3:] == 'gpx':
            filenames.append(f)
# print(filenames)

for i, file in enumerate(filenames[1::]):
    s = 'Data/' + file
    # print(i)
    curr_coords = get_loc_set(s)
    coords_list.append(curr_coords)

all_coords = set()

for s in coords_list:
    all_coords = all_coords | s

s0 = pd.Series(list(all_coords))
result_dict = {'checker': s0}

for i, shingle_set in enumerate(coords_list):
    # print(f"Processing file {i+1}: {filenames[i]}")
    column_name = filenames[i]
    
    result_dict[column_name] = s0.isin(list(shingle_set))

df = pd.DataFrame(result_dict)
df.head()

Unnamed: 0,checker,activities/7664030898.gpx,activities/9201531314.gpx,activities/9208399131.gpx,activities/10195845861.gpx,activities/10213672242.gpx,activities/10236119203.gpx,activities/10297798873.gpx,activities/10306102382.gpx,activities/10328738170.gpx,...,activities/12900677054.gpx,activities/12948155220.gpx,activities/13027178753.gpx,activities/13041585169.gpx,activities/13073839675.gpx,activities/13131191690.gpx,activities/13164669518.gpx,activities/13249523361.gpx,activities/13293241181.gpx,activities/13344307734.gpx
0,"(39.5821, -105.0321)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"(36.3302, -94.1021)",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,"(39.983, -105.2426)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,"(39.7477, -105.2191)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,"(39.6847, -105.0402)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
num_sets = len(coords_list)

#now we have a huge similarity matrix

minhash_sig = minhash(coords_list, 100)
similarities = jac_sim(minhash_sig, num_sets)
print(similarities)

# we print the sums to see which ones have a high number -> these have lots of similarities and would be good candidates for our next analysis
print(np.sum(similarities, axis=0))

[[1.   0.   0.   ... 0.   0.   0.  ]
 [0.   1.   0.   ... 0.   0.   0.  ]
 [0.   0.   1.   ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 1.   0.   0.  ]
 [0.   0.   0.   ... 0.   1.   0.06]
 [0.   0.   0.   ... 0.   0.06 1.  ]]
[ 1.02  1.03  7.89  7.44  1.75 10.17  3.24  7.07  1.4   1.46  3.6   9.25
  1.57  2.41  8.8   9.36  6.89  9.33  9.16  9.02  8.2   7.52  8.37  9.16
  1.74  1.95  2.23  1.02  2.43  2.08  6.66  9.39  9.85  8.91  1.12  1.08
  9.48  8.85  2.57  2.12  1.23  1.31  1.23  2.74  1.1   2.37  2.17  2.1
  1.05  2.    1.    1.72  5.87  1.45  1.28  1.49  1.07  2.81  2.    1.68
  1.67  3.12  2.71  2.22  1.14  2.66  1.85  3.45  1.57  2.05  1.    1.78
  2.73  1.35  3.14  1.59  3.42  3.35  3.48  1.49  2.77  2.09  2.35  1.
  3.21  2.01]
