This is our minhashing algorithm base. We test it on a small set of files and then perform a volumn test. Our main use, in the bulk minhash code file, is when we analyze all the data of a sample person

In [4]:
import gpxpy
import pandas as pd
import numpy as np

def get_loc_set(file_path):
    with open(file_path, 'r') as f:
        gpx = gpxpy.parse(f)

    lats = []
    longs = []
    
    for track in gpx.tracks:
        for segment in track.segments:
            # print(f"Track Name: {track.name}")
            print(f"Track Type: {track.type}") #progress update

            for point in segment.points:
                lats.append(point.latitude)
                longs.append(point.longitude)
    
    lats, longs = np.around(lats, decimals=4), np.around(longs, decimals=4)
    return set(zip(lats, longs))

#sample files to test on
filenames = ['Data/04062025_run.gpx', 'Data/04052025_run.gpx', 'Data/04072025_run.gpx', 'Data/04112025_run.gpx', 'Data/04132025_run.gpx']
coords_list = []

#run the analysis, inlcude another progress update
for i, file in enumerate(filenames):
    print(f"Analyzing file {i+1}")
    curr_coords = get_loc_set(file)
    coords_list.append(curr_coords)

#make the big set
all_coords = set()
for s in coords_list:
    all_coords = all_coords | s

#df of data D
s0 = pd.Series(list(all_coords))
result_dict = {'checker': s0}

for i, shingle_set in enumerate(coords_list):
    print(f"Processing file {i+1}: {filenames[i]}")
    column_name = filenames[i]
    
    result_dict[column_name] = s0.isin(list(shingle_set))

df = pd.DataFrame(result_dict)
df.head()

Analyzing file 1
Track Type: running
Analyzing file 2
Track Type: running
Analyzing file 3
Track Type: running
Analyzing file 4
Track Type: running
Analyzing file 5
Track Type: running
Processing file 1: Data/04062025_run.gpx
Processing file 2: Data/04052025_run.gpx
Processing file 3: Data/04072025_run.gpx
Processing file 4: Data/04112025_run.gpx
Processing file 5: Data/04132025_run.gpx


Unnamed: 0,checker,Data/04062025_run.gpx,Data/04052025_run.gpx,Data/04072025_run.gpx,Data/04112025_run.gpx,Data/04132025_run.gpx
0,"(39.6895, -105.1028)",True,True,False,False,False
1,"(39.6822, -105.1119)",True,True,False,False,False
2,"(39.6893, -105.1006)",True,True,False,False,False
3,"(39.6827, -105.1118)",True,True,False,False,False
4,"(39.6896, -105.0997)",True,True,False,False,False


In [2]:
import numpy as np
import pandas as pd
import hashlib

#hashing
def hash_function(seed, element):
    return int(hashlib.md5((str(seed) + str(element)).encode('utf8')).hexdigest(), 16) % 1000000

#making M
def minhash(coords_list, num_hashes=100):
    num_sets = len(coords_list)
    
    minhash_sig = np.full((num_hashes, num_sets), np.inf) #start at -infty

    for i, coord_set in enumerate(coords_list):
        for seed in range(num_hashes):
            for element in coord_set:
                hash_value = hash_function(seed, element)
                minhash_sig[seed, i] = min(minhash_sig[seed, i], hash_value) #update each entry
    
    return minhash_sig

#similarity from signature matrix
def jac_sim(minhash_sig, num_sets):
    similarities = np.zeros((num_sets, num_sets))
    
    for i in range(num_sets): #plain old jaccard similarity
        for j in range(i, num_sets):
            matching_hashes = np.sum(minhash_sig[:, i] == minhash_sig[:, j])
            sim = matching_hashes / minhash_sig.shape[0]
            similarities[i, j] = sim
            similarities[j, i] = sim #setting both just for visibility and lining up
    
    return similarities

num_sets = len(coords_list)

minhash_sig = minhash(coords_list, 100)
similarities = jac_sim(minhash_sig, num_sets)
print(similarities) #should see results


[[1.   0.71 0.   0.   0.  ]
 [0.71 1.   0.   0.   0.  ]
 [0.   0.   1.   0.67 0.62]
 [0.   0.   0.67 1.   0.61]
 [0.   0.   0.62 0.61 1.  ]]


This is the volume test, which we did to make sure this was usable with the amount of data

In [5]:
activities = pd.read_csv('Data/activities.csv', delimiter = ',')
files = activities['Filename'].tolist()[:-1]
filenames = [] 

#a bit annoying to get out the files, just use these
for f in files:
    if not isinstance(f, float): #some values aren't numeric, some have nan issues
        if f[-3:] == 'gpx':
            filenames.append(f)
print(filenames)
for i, file in enumerate(filenames[1::]):
    s = 'Data/' + file
    # print(i)
    curr_coords = get_loc_set(s)
    coords_list.append(curr_coords)

#same as before
all_coords = set()
for s in coords_list:
    all_coords = all_coords | s

s0 = pd.Series(list(all_coords))
result_dict = {'checker': s0}

for i, shingle_set in enumerate(coords_list):
    if (i < 87): #general cutoff to make sure it doesn't take too long, can modify
        print(f"Processing file {i+1}: {filenames[i]}")
        column_name = filenames[i]
        
        result_dict[column_name] = s0.isin(list(shingle_set))

df = pd.DataFrame(result_dict)
df.head()

['activities/7664030898.gpx', 'activities/9201531314.gpx', 'activities/9208399131.gpx', 'activities/10195845861.gpx', 'activities/10213672242.gpx', 'activities/10236119203.gpx', 'activities/10297798873.gpx', 'activities/10306102382.gpx', 'activities/10328738170.gpx', 'activities/10339669514.gpx', 'activities/10359765824.gpx', 'activities/10368538574.gpx', 'activities/10380623110.gpx', 'activities/10426958756.gpx', 'activities/10470627375.gpx', 'activities/10608835232.gpx', 'activities/10676173524.gpx', 'activities/10722751798.gpx', 'activities/10755668518.gpx', 'activities/10790430718.gpx', 'activities/10796100865.gpx', 'activities/10821874440.gpx', 'activities/10842702052.gpx', 'activities/10862944105.gpx', 'activities/10938133614.gpx', 'activities/10994695872.gpx', 'activities/11017203672.gpx', 'activities/11031595610.gpx', 'activities/11047818855.gpx', 'activities/11062019984.gpx', 'activities/11070852384.gpx', 'activities/11092450380.gpx', 'activities/11143843698.gpx', 'activities/

Unnamed: 0,checker,activities/7664030898.gpx,activities/9201531314.gpx,activities/9208399131.gpx,activities/10195845861.gpx,activities/10213672242.gpx,activities/10236119203.gpx,activities/10297798873.gpx,activities/10306102382.gpx,activities/10328738170.gpx,...,activities/12948155220.gpx,activities/13027178753.gpx,activities/13041585169.gpx,activities/13073839675.gpx,activities/13131191690.gpx,activities/13164669518.gpx,activities/13249523361.gpx,activities/13293241181.gpx,activities/13344307734.gpx,activities/13502255084.gpx
0,"(39.5821, -105.0321)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"(36.3302, -94.1021)",False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,"(39.983, -105.2426)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,"(39.7477, -105.2191)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,"(39.6847, -105.0402)",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
