In [5]:
import gpxpy
import pandas as pd
import numpy as np

def get_loc_set(file_path, decimals=4):
    """Read a GPX file and return its track points as a set of rounded coordinates.

    Every point of every segment of every track is collected. Latitudes and
    longitudes are rounded with ``np.around`` so that fixes which differ only
    beyond ``decimals`` places collapse into a single set element.

    Parameters
    ----------
    file_path : str or path-like
        Location of the GPX file to read.
    decimals : int, optional
        Number of decimal places kept when rounding (default 4, the value
        that was previously hard-coded).

    Returns
    -------
    set of tuple
        Unique ``(latitude, longitude)`` pairs; the members are NumPy floats
        because they come out of ``np.around``.

    Notes
    -----
    Prints ``Track Type: ...`` once for every segment that is visited.
    """
    with open(file_path, 'r') as f:
        gpx = gpxpy.parse(f)

    lats = []
    longs = []

    for track in gpx.tracks:
        for segment in track.segments:
            print(f"Track Type: {track.type}")

            for point in segment.points:
                lats.append(point.latitude)
                longs.append(point.longitude)

    # Round both axes with the same precision before pairing them up, so the
    # set deduplicates on the rounded values rather than the raw GPS fixes.
    lats = np.around(lats, decimals=decimals)
    longs = np.around(longs, decimals=decimals)
    return set(zip(lats, longs))

filenames = [
    'Data/04062025_run.gpx',
    'Data/04052025_run.gpx',
    'Data/04072025_run.gpx',
    'Data/04112025_run.gpx',
    'Data/04132025_run.gpx',
]

# One rounded coordinate set per run, stored in the same order as `filenames`.
coords_list = []
for file_no, gpx_path in enumerate(filenames, start=1):
    print(f"Analyzing file {file_no}")
    coords_list.append(get_loc_set(gpx_path))

# Accumulate the universe of coordinates seen in any run.
all_coords = set()
for run_coords in coords_list:
    all_coords = all_coords.union(run_coords)

# 'checker' holds every distinct coordinate; each following column flags
# whether that coordinate occurs in the corresponding file.
s0 = pd.Series(list(all_coords))
result_dict = {'checker': s0}

for file_no, (column_name, shingle_set) in enumerate(zip(filenames, coords_list), start=1):
    print(f"Processing file {file_no}: {column_name}")
    result_dict[column_name] = s0.isin(list(shingle_set))

df = pd.DataFrame(result_dict)
df.head()

Analyzing file 1
Track Type: running
Analyzing file 2
Track Type: running
Analyzing file 3
Track Type: running
Analyzing file 4
Track Type: running
Analyzing file 5
Track Type: running
Processing file 1: Data/04062025_run.gpx
Processing file 2: Data/04052025_run.gpx
Processing file 3: Data/04072025_run.gpx
Processing file 4: Data/04112025_run.gpx
Processing file 5: Data/04132025_run.gpx


Unnamed: 0,checker,Data/04062025_run.gpx,Data/04052025_run.gpx,Data/04072025_run.gpx,Data/04112025_run.gpx,Data/04132025_run.gpx
0,"(39.6895, -105.1028)",True,True,False,False,False
1,"(39.6822, -105.1119)",True,True,False,False,False
2,"(39.6893, -105.1006)",True,True,False,False,False
3,"(39.6827, -105.1118)",True,True,False,False,False
4,"(39.6896, -105.0997)",True,True,False,False,False


In [9]:
import numpy as np
import pandas as pd
import hashlib

def hash_function(seed, element):
    """Map a ``(seed, element)`` pair to a deterministic integer in ``[0, 1000000)``.

    The string forms of ``seed`` and ``element`` are concatenated (no
    separator), hashed with MD5, and the digest is reduced modulo 1000000.
    """
    payload = (str(seed) + str(element)).encode('utf8')
    digest = hashlib.md5(payload).digest()
    # Reading the raw digest as a big-endian integer gives the same number as
    # parsing its hexadecimal representation.
    return int.from_bytes(digest, 'big') % 1000000

# Minhashing function
def minhash(coords_list, num_hashes=100):
    """Build a MinHash signature matrix for a list of sets.

    For each set and each seed in ``range(num_hashes)``, the signature entry
    is the smallest ``hash_function(seed, element)`` over the set's elements.

    Parameters
    ----------
    coords_list : sequence of iterables
        The sets to summarise; column ``i`` of the result belongs to
        ``coords_list[i]``.
    num_hashes : int, optional
        Number of seeded hash functions, i.e. rows in the signature matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_hashes, len(coords_list))``. Columns for
        empty sets keep their initial value ``np.inf``.
    """
    num_sets = len(coords_list)

    # Start at +inf so that an empty set leaves its column untouched.
    minhash_sig = np.full((num_hashes, num_sets), np.inf)

    for i, coord_set in enumerate(coords_list):
        if not coord_set:
            continue
        for seed in range(num_hashes):
            # The previous version compared against an undefined name
            # (`minhash_signatures`) instead of this matrix, which raises
            # NameError on a fresh kernel. Take the minimum directly.
            minhash_sig[seed, i] = min(
                hash_function(seed, element) for element in coord_set
            )

    return minhash_sig

def jac_sim(minhash_sig, num_sets):
    """Estimate pairwise similarity from a MinHash signature matrix.

    The similarity of two sets is the fraction of signature rows on which
    their columns hold the same value. Only the first ``num_sets`` columns of
    ``minhash_sig`` are compared.

    Returns a symmetric ``(num_sets, num_sets)`` float array.
    """
    total_hashes = minhash_sig.shape[0]
    similarities = np.zeros((num_sets, num_sets))

    for a in range(num_sets):
        col_a = minhash_sig[:, a]
        # Visit the upper triangle (diagonal included) and mirror each value.
        for b in range(a, num_sets):
            agreeing_rows = np.sum(col_a == minhash_sig[:, b])
            similarities[a, b] = similarities[b, a] = agreeing_rows / total_hashes

    return similarities

# Summarise every run with 100 seeded hashes, then compare all pairs of runs.
minhash_sig = minhash(coords_list, num_hashes=100)
num_sets = len(coords_list)

similarities = jac_sim(minhash_sig, num_sets=num_sets)
print(similarities)


[[1.   0.71 0.   0.   0.  ]
 [0.71 1.   0.   0.   0.  ]
 [0.   0.   1.   0.67 0.62]
 [0.   0.   0.67 1.   0.61]
 [0.   0.   0.62 0.61 1.  ]]
