In [1]:
import math
import pickle
import numpy as np
import pandas as pd
import scipy.stats as st
from numba import njit, prange
from math import sin, cos, sqrt, atan2, radians

In [2]:
path = '../data/'

In [3]:
!ls $path

df_tdrive_ALL-taxis.csv  exp1-trj.h5	 tdrive.csv		   val.mta
D_p-pts.pickle		 exp1-trj.label  tdrive_formato_t2vec.csv  val.src
D_q-pts.pickle		 exp1-trj.t	 train.mta		   val.trg
Dq-pts.pickle		 mv.csv		 train.src		   vocab.txt
exp1-querydb.h5		 saved_models	 train.trg


In [4]:
# Reloading Dq...
with open(path+'Dq-pts.pickle', 'rb') as file:
    Dq = pickle.load(file)

# Reloading D_q...
with open(path+'D_q-pts.pickle', 'rb') as file:
    D_q = pickle.load(file)

# Reloading D_p...
with open(path+'D_p-pts.pickle', 'rb') as file:
    D_p = pickle.load(file)

In [5]:
Dq[0]

array([[116.82171,  40.37281],
       [116.82175,  40.37281],
       [116.82152,  40.37284],
       [116.82182,  40.37286],
       [116.82177,  40.37284],
       [116.82184,  40.37366]])

In [6]:
D_q[0]

array([[116.82174,  40.3728 ],
       [116.82171,  40.37251],
       [116.82152,  40.3722 ],
       [116.82178,  40.37288],
       [116.82183,  40.37386]])

In [7]:
D_p[0]

array([[116.41468,  39.87831],
       [116.41383,  39.88515],
       [116.42151,  39.89236],
       [116.43186,  39.89213],
       [116.43391,  39.88984]])

In [8]:
@njit
def haversine(p1, p2):
    # Raio médio da Terra em metros
    earth_radius = 6371000.0

    # Conversão de graus para radianos
    lat1 = math.radians(p1[0])
    lon1 = math.radians(p1[1])
    lat2 = math.radians(p2[0])
    lon2 = math.radians(p2[1])

    # Diferença de latitudes e longitudes
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Fórmula de haversine
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distância em metros
    distance = earth_radius * c
    return distance

In [9]:
@njit
def dist_lcss(T1, T2): # Programação Dinâmica da LCSS para trajetórias
    m = len(T1) # length of the traj T1
    n = len(T2) # length of the traj T2
    thr = 225 # thr --> threshold in meters

    # Declarando a matrix L(m+1 x n+1), lista de listas!
    L = [[0]*(n + 1) for _ in range(m + 1)]

    for i in prange(m + 1):
        for j in prange(n + 1):
            if i == 0 or j == 0 :
                L[i][j] = 0
            elif haversine(T1[i-1], T2[j-1]) <= thr:
                L[i][j] = L[i-1][j-1] + 1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])

    lcss_length = L[m][n]
    
    return 1 - (lcss_length/max(m, n))

In [10]:
t1 =  np.array([[-37.97151, -4.93422],
                [-37.971341, -4.934051],
                [-37.971111, -4.933850],
                [-37.970728, -4.933544],
                [-37.970429, -4.933256],
                [-37.970207, -4.933063]], dtype=np.float32)

t2 =  np.array([[-37.97135, -4.93438],
                [-37.971143, -4.934198],
                [-37.970769, -4.933945],
                [-37.969606, -4.934348]], dtype=np.float32)

In [11]:
dist_lcss(t1, t2)

0.33333333333333337

In [12]:
dist_lcss(t1, t1)

0.0

In [13]:
def rank(t_query, i, db_search):
    dists = list(map(lambda t: dist_lcss(t_query, t), db_search)) # pega todas as distâncias de Dq[i] as trajs de D_qUD_p
    dists = np.array(dists)
    order = dists.argsort() # pega a ordem
    ranks = order.argsort() # pega o rank

    return ranks[i] + 1 # retorna o ranking de ta' no dbsearch | soma +1 pq o argsort rankea a partir de 0 (zero)

In [14]:
def acc(ranks):
    count = 0
    for i in range(len(ranks)):
        if ranks[i] == 1:
            count += 1

    return round(count/len(ranks), 3)

In [15]:
def mr(ranks):
    return (sum(ranks)/len(ranks))

In [16]:
def mrr(ranks):
    count = 0
    for i in range(len(ranks)):
        count += 1/ranks[i]

    return round(count/len(ranks), 3)

In [17]:
# Intervalo de Confiança do Ranks
def cip_r(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(ranks[i]) # Add os Ranks
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [18]:
# Intervalo de Confiança dos Reciprocal Ranks
def cip_rr(ranks):
    data = []
    for i in range(len(ranks)):
        data.append(1/ranks[i]) # Add os RRs...
    
    #create 95% confidence interval for population mean weight
    ic = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
    
    return tuple(round(valor, 3) for valor in ic) # arredonda pra 3 casas decimais

In [19]:
%%time
dbsizes = [10000, 20000, 30000, 40000, 50000]
entire_db = D_q + D_p
for dbsize in dbsizes:
    ranks = []
    dbsearch = entire_db[:dbsize]
    for i in range(len(Dq)):
        ranks.append(rank(Dq[i], i, dbsearch)) # rank de ta' no dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 660.584(487.381, 833.787), Acc: 0.636, MRR: 0.702(0.666, 0.738) with dbsearch size: 10000
Mean rank: 1355.62(999.541, 1711.699), Acc: 0.618, MRR: 0.683(0.646, 0.72) with dbsearch size: 20000
Mean rank: 2004.378(1478.108, 2530.648), Acc: 0.598, MRR: 0.669(0.631, 0.706) with dbsearch size: 30000
Mean rank: 2677.752(1974.896, 3380.608), Acc: 0.576, MRR: 0.647(0.609, 0.685) with dbsearch size: 40000
Mean rank: 3362.308(2480.163, 4244.453), Acc: 0.584, MRR: 0.645(0.607, 0.683) with dbsearch size: 50000
CPU times: user 11min 48s, sys: 3.97 ms, total: 11min 48s
Wall time: 11min 48s
