In [2]:
!pip install numba

Defaulting to user installation because normal site-packages is not writeable
Collecting numba
  Downloading numba-0.56.4-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.5 MB)
     |████████████████████████████████| 3.5 MB 1.1 MB/s            
[?25hCollecting llvmlite<0.40,>=0.39.0dev0
  Downloading llvmlite-0.39.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.6 MB)
     |████████████████████████████████| 34.6 MB 14.7 MB/s            
Installing collected packages: llvmlite, numba
Successfully installed llvmlite-0.39.1 numba-0.56.4


In [2]:
import math
import time
import pickle
import numpy as np
import pandas as pd
import scipy.stats as st
from numba import njit, prange
from math import sin, cos, sqrt, atan2, radians

In [3]:
# Directory holding the notebook's data files. The trailing slash matters:
# file names are appended to this string with plain `+` when files are opened.
path = '../data/'

In [4]:
!ls $path

df_tdrive_ALL-taxis.csv  exp1-trj.h5	 tdrive.csv		   val.mta
D_p-pts.pickle		 exp1-trj.label  tdrive_formato_t2vec.csv  val.src
D_q-pts.pickle		 exp1-trj.t	 train.mta		   val.trg
Dq-pts.pickle		 mv.csv		 train.src		   vocab.txt
exp1-querydb.h5		 saved_models	 train.trg


In [5]:
def _load_pickle(filename):
    """Return the object stored in the pickle file `filename` inside `path`."""
    # NOTE(review): pickle.load can execute arbitrary code while deserialising;
    # only use it on files produced by a trusted pipeline.
    with open(path + filename, 'rb') as file:
        return pickle.load(file)


# Reload the three trajectory collections used by the experiment below:
# Dq supplies the queries, D_q and D_p are concatenated into the search database.
Dq = _load_pickle('Dq-pts.pickle')
D_q = _load_pickle('D_q-pts.pickle')
D_p = _load_pickle('D_p-pts.pickle')

In [6]:
@njit
def haversine(p1, p2):
    """Great-circle distance in metres between two points (haversine formula).

    Parameters
    ----------
    p1, p2 : indexable with at least two numeric entries
        Coordinates in degrees. Index 0 is read as the latitude and index 1
        as the longitude.

    Returns
    -------
    float
        Distance along a sphere whose radius is 6 371 000 m.

    NOTE(review): the sample trajectories displayed in this notebook look like
    [longitude, latitude] rows (roughly 116.8, 40.4). Confirm the column order
    of the data against the order this function expects.
    """
    # Mean Earth radius in metres.
    earth_radius = 6371000.0

    # Degrees -> radians.
    lat1 = math.radians(p1[0])
    lon1 = math.radians(p1[1])
    lat2 = math.radians(p2[0])
    lon2 = math.radians(p2[1])

    # Latitude and longitude differences.
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine term.
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2

    # Rounding (or latitudes outside [-90, 90]) can push `a` marginally outside
    # [0, 1], which would put a negative number under one of the square roots.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in metres.
    return earth_radius * c

In [7]:
@njit
def dtw_numba(t1, t2, dist):
    """Dynamic-time-warping cost between two trajectories.

    Parameters
    ----------
    t1, t2 : arrays
        Trajectories; the first axis indexes the points (only `.shape[0]` and
        row indexing are used here).
    dist : callable
        Called as `dist(point_a, point_b)` for every pair of points; its result
        is added to the accumulated cost. Presumably it has to be a function
        Numba can call from compiled code -- confirm for new distance functions.

    Returns
    -------
    float
        Accumulated cost of the cheapest alignment. 0.0 when both trajectories
        are empty, inf when exactly one of them is empty.
    """
    n_rows, n_cols = t1.shape[0], t2.shape[0]

    # Row 0 / column 0 act as an infinite-cost border; only the origin is free.
    cost = np.full((n_rows + 1, n_cols + 1), np.inf)
    cost[0, 0] = 0.0

    # Plain `range`, not `prange`: every cell reads its upper, left and
    # upper-left neighbours, so the iterations are not independent and must
    # never be run as a parallel loop.
    for i in range(1, n_rows + 1):
        for j in range(1, n_cols + 1):
            step = dist(t1[i - 1], t2[j - 1])
            cost[i, j] = step + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])

    return cost[n_rows, n_cols]

In [8]:
# Peek at the first trajectory of Dq: a 2-column array with one row per point.
Dq[0]

array([[116.82171,  40.37281],
       [116.82175,  40.37281],
       [116.82152,  40.37284],
       [116.82182,  40.37286],
       [116.82177,  40.37284],
       [116.82184,  40.37366]])

In [9]:
# Peek at the first trajectory of D_q for comparison with Dq[0].
D_q[0]

array([[116.82174,  40.3728 ],
       [116.82171,  40.37251],
       [116.82152,  40.3722 ],
       [116.82178,  40.37288],
       [116.82183,  40.37386]])

In [10]:
# Peek at the first trajectory of D_p.
D_p[0]

array([[116.41468,  39.87831],
       [116.41383,  39.88515],
       [116.42151,  39.89236],
       [116.43186,  39.89213],
       [116.43391,  39.88984]])

In [11]:
# DTW cost between the first trajectory of Dq and the first of D_q, using
# haversine as the point-to-point distance.
dtw_numba(Dq[0], D_q[0], dist=haversine)

68.12910457154875

In [12]:
# Sanity check: a trajectory compared with itself should have zero cost.
dtw_numba(Dq[0], Dq[0], dist=haversine)

0.0

In [20]:
def rank(t_query, i, db_search):
    """1-based rank of `db_search[i]` when `db_search` is ordered by distance to `t_query`.

    Parameters
    ----------
    t_query : array
        Query trajectory.
    i : int
        Position in `db_search` of the trajectory whose rank is wanted.
    db_search : sequence of arrays
        Trajectories to search; each one is compared with `t_query` using
        `dtw_numba` with `haversine` as the point distance.

    Returns
    -------
    integer
        1 for the closest trajectory, 2 for the next one, and so on.
        Trajectories at exactly the same distance are ordered by their
        position in `db_search`.
    """
    # DTW distance from the query to every trajectory in the search database.
    dists = np.array([dtw_numba(t_query, t, dist=haversine) for t in db_search])

    # A stable sort makes the order among equal distances deterministic; the
    # default sort kind gives no such guarantee, so tied ranks could change
    # from run to run or between NumPy versions.
    order = dists.argsort(kind='stable')

    # Invert the permutation in linear time: positions[k] is the 0-based place
    # of trajectory k in the sorted order.
    positions = np.empty_like(order)
    positions[order] = np.arange(order.size)

    # +1 because positions start at zero.
    return positions[i] + 1

In [21]:
def acc(ranks):
    """Fraction of entries in `ranks` that are exactly 1, rounded to 3 decimals."""
    total = len(ranks)
    hits = sum(1 for k in range(total) if ranks[k] == 1)
    return round(hits / total, 3)

In [22]:
def mr(ranks):
    """Mean rank: the arithmetic mean of `ranks`, not rounded."""
    total = sum(ranks)
    return total / len(ranks)

In [23]:
def mrr(ranks):
    """Mean reciprocal rank: the average of 1/rank, rounded to 3 decimals."""
    n = len(ranks)
    reciprocal_total = 0
    # Accumulate left to right so the floating-point sum is built in index order.
    k = 0
    while k < n:
        reciprocal_total += 1 / ranks[k]
        k += 1

    return round(reciprocal_total / n, 3)

In [24]:
# Confidence interval of the ranks
def cip_r(ranks):
    """95% Student-t confidence interval for the mean of `ranks`.

    Parameters
    ----------
    ranks : sequence of numbers
        One rank per query.

    Returns
    -------
    tuple of two floats
        (lower, upper) bounds, each rounded to 3 decimals. The interval is
        centred on the sample mean and scaled by the standard error of the mean.
    """
    data = [ranks[k] for k in range(len(ranks))]

    # The confidence level is passed positionally: its keyword was renamed from
    # `alpha` to `confidence` in newer SciPy releases, so naming it would tie
    # this function to one side of that rename.
    ic = st.t.interval(0.95, df=len(data) - 1, loc=np.mean(data), scale=st.sem(data))

    # Round to 3 decimal places.
    return tuple(round(valor, 3) for valor in ic)

In [25]:
# Confidence interval of the reciprocal ranks
def cip_rr(ranks):
    """95% Student-t confidence interval for the mean of the reciprocal ranks.

    Parameters
    ----------
    ranks : sequence of non-zero numbers
        One rank per query; each is turned into 1/rank before the interval
        is computed.

    Returns
    -------
    tuple of two floats
        (lower, upper) bounds, each rounded to 3 decimals. The interval is
        centred on the sample mean and scaled by the standard error of the mean.
    """
    data = [1 / ranks[k] for k in range(len(ranks))]

    # The confidence level is passed positionally: its keyword was renamed from
    # `alpha` to `confidence` in newer SciPy releases, so naming it would tie
    # this function to one side of that rename.
    ic = st.t.interval(0.95, df=len(data) - 1, loc=np.mean(data), scale=st.sem(data))

    # Round to 3 decimal places.
    return tuple(round(valor, 3) for valor in ic)

In [26]:
%%time
dbsizes = [10000, 20000, 30000, 40000, 50000]
entire_db = D_q + D_p
for dbsize in dbsizes:
    ranks = []
    dbsearch = entire_db[:dbsize]
    for i in range(len(Dq)):
        ranks.append(rank(Dq[i], i, dbsearch)) # rank de ta' no dbsearch!
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 454.64(-98.42, 1007.7), Acc: 0.766, MRR: 0.822(0.793, 0.852) with dbsearch size: 10000
Mean rank: 892.04(-213.68, 1997.76), Acc: 0.732, MRR: 0.792(0.76, 0.823) with dbsearch size: 20000
Mean rank: 1327.98(-330.83, 2986.79), Acc: 0.712, MRR: 0.775(0.743, 0.808) with dbsearch size: 30000
Mean rank: 1774.94(-440.77, 3990.65), Acc: 0.702, MRR: 0.763(0.73, 0.796) with dbsearch size: 40000
Mean rank: 2214.63(-553.97, 4983.25), Acc: 0.696, MRR: 0.756(0.722, 0.789) with dbsearch size: 50000
CPU times: user 28min 14s, sys: 4.11 ms, total: 29min 32s
Wall time: 29min 32s
