In [2]:
!pip install numba

Defaulting to user installation because normal site-packages is not writeable
Collecting numba
  Downloading numba-0.56.4-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.5 MB)
     |████████████████████████████████| 3.5 MB 1.1 MB/s            
[?25hCollecting llvmlite<0.40,>=0.39.0dev0
  Downloading llvmlite-0.39.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.6 MB)
     |████████████████████████████████| 34.6 MB 14.7 MB/s            
Installing collected packages: llvmlite, numba
Successfully installed llvmlite-0.39.1 numba-0.56.4


In [1]:
import time
import pickle
import numpy as np
import pandas as pd
import scipy.stats as st
from numba import njit, prange
from math import sin, cos, sqrt, atan2, radians

In [2]:
path = '../data/'

In [3]:
!ls $path

D_p-pts.pickle	exp1-trj.label	porto.csv     train.src  val.trg
D_q-pts.pickle	exp1-trj.pts	README.md     train.trg  vocab.txt
Dq-pts.pickle	exp1-trj.t	saved_models  val.src


In [4]:
# Reload Dq from its pickle file; Dq[i] is used further down as the i-th query trajectory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
with open(path+'Dq-pts.pickle', 'rb') as file:
    Dq = pickle.load(file)

# Reload D_q from its pickle file; it is placed first in the search database (D_q + D_p).
with open(path+'D_q-pts.pickle', 'rb') as file:
    D_q = pickle.load(file)

# Reload D_p from its pickle file; it is appended after D_q in the search database.
with open(path+'D_p-pts.pickle', 'rb') as file:
    D_p = pickle.load(file)

In [5]:
@njit
def dist_haversine(p1, p2):
  """
  Great-circle (haversine) distance between two points, in meters.

  p1 --> numpy array [lon, lat], in degrees
  p2 --> numpy array [lon, lat], in degrees

  Returns the distance as a float, computed on a sphere of radius 6371 km
  and converted to meters.
  """
  # approximate radius of earth in km
  R = 6371.0

  lon1 = radians(p1[0]) # p1[x, ]
  lat1 = radians(p1[1]) # p1[ ,y]
  lon2 = radians(p2[0]) # p2[x, ]
  lat2 = radians(p2[1]) # p2[ ,y]

  dlon = lon2 - lon1
  dlat = lat2 - lat1

  a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
  # Rounding can push `a` marginally outside [0, 1] (e.g. for nearly antipodal
  # points), which would make sqrt(1 - a) invalid; clamp it to the valid range.
  a = min(1.0, max(0.0, a))
  c = 2 * atan2(sqrt(a), sqrt(1 - a))

  distance = R * c

  # km -> m
  return distance * 1000

In [6]:
@njit
def dtw_numba(t1, t2, dist):
    """
    Dynamic Time Warping cost between two trajectories.

    t1, t2 --> arrays whose first axis indexes the points of each trajectory
    dist   --> function called as dist(t1[i], t2[j]) giving the point-to-point cost

    Returns the accumulated cost of the cheapest warping path. The result is
    0.0 when both trajectories are empty and inf when exactly one is empty.
    """
    len_t1, len_t2 = t1.shape[0], t2.shape[0]

    # dists[i, j] = best cost aligning the first i points of t1 with the
    # first j points of t2; row/column 0 is the "nothing consumed" border.
    dists = np.full((len_t1+1, len_t2+1), np.inf)
    dists[0, 0] = 0.0

    # Plain `range` on purpose: every cell depends on its upper, left and
    # upper-left neighbours, so the iterations are NOT independent and must
    # never be run as a parallel `prange` loop.
    for i in range(1, len_t1+1):
        for j in range(1, len_t2+1):
            dt = dist(t1[i-1], t2[j-1])
            dists[i, j] = dt + min(dists[i-1, j], dists[i, j-1], dists[i-1, j-1])

    return dists[len_t1, len_t2]

In [7]:
Dq[0]

array([[-8.585676, 41.148621],
       [-8.585721, 41.148864],
       [-8.587161, 41.149044],
       [-8.590383, 41.150052],
       [-8.593281, 41.150772],
       [-8.593254, 41.150808],
       [-8.59401 , 41.150493],
       [-8.593335, 41.148324],
       [-8.593056, 41.147415],
       [-8.592993, 41.147397],
       [-8.592939, 41.147352],
       [-8.59293 , 41.147334]])

In [8]:
D_q[0]

array([[-8.585694, 41.148639],
       [-8.586135, 41.14899 ],
       [-8.588574, 41.149557],
       [-8.592345, 41.150547],
       [-8.593272, 41.150808],
       [-8.59356 , 41.150871],
       [-8.593767, 41.149683],
       [-8.593083, 41.14746 ],
       [-8.59302 , 41.147406],
       [-8.592966, 41.147379],
       [-8.59293 , 41.147343],
       [-8.59293 , 41.147325]])

In [9]:
D_p[0]

array([[-8.620002, 41.147901],
       [-8.620308, 41.147343],
       [-8.620146, 41.146308],
       [-8.620155, 41.14629 ],
       [-8.618013, 41.145975],
       [-8.616114, 41.145246],
       [-8.613639, 41.145894],
       [-8.611974, 41.146002],
       [-8.61093 , 41.145723],
       [-8.61102 , 41.145021],
       [-8.612577, 41.143869],
       [-8.613864, 41.143023],
       [-8.615214, 41.142087],
       [-8.615097, 41.140836],
       [-8.613495, 41.141286]])

In [10]:
dtw_numba(Dq[0], D_q[0], dist=dist_haversine)

574.2711876701195

In [11]:
dtw_numba(Dq[0], Dq[0], dist=dist_haversine)

0.0

In [13]:
def rank(t_query, i, db_search):
    """
    1-based rank of db_search[i] among all trajectories of db_search when they
    are ordered by increasing DTW distance to t_query.

    t_query   --> query trajectory
    i         --> index, inside db_search, of the trajectory whose rank is wanted
    db_search --> iterable of candidate trajectories

    Ties are broken by position in db_search (stable sort), so the result is
    deterministic. Raises IndexError when i is outside db_search.
    """
    # DTW distance from the query to every trajectory of the search database
    dists = np.array([dtw_numba(t_query, t, dist=dist_haversine) for t in db_search])

    # order[k] = index of the k-th closest trajectory
    order = np.argsort(dists, kind='stable')

    # Invert the permutation in O(n) instead of sorting a second time:
    # ranks[idx] = 0-based position of trajectory idx in the sorted order
    ranks = np.empty_like(order)
    ranks[order] = np.arange(order.size)

    # +1 because the positions above start at 0
    return ranks[i] + 1

In [15]:
def acc(ranks):
    """Share of entries in `ranks` that are exactly 1, rounded to 3 decimals."""
    total = len(ranks)
    # Count the positions whose rank is 1 (i.e. the target came out on top).
    hits = sum(1 for k in range(total) if ranks[k] == 1)
    return round(hits / total, 3)

In [16]:
def mr(ranks):
    """Mean rank: arithmetic mean of the values in `ranks` (not rounded)."""
    rank_total = sum(ranks)
    n_queries = len(ranks)
    return rank_total / n_queries

In [17]:
def mrr(ranks):
    """Mean reciprocal rank: average of 1/rank over `ranks`, rounded to 3 decimals."""
    n_queries = len(ranks)
    reciprocal_total = 0
    # Accumulate left to right so the floating-point result is reproducible.
    for k in range(n_queries):
        reciprocal_total = reciprocal_total + 1 / ranks[k]

    mean_reciprocal = reciprocal_total / n_queries
    return round(mean_reciprocal, 3)

In [18]:
# Confidence interval of the ranks
def cip_r(ranks):
    """
    95% Student-t confidence interval for the mean of `ranks`.

    ranks --> sequence of numeric ranks

    Returns a tuple (lower, upper) of plain Python floats rounded to
    3 decimals. With fewer than two values the interval is undefined and
    both bounds are nan.
    """
    data = list(ranks)

    # The confidence level is passed positionally: its keyword was renamed
    # from `alpha` to `confidence` in SciPy (the old name was later removed),
    # and the positional form works with every version.
    ic = st.t.interval(0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))

    # float(...) keeps the tuple printing as "(16.192, 20.377)" regardless of
    # how the installed NumPy renders its scalar types; round to 3 decimals.
    return tuple(round(float(valor), 3) for valor in ic)

In [19]:
# Confidence interval of the reciprocal ranks
def cip_rr(ranks):
    """
    95% Student-t confidence interval for the mean of the reciprocal ranks
    (1/rank for every value in `ranks`).

    ranks --> sequence of non-zero numeric ranks

    Returns a tuple (lower, upper) of plain Python floats rounded to
    3 decimals. With fewer than two values the interval is undefined and
    both bounds are nan.
    """
    data = [1/r for r in ranks]

    # The confidence level is passed positionally: its keyword was renamed
    # from `alpha` to `confidence` in SciPy (the old name was later removed),
    # and the positional form works with every version.
    ic = st.t.interval(0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))

    # float(...) keeps the tuple printing as "(0.829, 0.864)" regardless of
    # how the installed NumPy renders its scalar types; round to 3 decimals.
    return tuple(round(float(valor), 3) for valor in ic)

In [24]:
%%time
# Evaluate the ranking metrics on search databases of increasing size.
dbsizes = [20000, 40000, 60000, 80000, 100000]

# Search database: D_q followed by D_p.
# NOTE(review): assumes D_q and D_p are Python lists so `+` concatenates — confirm.
entire_db = D_q + D_p

for dbsize in dbsizes:
    # Only the first `dbsize` trajectories take part in this round.
    dbsearch = entire_db[:dbsize]

    # Rank, inside dbsearch, of the trajectory stored at the same index as each query.
    ranks = [rank(Dq[i], i, dbsearch) for i in range(len(Dq))]

    print(
        'Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
            mr(ranks),
            cip_r(ranks),
            acc(ranks),
            mrr(ranks),
            cip_rr(ranks),
            dbsize,
        )
    )

Mean rank: 18.28(16.192, 20.377), Acc: 0.758, MRR: 0.847(0.829, 0.864) with dbsearch size: 20000
Mean rank: 26.59(22.834, 30.357), Acc: 0.656, MRR: 0.769(0.748, 0.79) with dbsearch size: 40000
Mean rank: 35.27(29.668, 40.881), Acc: 0.612, MRR: 0.725(0.703, 0.748) with dbsearch size: 60000
Mean rank: 45.53(37.459, 53.611), Acc: 0.563, MRR: 0.684(0.661, 0.707) with dbsearch size: 80000
Mean rank: 54.27(44.198, 64.356), Acc: 0.534, MRR: 0.659(0.635, 0.683) with dbsearch size: 100000
CPU times: user 3h 11min 47s, sys: 132 ms, total: 3h 11min 48s
Wall time: 3h 11min 48s
