In [1]:
import gensim
import numpy as np
import pandas as pd
import scipy.stats as st

from sklearn.metrics.pairwise import cosine_distances
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
!head -n 4 '../data/exp1-trj.t'

508 465 1641 857 3176 1346 1301 3303 3277 3977 4430 8513 9755 11383 9496 12228 11150 13279 9215 17279 14428 9279 14792 14310 18351 7997 15024 15267 15665 16329 15125 14591 14797 3
19 191 68 41 46 4 964 543 154 171 382 732 632 923 4815 460 273 439 607 908 726 6277 5338 5821 6933 5438 8005 10438 9812 17030 12351 12132 13988 9320 5581 13142 15720 14944 5048 11012
51 2263 345 53 120 405 803 585 692 1566 533 1728 1880 739 544 226 8
15 4 58 9 38 54 196 648 193 1022 575 1560 753 229 260 532 317 764 159 633 1304 207 798 2811 301 211 880 80 23 89 40 537 781 141 5900 240 38 54 655 1616 196 648 437 749 575


In [3]:
# Read the test file: every line is one trajectory, split on whitespace into
# its cell tokens (the tokens are kept as strings).
with open('../data/exp1-trj.t', 'r') as file:
    trajs_teste = [line.strip().split() for line in file]

In [4]:
# Sanity check: number of trajectories read from the test file.
print("Quantidade de trajetórias de teste:", len(trajs_teste))

Quantidade de trajetórias de teste: 101000


In [5]:
print(trajs_teste[2]) # even ("par") query

['51', '2263', '345', '53', '120', '405', '803', '585', '692', '1566', '533', '1728', '1880', '739', '544', '226', '8']


In [6]:
print(trajs_teste[1002]) # "target" of the even query, i.e. the odd ("ímpar") query

['51', '430', '345', '120', '856', '131', '673', '585', '233', '2200', '533', '361', '1299', '1215', '66', '588', '7', '8']


In [7]:
!ls '../data/saved_models/D2V/'

d2v_w5_s256_ep30_dm0.model
d2v_w5_s256_ep30_dm0.model.dv.vectors.npy
d2v_w5_s64_ep30_dm0.model
d2v_w5_s64_ep30_dm0.model.dv.vectors.npy


In [8]:
# Load the saved Doc2Vec model from disk.
model = Doc2Vec.load('../data/saved_models/D2V/d2v_w5_s256_ep30_dm0.model')

In [9]:
# Vocabulary size: number of keys in the model's word-vector index.
len(model.wv.key_to_index)

18827

In [10]:
# Show the model's vector_size (used below as the embedding width).
model.vector_size

256

In [11]:
# Fetch the document vectors most similar to the trajectory with id = 0
# and display them as the cell output.
mosts = model.dv.most_similar(0)
mosts

[(1, 0.9288357496261597),
 (2, 0.9009144902229309),
 (8, 0.8770485520362854),
 (4, 0.8750783801078796),
 (10, 0.8543340563774109),
 (9, 0.8264051675796509),
 (6, 0.8199347853660583),
 (15, 0.7996591329574585),
 (14, 0.7890172004699707),
 (12, 0.7884296178817749)]

In [12]:
def get_embedding_for_all_trajs(trajs, d2v_model=None):
    """Infer one embedding vector per trajectory.

    Parameters
    ----------
    trajs : sequence of list of str
        Tokenised trajectories. Must support ``len()``.
    d2v_model : object, optional
        Model exposing ``vector_size``, ``epochs`` and
        ``infer_vector(words, epochs=...)``. When omitted, the notebook-level
        ``model`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(trajs), vector_size)``; row ``k``
        holds the vector inferred for ``trajs[k]``.
    """
    if d2v_model is None:
        d2v_model = model  # fall back to the model loaded earlier in the notebook

    total = len(trajs)
    emb_trajs = np.zeros((total, d2v_model.vector_size), dtype=np.float32)

    last_shown = -1  # last whole percent that was printed
    for idx, traj in enumerate(trajs, start=1):
        emb_trajs[idx - 1] = d2v_model.infer_vector(traj, epochs=d2v_model.epochs)

        # Refresh the progress line at most once per whole percent (and always
        # on the last item) instead of once per trajectory; `end="\r"` makes
        # each refresh overwrite the previous one on the same line.
        percent_done = (idx / total) * 100
        if idx == total or int(percent_done) > last_shown:
            last_shown = int(percent_done)
            print(f"Progresso: {percent_done:.2f}% concluído", end="\r")

    return emb_trajs

In [13]:
# Split: query (even trajs) and dbsearch (odd queries + 99000 other odd trajs)
query = trajs_teste[:1000] # query trajs (even)
dbsearch = trajs_teste[1000:101000] # dbsearch trajs (the first 1000 are the odd queries)

In [14]:
# Sanity check of the split sizes.
print(len(query))
print(len(dbsearch))

1000
100000


In [15]:
# Compute the embeddings of the query trajectories.
# NOTE: this rebinds `query` from the token lists to the embedding array, so
# the cell is not idempotent: re-run the split cell before re-running it.
query = get_embedding_for_all_trajs(query)

Progresso: 100.00% concluído

In [16]:
# Compute the embeddings of the dbsearch trajectories.
# NOTE: this rebinds `dbsearch` from the token lists to the embedding array, so
# the cell is not idempotent: re-run the split cell before re-running it.
dbsearch = get_embedding_for_all_trajs(dbsearch)

Progresso: 100.00% concluído

In [17]:
def rank(t_query, i, db_search):
    """Return the 1-based rank of row ``i`` of ``db_search`` for ``t_query``.

    All rows of ``db_search`` are ordered by increasing cosine distance to
    ``t_query``; the result is the position of row ``i`` in that ordering,
    where 1 means it is the closest row.
    """
    query_row = t_query.reshape(1, -1)  # single query as a one-row matrix
    distances = cosine_distances(query_row, db_search).ravel()

    # argsort of the argsort: positions[k] is the 0-based place that row k
    # occupies once the rows are sorted by distance.
    positions = distances.argsort().argsort()

    return positions[i] + 1  # shift from 0-based to 1-based ranking

In [18]:
def acc(ranks):
    """Fraction of ranks equal to 1, rounded to two decimal places."""
    hits = sum(1 for r in ranks if r == 1)
    return round(hits / len(ranks), 2)

In [19]:
def mr(ranks):
    """Mean rank: arithmetic mean of ``ranks`` (not rounded)."""
    total = 0
    for r in ranks:
        total += r
    return total / len(ranks)

In [20]:
def mrr(ranks):
    """Mean reciprocal rank: mean of ``1 / rank``, rounded to two decimals."""
    reciprocal_sum = sum(1 / r for r in ranks)
    return round(reciprocal_sum / len(ranks), 2)

In [21]:
# Confidence interval of the ranks
def cip_r(ranks):
    """Return the 95% Student-t confidence interval for the mean rank.

    Parameters
    ----------
    ranks : sequence of numbers
        Ranks obtained for the queries. At least two values are needed for
        the standard error to be defined.

    Returns
    -------
    tuple of float
        ``(lower, upper)`` bounds rounded to three decimal places. The
        interval is symmetric around the mean, so for strongly skewed ranks
        the lower bound can fall below 1.
    """
    data = list(ranks)

    # The confidence level is passed positionally: SciPy 1.9 renamed the
    # keyword from `alpha` to `confidence`, and the old keyword was later
    # removed, so the positional form is the one that works on every version.
    ic = st.t.interval(0.95, df=len(data) - 1, loc=np.mean(data), scale=st.sem(data))

    # Convert to built-in floats so the printed tuple does not depend on the
    # NumPy scalar repr, then round to 3 decimal places.
    return tuple(round(float(valor), 3) for valor in ic)

In [22]:
# Confidence interval of the reciprocal ranks
def cip_rr(ranks):
    """Return the 95% Student-t confidence interval for the mean reciprocal rank.

    Parameters
    ----------
    ranks : sequence of numbers
        Non-zero ranks obtained for the queries; each one is converted to
        ``1 / rank``. At least two values are needed for the standard error
        to be defined.

    Returns
    -------
    tuple of float
        ``(lower, upper)`` bounds rounded to three decimal places.
    """
    data = [1 / r for r in ranks]  # reciprocal ranks

    # The confidence level is passed positionally: SciPy 1.9 renamed the
    # keyword from `alpha` to `confidence`, and the old keyword was later
    # removed, so the positional form is the one that works on every version.
    ic = st.t.interval(0.95, df=len(data) - 1, loc=np.mean(data), scale=st.sem(data))

    # Convert to built-in floats so the printed tuple does not depend on the
    # NumPy scalar repr, then round to 3 decimal places.
    return tuple(round(float(valor), 3) for valor in ic)

In [23]:
%%time
# Retrieval evaluation for d2v_w5_s256_ep30_dm0.model
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 10.25(7.939, 12.561), Acc: 0.55, MRR: 0.66(0.636, 0.685) with dbsearch size: 20000
Mean rank: 19.379(14.797, 23.961), Acc: 0.47, MRR: 0.58(0.556, 0.608) with dbsearch size: 40000
Mean rank: 28.375(21.597, 35.153), Acc: 0.43, MRR: 0.54(0.517, 0.57) with dbsearch size: 60000
Mean rank: 38.304(29.055, 47.553), Acc: 0.4, MRR: 0.51(0.485, 0.538) with dbsearch size: 80000
Mean rank: 47.626(36.082, 59.17), Acc: 0.39, MRR: 0.49(0.464, 0.517) with dbsearch size: 100000
CPU times: user 15min 34s, sys: 38min 25s, total: 54min
Wall time: 13min 39s


In [23]:
%%time
# Retrieval evaluation labelled d2v_w5_s128_ep30_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 3.052(2.369, 3.735), Acc: 0.75, MRR: 0.82(0.805, 0.844) with dbsearch size: 20000
Mean rank: 5.069(3.743, 6.395), Acc: 0.68, MRR: 0.77(0.748, 0.792) with dbsearch size: 40000
Mean rank: 7.047(5.089, 9.005), Acc: 0.64, MRR: 0.73(0.711, 0.758) with dbsearch size: 60000
Mean rank: 9.072(6.54, 11.604), Acc: 0.61, MRR: 0.71(0.681, 0.729) with dbsearch size: 80000
Mean rank: 10.982(7.884, 14.08), Acc: 0.58, MRR: 0.68(0.659, 0.708) with dbsearch size: 100000
CPU times: user 12min 39s, sys: 34min 5s, total: 46min 44s
Wall time: 11min 47s


In [27]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep1_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 3185.365(2894.678, 3476.052), Acc: 0.04, MRR: 0.07(0.054, 0.079) with dbsearch size: 20000
Mean rank: 6368.482(5787.291, 6949.673), Acc: 0.02, MRR: 0.05(0.037, 0.057) with dbsearch size: 40000
Mean rank: 9559.959(8687.629, 10432.289), Acc: 0.02, MRR: 0.04(0.028, 0.046) with dbsearch size: 60000
Mean rank: 12782.632(11617.801, 13947.463), Acc: 0.02, MRR: 0.03(0.024, 0.041) with dbsearch size: 80000
Mean rank: 16021.086(14563.174, 17478.998), Acc: 0.01, MRR: 0.03(0.021, 0.037) with dbsearch size: 100000
CPU times: user 10min 44s, sys: 30min 30s, total: 41min 15s
Wall time: 10min 22s


In [50]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep5_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 5.47(2.428, 8.512), Acc: 0.71, MRR: 0.79(0.771, 0.813) with dbsearch size: 20000
Mean rank: 9.482(3.527, 15.437), Acc: 0.63, MRR: 0.73(0.704, 0.751) with dbsearch size: 40000
Mean rank: 13.475(4.661, 22.289), Acc: 0.58, MRR: 0.69(0.664, 0.712) with dbsearch size: 60000
Mean rank: 18.213(6.451, 29.975), Acc: 0.55, MRR: 0.66(0.63, 0.68) with dbsearch size: 80000
Mean rank: 22.57(7.938, 37.202), Acc: 0.53, MRR: 0.63(0.607, 0.657) with dbsearch size: 100000
CPU times: user 10min 38s, sys: 30min 21s, total: 41min
Wall time: 10min 19s


In [66]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep10_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 11.079(-4.652, 26.81), Acc: 0.81, MRR: 0.87(0.85, 0.885) with dbsearch size: 20000
Mean rank: 21.008(-10.157, 52.173), Acc: 0.75, MRR: 0.83(0.806, 0.845) with dbsearch size: 40000
Mean rank: 30.78(-15.743, 77.303), Acc: 0.71, MRR: 0.8(0.777, 0.818) with dbsearch size: 60000
Mean rank: 40.338(-20.812, 101.488), Acc: 0.69, MRR: 0.77(0.751, 0.795) with dbsearch size: 80000
Mean rank: 49.359(-25.52, 124.238), Acc: 0.67, MRR: 0.75(0.732, 0.777) with dbsearch size: 100000
CPU times: user 10min 46s, sys: 30min 32s, total: 41min 19s
Wall time: 10min 23s


In [82]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep20_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.837(1.042, 2.632), Acc: 0.86, MRR: 0.91(0.898, 0.926) with dbsearch size: 20000
Mean rank: 2.644(0.973, 4.315), Acc: 0.82, MRR: 0.88(0.866, 0.898) with dbsearch size: 40000
Mean rank: 3.436(0.886, 5.986), Acc: 0.79, MRR: 0.86(0.841, 0.876) with dbsearch size: 60000
Mean rank: 4.289(0.955, 7.623), Acc: 0.76, MRR: 0.83(0.814, 0.852) with dbsearch size: 80000
Mean rank: 5.076(0.969, 9.183), Acc: 0.74, MRR: 0.82(0.797, 0.837) with dbsearch size: 100000
CPU times: user 10min 44s, sys: 30min 29s, total: 41min 13s
Wall time: 10min 22s


In [98]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep30_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.441(1.234, 1.648), Acc: 0.88, MRR: 0.92(0.907, 0.934) with dbsearch size: 20000
Mean rank: 1.805(1.464, 2.146), Acc: 0.82, MRR: 0.88(0.867, 0.899) with dbsearch size: 40000
Mean rank: 2.164(1.693, 2.635), Acc: 0.8, MRR: 0.86(0.845, 0.88) with dbsearch size: 60000
Mean rank: 2.623(1.935, 3.311), Acc: 0.77, MRR: 0.84(0.821, 0.859) with dbsearch size: 80000
Mean rank: 2.996(2.128, 3.864), Acc: 0.75, MRR: 0.82(0.805, 0.844) with dbsearch size: 100000
CPU times: user 10min 37s, sys: 30min 8s, total: 40min 46s
Wall time: 10min 15s


In [114]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep40_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.447(1.282, 1.612), Acc: 0.88, MRR: 0.92(0.908, 0.935) with dbsearch size: 20000
Mean rank: 1.882(1.565, 2.199), Acc: 0.82, MRR: 0.88(0.869, 0.901) with dbsearch size: 40000
Mean rank: 2.287(1.847, 2.727), Acc: 0.8, MRR: 0.86(0.845, 0.88) with dbsearch size: 60000
Mean rank: 2.796(2.181, 3.411), Acc: 0.77, MRR: 0.84(0.82, 0.858) with dbsearch size: 80000
Mean rank: 3.188(2.436, 3.94), Acc: 0.75, MRR: 0.83(0.806, 0.845) with dbsearch size: 100000
CPU times: user 10min 43s, sys: 30min 18s, total: 41min 2s
Wall time: 10min 19s


In [130]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep50_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 1.389(1.286, 1.492), Acc: 0.86, MRR: 0.91(0.898, 0.926) with dbsearch size: 20000
Mean rank: 1.739(1.551, 1.927), Acc: 0.82, MRR: 0.88(0.863, 0.896) with dbsearch size: 40000
Mean rank: 2.061(1.796, 2.326), Acc: 0.79, MRR: 0.85(0.837, 0.873) with dbsearch size: 60000
Mean rank: 2.478(2.117, 2.839), Acc: 0.75, MRR: 0.83(0.81, 0.848) with dbsearch size: 80000
Mean rank: 2.828(2.377, 3.279), Acc: 0.73, MRR: 0.81(0.791, 0.831) with dbsearch size: 100000
CPU times: user 10min 44s, sys: 30min 29s, total: 41min 13s
Wall time: 10min 22s


In [146]:
%%time
# Retrieval evaluation labelled d2v_w5_s64_ep100_dm0.model
# NOTE(review): the only model-loading cell visible in this notebook loads
# d2v_w5_s256_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran -- confirm.
dbsizes = [20000, 40000, 60000, 80000, 100000]
for dbsize in dbsizes:
    search = dbsearch[:dbsize]  # keep only the first `dbsize` database embeddings
    # Rank obtained for row i of the database when searching with query i.
    ranks = [rank(query_vec, i, search) for i, query_vec in enumerate(query)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(mr(ranks), cip_r(ranks), acc(ranks), mrr(ranks), cip_rr(ranks), dbsize))

Mean rank: 2.347(1.58, 3.114), Acc: 0.84, MRR: 0.89(0.877, 0.908) with dbsearch size: 20000
Mean rank: 3.633(2.074, 5.192), Acc: 0.78, MRR: 0.85(0.834, 0.87) with dbsearch size: 40000
Mean rank: 4.901(2.556, 7.246), Acc: 0.75, MRR: 0.83(0.808, 0.846) with dbsearch size: 60000
Mean rank: 6.234(3.159, 9.309), Acc: 0.72, MRR: 0.8(0.781, 0.822) with dbsearch size: 80000
Mean rank: 7.417(3.64, 11.194), Acc: 0.7, MRR: 0.79(0.764, 0.806) with dbsearch size: 100000
CPU times: user 10min 46s, sys: 30min 26s, total: 41min 12s
Wall time: 10min 22s
