In [1]:
import gensim
import numpy as np
import pandas as pd
import scipy.stats as st

from sklearn.metrics.pairwise import cosine_distances
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Preview the first 20 lines of the test-trajectory file.
!head -n 20 '../data/exp1-trj.t'

3
3
3
3
3
3
3
3
3
3
9926 3189 88 7163 4076 13490 19059 13124 7310 5479
9042 3107 4616 5742 4717 6339 15188 18899 14919 7274 11265 1081 7609 1351 393
427 5334 8388 8967 4445 2056
542 4013 17293 3322 11357 483 11138
481 8071 657 2231 1769 173 1579 6198
854 3107 5989 8778 6218 7338
5773 8574 13056 11926 15551 6461
4134 3764 9341 15412 13024 9086
962 9873 469 373 6096 567 8620
14405 4076 5305 3107 6807 1155


In [3]:
# Load the test trajectories: one trajectory per line, cells separated by
# whitespace and kept as strings.
with open('../data/exp1-trj.t', 'r') as file:
    trajs_teste = list()

    for line in file:
        # split() with no separator already discards leading/trailing whitespace
        traj = line.split()
        trajs_teste += [traj]

In [4]:
# Report how many test trajectories were read from the file.
print("Quantidade de trajetórias de teste:", len(trajs_teste))

Quantidade de trajetórias de teste: 50500


In [5]:
print(trajs_teste[10]) # "even" query (trajectory at index 10)

['9926', '3189', '88', '7163', '4076', '13490', '19059', '13124', '7310', '5479']


In [6]:
print(trajs_teste[510]) # "target" of the even query above, i.e. the matching "odd" query

['9926', '8231', '92', '12594', '14968', '11048', '7821', '577']


In [7]:
# List the files available in the saved-models directory.
!ls '../data/saved_models/D2V/'

d2v_w5_s512_ep30_dm0.model
d2v_w5_s512_ep30_dm0.model.dv.vectors.npy
d2v_w5_s64_ep30_dm0.model
d2v_w5_s64_ep30_dm0.model.dv.vectors.npy


In [8]:
# Load the saved Doc2Vec model from disk.
model = Doc2Vec.load('../data/saved_models/D2V/d2v_w5_s512_ep30_dm0.model')

In [9]:
# Vocabulary size: number of keys in the model's word-vector index.
len(model.wv.key_to_index)

19690

In [10]:
# Size of the vectors produced by the loaded model (used below as the embedding width).
model.vector_size

512

In [11]:
# Fetch the trajectories most similar to the trajectory with id = 0
# (looked up among the document vectors stored in the model).
mosts = model.dv.most_similar(0)
mosts

[(13, 0.9870317578315735),
 (1131668, 0.9858295917510986),
 (1132234, 0.9857214093208313),
 (809, 0.9851027131080627),
 (1131656, 0.9838312864303589),
 (807, 0.9836079478263855),
 (1131552, 0.9835062623023987),
 (1131617, 0.9835059642791748),
 (1131627, 0.9834228754043579),
 (349, 0.983258843421936)]

In [12]:
def get_embedding_for_all_trajs(trajs):
    """Infer one embedding per trajectory using the notebook-level ``model``.

    Args:
        trajs: sized sequence of trajectories; each one is passed as-is to
            ``model.infer_vector``.

    Returns:
        ``np.float32`` array of shape ``(len(trajs), model.vector_size)`` whose
        row ``k`` holds the vector inferred for ``trajs[k]``.

    A progress percentage is printed while the embeddings are computed.
    """
    total = len(trajs)
    emb_trajs = np.zeros((total, model.vector_size), dtype=np.float32)

    for done, traj in enumerate(trajs, start=1):
        emb_trajs[done - 1] = model.infer_vector(traj, epochs=model.epochs)

        percent_done = (done / total) * 100
        # The trailing "\r" returns the cursor to the line start so every
        # update overwrites the previous one instead of adding a new line.
        print(f"Progresso: {percent_done:.2f}% concluído", end="\r")

    return emb_trajs

In [13]:
# Split the data: query (the "even" trajs) and dbsearch (the "odd" queries + 49500 other "odd" trajs)
query = trajs_teste[:500] # query trajs ("even")
dbsearch = trajs_teste[500:50500] # dbsearch trajs (the first 500 are the "odd" queries)

In [14]:
# Sanity check: sizes of the query set and of the search database.
print(len(query))
print(len(dbsearch))

500
50000


In [15]:
# Embed the query trajectories.
# The input is sliced from `trajs_teste` directly (the same rows the split cell
# assigns to `query`) instead of reading `query` itself: the original
# `query = f(query)` form overwrote its own input, so running the cell a second
# time would feed float embeddings back into the model.
query = get_embedding_for_all_trajs(trajs_teste[:500])

Progresso: 100.00% concluído

In [16]:
# Embed the search-database trajectories.
# The input is sliced from `trajs_teste` directly (the same rows the split cell
# assigns to `dbsearch`) instead of reading `dbsearch` itself: the original
# `dbsearch = f(dbsearch)` form overwrote its own input, so running the cell a
# second time would feed float embeddings back into the model.
dbsearch = get_embedding_for_all_trajs(trajs_teste[500:50500])

Progresso: 100.00% concluído

In [17]:
def rank(t_query, i, db_search):
    """Return the 1-based rank of ``db_search[i]`` by cosine distance to ``t_query``.

    Args:
        t_query: 1-D embedding of the query trajectory.
        i: row index in ``db_search`` of the trajectory whose rank is wanted.
        db_search: 2-D array with one embedding per row.

    Returns:
        Position of row ``i`` when all rows are sorted by increasing cosine
        distance to the query; the closest row has rank 1.
    """
    query_row = t_query.reshape(1, -1)
    dists = cosine_distances(query_row, db_search).ravel()

    # argsort of the sort order maps each row to its 0-based sorted position.
    positions = dists.argsort().argsort()

    # +1 because argsort-based positions start at zero.
    return positions[i] + 1

In [18]:
def acc(ranks):
    """Fraction of entries in ``ranks`` equal to 1, rounded to 2 decimals."""
    hits = sum(1 for r in ranks if r == 1)
    return round(hits / len(ranks), 2)

In [19]:
def mr(ranks):
    """Arithmetic mean of ``ranks`` (not rounded)."""
    total = 0
    for r in ranks:
        total += r
    return total / len(ranks)

In [20]:
def mrr(ranks):
    """Mean of the reciprocal ranks ``1 / r``, rounded to 2 decimals."""
    reciprocal_total = sum(1 / r for r in ranks)
    return round(reciprocal_total / len(ranks), 2)

In [21]:
# Confidence interval of the ranks
def cip_r(ranks):
    """95% Student-t confidence interval for the mean of ``ranks``.

    Args:
        ranks: sequence of numeric ranks.

    Returns:
        ``(lower, upper)`` tuple with both bounds rounded to 3 decimals.
    """
    data = list(ranks)

    # The confidence level is passed positionally on purpose: the keyword was
    # renamed from `alpha` to `confidence` (deprecated in SciPy 1.9, removed
    # in later releases), so `alpha=0.95` raises TypeError on current SciPy
    # while the positional form works on both old and new versions.
    ic = st.t.interval(0.95, df=len(data) - 1, loc=np.mean(data), scale=st.sem(data))

    return tuple(round(valor, 3) for valor in ic)  # round to 3 decimal places

In [22]:
# Confidence interval of the reciprocal ranks
def cip_rr(ranks):
    """95% Student-t confidence interval for the mean reciprocal rank.

    Args:
        ranks: sequence of non-zero numeric ranks.

    Returns:
        ``(lower, upper)`` tuple for the mean of ``1 / rank``, with both
        bounds rounded to 3 decimals.
    """
    data = [1 / r for r in ranks]  # reciprocal ranks

    # The confidence level is passed positionally on purpose: the keyword was
    # renamed from `alpha` to `confidence` (deprecated in SciPy 1.9, removed
    # in later releases), so `alpha=0.95` raises TypeError on current SciPy
    # while the positional form works on both old and new versions.
    ic = st.t.interval(0.95, df=len(data) - 1, loc=np.mean(data), scale=st.sem(data))

    return tuple(round(valor, 3) for valor in ic)  # round to 3 decimal places

In [23]:
%%time
# Model under evaluation: d2v_w5_s512_ep30_dm0.model
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 115.628(88.274, 142.982), Acc: 0.47, MRR: 0.55(0.506, 0.584) with dbsearch size: 10000
Mean rank: 215.004(163.783, 266.225), Acc: 0.44, MRR: 0.52(0.476, 0.555) with dbsearch size: 20000
Mean rank: 301.128(229.488, 372.768), Acc: 0.42, MRR: 0.49(0.452, 0.531) with dbsearch size: 30000
Mean rank: 381.74(291.096, 472.384), Acc: 0.41, MRR: 0.48(0.439, 0.519) with dbsearch size: 40000
Mean rank: 498.948(380.017, 617.879), Acc: 0.41, MRR: 0.47(0.433, 0.513) with dbsearch size: 50000
CPU times: user 7min 33s, sys: 18min 48s, total: 26min 21s
Wall time: 6min 41s


In [23]:
%%time
# Model under evaluation: d2v_w5_s128_ep30_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 117.976(90.709, 145.243), Acc: 0.49, MRR: 0.56(0.518, 0.597) with dbsearch size: 10000
Mean rank: 220.76(169.203, 272.317), Acc: 0.46, MRR: 0.53(0.489, 0.568) with dbsearch size: 20000
Mean rank: 308.78(237.004, 380.556), Acc: 0.44, MRR: 0.51(0.466, 0.546) with dbsearch size: 30000
Mean rank: 390.98(300.267, 481.693), Acc: 0.43, MRR: 0.49(0.453, 0.533) with dbsearch size: 40000
Mean rank: 511.966(392.292, 631.64), Acc: 0.42, MRR: 0.48(0.443, 0.523) with dbsearch size: 50000
CPU times: user 5min 26s, sys: 15min 41s, total: 21min 7s
Wall time: 5min 19s


In [36]:
%%time
# Model under evaluation: d2v_w5_s64_ep1_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 2040.386(1831.495, 2249.277), Acc: 0.02, MRR: 0.04(0.03, 0.057) with dbsearch size: 10000
Mean rank: 4079.408(3658.796, 4500.02), Acc: 0.02, MRR: 0.03(0.022, 0.048) with dbsearch size: 20000
Mean rank: 6155.294(5518.514, 6792.074), Acc: 0.02, MRR: 0.03(0.019, 0.043) with dbsearch size: 30000
Mean rank: 8349.212(7488.715, 9209.709), Acc: 0.02, MRR: 0.03(0.019, 0.043) with dbsearch size: 40000
Mean rank: 10446.29(9371.695, 11520.885), Acc: 0.02, MRR: 0.03(0.018, 0.042) with dbsearch size: 50000
CPU times: user 4min 10s, sys: 12min 56s, total: 17min 7s
Wall time: 4min 39s


In [112]:
%%time
# Model under evaluation: d2v_w5_s64_ep5_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 431.646(287.203, 576.089), Acc: 0.42, MRR: 0.49(0.453, 0.532) with dbsearch size: 10000
Mean rank: 866.47(575.451, 1157.489), Acc: 0.37, MRR: 0.44(0.397, 0.475) with dbsearch size: 20000
Mean rank: 1302.85(863.755, 1741.945), Acc: 0.34, MRR: 0.4(0.366, 0.444) with dbsearch size: 30000
Mean rank: 1728.682(1143.171, 2314.193), Acc: 0.31, MRR: 0.38(0.343, 0.42) with dbsearch size: 40000
Mean rank: 2158.192(1428.54, 2887.844), Acc: 0.3, MRR: 0.37(0.33, 0.406) with dbsearch size: 50000
CPU times: user 3min 12s, sys: 9min 57s, total: 13min 10s
Wall time: 3min 40s


In [128]:
%%time
# Model under evaluation: d2v_w5_s64_ep10_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 174.19(98.7, 249.68), Acc: 0.54, MRR: 0.6(0.564, 0.643) with dbsearch size: 10000
Mean rank: 337.208(186.955, 487.461), Acc: 0.49, MRR: 0.56(0.521, 0.6) with dbsearch size: 20000
Mean rank: 492.096(267.08, 717.112), Acc: 0.46, MRR: 0.53(0.487, 0.566) with dbsearch size: 30000
Mean rank: 636.986(339.296, 934.676), Acc: 0.45, MRR: 0.51(0.473, 0.554) with dbsearch size: 40000
Mean rank: 811.684(438.229, 1185.139), Acc: 0.44, MRR: 0.5(0.464, 0.544) with dbsearch size: 50000
CPU times: user 3min 15s, sys: 10min 5s, total: 13min 21s
Wall time: 3min 43s


In [144]:
%%time
# Model under evaluation: d2v_w5_s64_ep20_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 115.096(84.5, 145.692), Acc: 0.51, MRR: 0.58(0.54, 0.618) with dbsearch size: 10000
Mean rank: 215.712(156.968, 274.456), Acc: 0.48, MRR: 0.55(0.506, 0.585) with dbsearch size: 20000
Mean rank: 303.812(220.572, 387.052), Acc: 0.46, MRR: 0.52(0.482, 0.563) with dbsearch size: 30000
Mean rank: 383.104(276.999, 489.209), Acc: 0.45, MRR: 0.51(0.469, 0.55) with dbsearch size: 40000
Mean rank: 502.048(364.02, 640.076), Acc: 0.44, MRR: 0.5(0.459, 0.539) with dbsearch size: 50000
CPU times: user 3min 11s, sys: 9min 55s, total: 13min 7s
Wall time: 3min 39s


In [160]:
%%time
# Model under evaluation: d2v_w5_s64_ep30_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 111.594(84.69, 138.498), Acc: 0.48, MRR: 0.56(0.518, 0.596) with dbsearch size: 10000
Mean rank: 207.702(156.982, 258.422), Acc: 0.45, MRR: 0.52(0.482, 0.561) with dbsearch size: 20000
Mean rank: 291.716(220.724, 362.708), Acc: 0.44, MRR: 0.5(0.462, 0.542) with dbsearch size: 30000
Mean rank: 370.068(280.25, 459.886), Acc: 0.42, MRR: 0.48(0.445, 0.524) with dbsearch size: 40000
Mean rank: 482.872(364.936, 600.808), Acc: 0.41, MRR: 0.48(0.438, 0.518) with dbsearch size: 50000
CPU times: user 3min 11s, sys: 9min 53s, total: 13min 4s
Wall time: 3min 38s


In [177]:
%%time
# Model under evaluation: d2v_w5_s64_ep40_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 121.794(92.553, 151.035), Acc: 0.49, MRR: 0.56(0.519, 0.598) with dbsearch size: 10000
Mean rank: 226.314(171.473, 281.155), Acc: 0.46, MRR: 0.53(0.487, 0.566) with dbsearch size: 20000
Mean rank: 319.188(241.883, 396.493), Acc: 0.44, MRR: 0.5(0.464, 0.544) with dbsearch size: 30000
Mean rank: 403.072(305.497, 500.647), Acc: 0.43, MRR: 0.49(0.45, 0.53) with dbsearch size: 40000
Mean rank: 525.43(397.503, 653.357), Acc: 0.42, MRR: 0.48(0.439, 0.519) with dbsearch size: 50000
CPU times: user 3min 11s, sys: 9min 51s, total: 13min 3s
Wall time: 3min 38s


In [193]:
%%time
# Model under evaluation: d2v_w5_s64_ep50_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 133.904(100.425, 167.383), Acc: 0.49, MRR: 0.56(0.519, 0.598) with dbsearch size: 10000
Mean rank: 249.138(185.772, 312.504), Acc: 0.46, MRR: 0.52(0.485, 0.565) with dbsearch size: 20000
Mean rank: 354.762(263.146, 446.378), Acc: 0.44, MRR: 0.5(0.463, 0.543) with dbsearch size: 30000
Mean rank: 451.52(333.93, 569.11), Acc: 0.41, MRR: 0.49(0.446, 0.526) with dbsearch size: 40000
Mean rank: 586.03(434.759, 737.301), Acc: 0.4, MRR: 0.48(0.437, 0.516) with dbsearch size: 50000
CPU times: user 3min 9s, sys: 9min 56s, total: 13min 6s
Wall time: 3min 39s


In [212]:
%%time
# Model under evaluation: d2v_w5_s64_ep100_dm0.model
# NOTE(review): the only load visible in this notebook is
# d2v_w5_s512_ep30_dm0.model; presumably `model`, `query` and `dbsearch` were
# rebuilt for the model named above before this cell ran - confirm.
# Prints mean rank, accuracy and MRR (each mean with its 95% CI) for growing
# prefixes of the search database.
dbsizes = list(range(10000, 50001, 10000))
for dbsize in dbsizes:
    search = dbsearch[:dbsize]
    ranks = []
    for i in range(len(query)):
        # rank of the i-th query's target inside the truncated database
        ranks += [rank(query[i], i, search)]
    print('Mean rank: {}{}, Acc: {}, MRR: {}{} with dbsearch size: {}'.format(
        mr(ranks), cip_r(ranks), acc(ranks),
        mrr(ranks), cip_rr(ranks), dbsize,
    ))

Mean rank: 157.178(119.262, 195.094), Acc: 0.44, MRR: 0.51(0.471, 0.551) with dbsearch size: 10000
Mean rank: 298.332(224.99, 371.674), Acc: 0.43, MRR: 0.49(0.45, 0.531) with dbsearch size: 20000
Mean rank: 429.38(323.277, 535.483), Acc: 0.42, MRR: 0.47(0.433, 0.514) with dbsearch size: 30000
Mean rank: 547.088(410.726, 683.45), Acc: 0.41, MRR: 0.46(0.422, 0.503) with dbsearch size: 40000
Mean rank: 702.808(529.153, 876.463), Acc: 0.4, MRR: 0.46(0.416, 0.496) with dbsearch size: 50000
CPU times: user 4min 25s, sys: 14min 7s, total: 18min 32s
Wall time: 4min 39s
