In [2]:
import faiss
import json
import mlflow
import os
import torch
import numpy as np
from src.data.text_retriever import TextRetriever
from src.models.knrm_model import KNRM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Project root is taken to be the parent of the current working directory.
PARENT_DIR = os.path.abspath(os.pardir)
DOCUMENTS_PATH = PARENT_DIR + '/data/processed/documents.json'
ML_RUNS_PATH = PARENT_DIR + '/models/ml_runs/'

# Load the {document id: document text} mapping.
with open(DOCUMENTS_PATH) as documents_file:
    documents = json.load(documents_file)

# Point MLflow at the local file store unless it is already configured that way.
tracking_uri = 'file:///' + ML_RUNS_PATH
if mlflow.get_tracking_uri() != tracking_uri:
    mlflow.set_tracking_uri(tracking_uri)

experiment = mlflow.get_experiment_by_name('QuoraRankingExtendedTraining')
EXP_ID = experiment.experiment_id
# Use the first row returned by search_runs for this experiment.
# NOTE(review): this relies on search_runs' default ordering -- confirm it selects the intended run.
RUN_ID = mlflow.search_runs(experiment_ids=[EXP_ID])['run_id'][0]

# Load the trained KNRM model and its vocabulary from the artifacts of that run.
MODEL_URI = "runs:/{}/model".format(RUN_ID)
VOCAB_URI = "runs:/{}/vocab".format(RUN_ID)
knrm = mlflow.pytorch.load_model(MODEL_URI)
vocab = mlflow.artifacts.load_dict(VOCAB_URI)

In [4]:
# Sanity check: number of documents loaded from DOCUMENTS_PATH.
len(documents)

537916

In [5]:
# Split the mapping into two parallel lists: integer ids and the matching texts.
# Dict keys and values iterate in the same order, so positions line up.
idxs = [int(doc_key) for doc_key in documents]
docs = list(documents.values())

In [6]:
# Vocabulary id used for every token that is missing from the vocabulary.
oov_val = vocab['OOV']
tr = TextRetriever()
# Embedding matrix of the loaded KNRM model; rows are addressed by vocabulary id.
emb_layer = knrm.embeddings.state_dict()['weight']

# Represent every document by the mean of its token embeddings.
doc_vectors = []
for doc_text in docs:
    token_ids = [vocab.get(w, oov_val) for w in tr.lower_and_tokenize_words(doc_text)]
    if not token_ids:
        # A mean over zero rows is NaN, which would put an unusable vector into
        # the FAISS index; treat a token-less document as a single OOV token.
        token_ids = [oov_val]
    # .cpu() keeps the conversion working even if the weights live on an accelerator.
    doc_vectors.append(emb_layer[token_ids].mean(dim=0).cpu().numpy())

# One row per document, in the same order as `docs` / `idxs`.
embeddings = np.array(doc_vectors, dtype=np.float32)

In [7]:
# Embedding dimensionality; the FAISS index is created with this size.
embeddings.shape[1]

50

In [60]:
# Flat L2 index over the document embeddings, wrapped in an IndexIDMap so that
# search results carry the original document ids instead of row positions.
flat_index = faiss.IndexFlatL2(embeddings.shape[1])
index = faiss.IndexIDMap(flat_index)
# FAISS ids are 64-bit integers. The default integer dtype of np.array(...) can
# be platform dependent, so request int64 explicitly.
index.add_with_ids(embeddings, np.asarray(idxs, dtype=np.int64))

In [10]:
def get_memory(index):
    """Return the size in bytes of ``index`` when serialized with ``faiss.write_index``.

    The index is written into a private temporary directory, so the call
    neither overwrites an existing ``./temp.index`` in the working directory
    nor leaves a file behind when writing or measuring fails.

    Args:
        index: a FAISS index accepted by ``faiss.write_index``.

    Returns:
        int: size of the serialized index file in bytes.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, 'temp.index')
        faiss.write_index(index, tmp_path)
        # The directory and the file are removed when the ``with`` block exits,
        # on both the success and the error path.
        return os.path.getsize(tmp_path)

# Size of the serialized index in bytes.
print(get_memory(index))

111886618


In [62]:
query = 'What doctor should I visit if I broke my leg?'

# Embed the query as the mean of its token embeddings (unknown tokens map to
# the OOV id), shaped as a single-row float32 matrix for the FAISS search.
q_vector = [vocab.get(token, oov_val) for token in tr.lower_and_tokenize_words(query)]
q_mean = emb_layer[q_vector].mean(dim=0)
q_emb = np.array(q_mean.reshape(1, -1)).astype(np.float32)

In [53]:
# Fetch the ids of the 100 nearest documents for the query; the first return
# value (presumably the distances) is discarded. I[0] holds the ids for the single query row.
_, I = index.search(q_emb, k=100)

In [54]:
def text_to_token_ids(text_list, vocab, max_len=30, tokenize=None):
    """Convert texts into a fixed-width tensor of vocabulary ids.

    Each text is tokenized, truncated to ``max_len`` tokens, mapped to ids
    (tokens missing from ``vocab`` get ``vocab["OOV"]``) and right-padded with
    id 0 up to ``max_len``.

    Args:
        text_list: iterable of strings to encode.
        vocab: mapping token -> id; must contain the key ``"OOV"``.
        max_len: number of ids per row (default 30, the previous fixed width).
        tokenize: callable turning a string into a list of tokens. Defaults to
            the notebook-level ``tr.lower_and_tokenize_words``.

    Returns:
        torch.LongTensor of shape ``(len(text_list), max_len)``. An empty
        ``text_list`` yields shape ``(0, max_len)`` rather than ``(0,)``.
    """
    if tokenize is None:
        tokenize = tr.lower_and_tokenize_words
    oov_id = vocab["OOV"]  # looked up once instead of once per token

    rows = []
    for text in text_list:
        token_ids = [vocab.get(tok, oov_id) for tok in tokenize(text)[:max_len]]
        # NOTE(review): id 0 is presumably the padding index -- confirm against the vocab.
        rows.append(token_ids + [0] * (max_len - len(token_ids)))

    # The explicit reshape keeps the result 2-D even when there are no rows.
    return torch.LongTensor(rows).reshape(len(rows), max_len)

In [55]:
# Build (doc_id, text) pairs for the retrieved ids, skipping the -1 entries.
cands = [
    (str(doc_id), documents[str(doc_id)])
    for doc_id in I[0]
    if doc_id != -1
]

# Score every candidate against the query with KNRM: the query is repeated
# once per candidate so both inputs have the same number of rows.
inputs = {
    'query': text_to_token_ids([query] * len(cands), vocab),
    'document': text_to_token_ids([doc_text for _doc_id, doc_text in cands], vocab),
}
scores = knrm.predict(inputs)

In [56]:
# Positions of the 10 highest-scoring candidates, best first.
res_ids = scores.reshape(-1).argsort(descending=True)[:10]
# Map those positions back to their (doc_id, text) pairs.
res = list(map(cands.__getitem__, res_ids.tolist()))

In [57]:
# Show the query next to its top-ranked (doc_id, text) pairs.
# NOTE(review): the results displayed below are relationship questions unrelated
# to the medical query -- retrieval / re-ranking quality needs investigation.
query, res

('What doctor should I visit if I broke my leg?',
 [('295074',
   'My crush said he likes me, but he also likes this other girl. He knows I like him. I told him if he chooses her its okay. What should I do?'),
  ('475642',
   "I broke up with my boyfriend of 3 years because he never had time for me and never called me. When I would call, he would cut my call saying he was very busy even if he wasn't. My mother also hates him. I still like him a lot. Did I do the right thing?"),
  ('150738',
   'I ask for a second chance from my girlfriend she rejected me, it is still possible to ask again? How long should I wait until I can ask her again?'),
  ('326864',
   'What should I do when I told my boyfriend to let me know if he wants to grab dinner later on and he went with his friend instead?'),
  ('246895',
   'We broke up like a year ago because she cheated on me, I am going to meet her in a week. I still love her. What should I do?'),
  ('535839',
   'My best friend cried when I told her h