# Coordle Idea
1. Get query, e.g. "What is love?"
2. Tokenize query
3. Create a vector using word2vec of the tokens by summing them, or finding the average or whatever.
   lets call it the query vector. 
4. Find sentence vectors that are the closesest to the query vector.
5. Return the papers corresponding to said sentence vectors.

# Coordle Backend

In [1]:
import pandas as pd 
import numpy as np
from importlib import reload
import nltk
import spacy
import en_core_sci_lg # Biomedical word embeddings
from utils import clean_text, load_pickle, save_pickle
from nltk.corpus import stopwords as _stopwords
from tqdm import tqdm
from collections import deque
from copy import deepcopy
from collections.abc import Iterable
from typing import Union
from pprint import pprint
from itertools import chain

import os
import json
import re 
from string import punctuation as PUNCTUATION
from nltk.corpus import stopwords as _stopwords
from gensim.models import Word2Vec
from os.path import join as join_path
from gensim.models.callbacks import CallbackAny2Vec

In [2]:
df = pd.read_csv('data/cord-19-data.csv', nrows=16000)

In [3]:
class EpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.'''

    def __init__(self, output_dir: str, prefix: str, logs_filename: str):
        self.output_dir = output_dir
        self.prefix = prefix
        self.logs_filename = logs_filename
        self.epoch = 0

    def on_epoch_end(self, model):
        cum_loss = model.get_latest_training_loss()
        if self.epoch == 0:
            loss = cum_loss
        else:
            loss = cum_loss - self.loss_previous_step
        self.loss_previous_step = loss
        with open(join_path(self.output_dir, self.logs_filename), 'a+') as file:
            file.write(f'Epoch #{self.epoch}, loss: {loss}\n')
        
        output_path = join_path(self.output_dir, f'{self.prefix}_epoch_{self.epoch}.model')
        model.save(output_path)
        self.epoch += 1

class DocEpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.'''

    def __init__(self, output_dir: str, prefix: str, start_epoch: int = 1):
        self.output_dir = output_dir
        self.prefix = prefix
        self.epoch = start_epoch

    def on_epoch_end(self, model):        
        output_path = join_path(self.output_dir, f'{self.prefix}_epoch_{self.epoch}.model')
        model.save(output_path)
        self.epoch += 1    

In [4]:
# Load the last trained model
model = Word2Vec.load(join_path('models-word2vec', 'w2v_model_epoch_29.model'))
word_to_int = {word:i for i, word in enumerate(model.wv.index2word)}
int_to_word = np.array(model.wv.index2word)

In [6]:
import coordle_backend
reload(coordle_backend)
from coordle_backend import (CordDoc, Index, RecursiveDescentParser, 
                             AI_Index, DocWordCounts, WordToDoc)

DocWordCounts.objects.delete()
WordToDoc.objects.delete()

ai_index = AI_Index(model.wv.most_similar, n_similars=1)
ai_index.build_from_df(
    df.iloc[:128],
    'cord_uid',
    'title',
    'body_text', 
    verbose=True, 
    use_multiprocessing=True,
    workers=-1
)

Text cleaning initilized on 16 workers


Cleaning texts: 100%|██████████| 128/128 [00:03<00:00, 41.05it/s]
Adding to index: 100%|██████████| 128/128 [00:00<00:00, 451.74it/s]


In [7]:
import coordle_backend
reload(coordle_backend)
from coordle_backend import CordDoc, Index, RecursiveDescentParser, AI_Index

docs, scores, errmsgs = ai_index.search('fever')

n = 69
if errmsgs:
    print(errmsgs)
else:
    for doc, score in zip(docs, scores):
        print(f'{doc.uid}  {str(doc.title)[:70]:<70}  {score:.4f}')


0s6ort9f  Accuracy of parents in measuring body temperature with a tympanic ther  26.7016
fy4w7xz8  Association of HLA class I with severe acute respiratory syndrome coro  15.1699
1wswi7us  Relationship of SARS-CoV to other pathogenic RNA viruses explored by t  6.4734
i948aq4b  A simple and rapid approach for screening of SARS-coronavirus genotype  5.8192
ln8ddyuj  Persistence of lung inflammation and lung cytokines with high-resoluti  5.4613
jzj8q25c  Proteomics computational analyses suggest that the carboxyl terminal g  5.4006
mtmgur1u  Dynamic changes of serum SARS-Coronavirus IgG, pulmonary function and   3.5266
jh9e85c0  Molecular mechanisms of severe acute respiratory syndrome (SARS)        2.9878
04cuk2cn  Recombinant Tula hantavirus shows reduced fitness but is able to survi  2.9438
efrv5nvf  Reference gene selection for quantitative real-time PCR analysis in vi  2.8672
i4pmux28  Why can't I visit? The ethics of visitation restrictions – lessons lea  2.8202
zc491h8v  Peptide i

In [8]:
with open('textfile.txt', 'w+') as f:
    uid = 'qzm9wgde'
    
    print(f'dumping {uid}')
    f.write(df[df.cord_uid == uid].body_text.values[0].lower())

dumping qzm9wgde


In [None]:
raise ValueError

# Mongo

Try to insert to database using multiprocessing

In [10]:
from multiprocessing import Pool

In [29]:
temp = WordToDoc(word='giggolo') 
temp.uids.extend(['1','2','3'])
temp.uids

['1', '2', '3']

In [40]:
docmap = {word:[doc.uid for doc in docs] for key, docs in ai_index.docmap.items()}

In [37]:
for word, uids in ai_index.docmap.items():
    temp = WordToDoc(word='giggolo')
    struids = [uids]
    temp.uids.extend(list(uids))
    temp.save()
    break

temp.uids

ValidationError: ValidationError (WordToDoc:giggolo) (StringField only accepts string values 1.StringField only accepts string values: ['uids'])

In [15]:

# for word, uids in tqdm(ai_index.docmap.items(), desc='Adding to DB',
#                                total=len(ai_index.docmap), position=0):
#     if len(word) < 500:
#         temp = WordToDoc(word=word)
#         temp.uids.extend(list(uids))
#         temp.save()

Adding to DB:   0%|          | 0/21296 [00:00<?, ?it/s]


ValidationError: ValidationError (WordToDoc:10minute) (StringField only accepts string values 1.StringField only accepts string values: ['uids'])

<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />

In [None]:
import coordle_backend
reload(coordle_backend)
from coordle_backend import CordDoc, Index2, RecursiveDescentParser, AI_Index2
Index = Index2

queries = [
    'retarded!!!',
#     '(',
#     ')',
    'retarded (white AND (woman NOT man))',
    'retarded (white AND white) man',
#     'retarded OR white OR woman',
#     'retarded white AND woman',
#     'retarded OR white NOT woman',
#     'retarded (white NOT woman)',
#     'retarded (white NOT woman)',
#     'OR retarded AND white woman',
#     'retarded AND AND white NOT woman',
#     'retarded (white NOT woman) AND',
#     ')retarded ((white NOT woman) AND',
#     'retarded ((white NOT woman)',
#     'AND retarded)) ((white NOT woman) NOT',
]

rdp = RecursiveDescentParser(index.docmap)
for query in queries:
    errmsgs = []
    tokens = rdp.get_logical_querytokens(query)
    pass_ = rdp.assert_query(tokens, errmsgs)
    
#     print(tokens)
    print(rdp.parenthesis_handler(tokens))
#     print(errmsgs)
    print(pass_)
    print()
    