# Coordle Idea
1. Get query, e.g. "What is love?"
2. Tokenize query
3. Create a vector from the word2vec embeddings of the tokens, e.g. by summing or averaging them.
   Let's call it the query vector.
4. Find the sentence vectors that are closest to the query vector.
5. Return the papers corresponding to said sentence vectors.

# Coordle Backend

In [1]:
import pandas as pd 
import numpy as np
from importlib import reload
import nltk
import spacy
import en_core_sci_lg # Biomedical word embeddings
from utils import clean_text, load_pickle, save_pickle
from nltk.corpus import stopwords as _stopwords
from tqdm import tqdm
from collections import deque
from copy import deepcopy
from collections.abc import Iterable
from typing import Union
from pprint import pprint
from itertools import chain
import pymongo

import os
import json
import re 
from string import punctuation as PUNCTUATION
from nltk.corpus import stopwords as _stopwords
from gensim.models import Word2Vec
from os.path import join as join_path
from gensim.models.callbacks import CallbackAny2Vec

In [2]:
# Read the CSV at data/cord-19-data.csv into a DataFrame. Other cells in this
# notebook access its `cord_uid`, `title` and `body_text` columns.
df = pd.read_csv('data/cord-19-data.csv')

In [3]:
class EpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.

    NOTE(review): the class body is empty in this notebook, so no saving
    logic is defined here. Presumably it is only a placeholder so that
    ``Word2Vec.load`` can resolve a callback class stored with the model
    -- TODO confirm.
    '''
class DocEpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.

    NOTE(review): the class body is empty in this notebook, so no saving
    logic is defined here. Presumably a placeholder for a callback used
    when training a document-level model -- TODO confirm what the ``Doc``
    prefix refers to and whether this stub is still needed.
    '''

In [4]:
# Restore the most recently trained Word2Vec checkpoint (file name says epoch 29).
model = Word2Vec.load(join_path('models-word2vec', 'w2v_model_epoch_29.model'))

# Two views of the vocabulary list `model.wv.index2word`:
#   int_to_word[i] -> word stored at position i
#   word_to_int[w] -> position of word w in that list
int_to_word = np.array(model.wv.index2word)
word_to_int = dict(zip(model.wv.index2word, range(len(model.wv.index2word))))

In [5]:
# Re-execute coordle_backend so the names imported below come from its
# current source rather than from the copy cached at first import.
import coordle_backend
reload(coordle_backend)
from coordle_backend import AI_Index, CordDoc, Index, RecursiveDescentParser

# Build an index over the first 16 rows of `df` only.
# NOTE(review): `workers=-1` presumably means "use every available core"
# (the recorded output reports 16 workers) -- TODO confirm in coordle_backend.
ai_index = AI_Index(model.wv.most_similar, n_similars=1)
ai_index.build_from_df(df[:16], 'cord_uid', 'title', 'body_text',
                       verbose=True, use_multiprocessing=True, workers=-1)

Text cleaning initilized on 16 workers


Cleaning texts: 100%|██████████| 16/16 [00:00<00:00, 101.59it/s]
Adding to index: 100%|██████████| 16/16 [00:00<00:00, 322.17it/s]


In [6]:
import coordle_backend
reload(coordle_backend)
from coordle_backend import AI_Index, CordDoc, Index, RecursiveDescentParser

# `search` hands back three values: the hits, their scores, and error messages.
docs, scores, errmsgs = ai_index.search('virus')

n = 69  # NOTE(review): assigned but not read anywhere in the visible notebook
if not errmsgs:
    # One row per hit: uid, title truncated/padded to 70 characters, score.
    for doc, score in zip(docs, scores):
        print(f'{doc.uid}  {str(doc.title)[:70]:<70}  {score:.4f}')
else:
    print(errmsgs)


1wswi7us  Relationship of SARS-CoV to other pathogenic RNA viruses explored by t  39.6302
yy96yeu9  Viral Discovery and Sequence Recovery Using DNA Microarrays             15.6958
xqhn0vbp  Airborne rhinovirus detection and effect of ultraviolet irradiation on  10.8963
qj4dh6rg  Cloaked similarity between HIV-1 and SARS-CoV suggests an anti-SARS st  8.0116
5s6acr7m  The Virus That Changed My World                                         7.2854
le0ogx1s  A new recruit for the army of the men of death                          7.0304
0qaoam29  A double epidemic model for the SARS propagation                        5.1278
gi6uaa83  Discovering human history from stomach bacteria                         2.6177
ng4rrdte  Pro/con clinical debate: Steroids are a key component in the treatment  1.9475
fy4w7xz8  Association of HLA class I with severe acute respiratory syndrome coro  0.9052
1769ovyk  8th Annual Toronto Critical Care Medicine Symposium, 30 October–1 Nove  0.4316
kuybfc1y  Descript

In [7]:
# Inspect what the index's `docmap` stores under the key 'grace'.
ai_index.docmap['grace']

{1769ovyk}

In [8]:
# Dump the lower-cased body text of one paper to a scratch file for inspection.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the text
# cannot raise UnicodeEncodeError under a non-UTF-8 platform default, and the
# file is opened write-only ('w') because nothing is read back from it.
with open('textfile.txt', 'w', encoding='utf-8') as f:
    uid = 'qzm9wgde'

    print(f'dumping {uid}')
    # `.values[0]` takes the first row whose cord_uid matches; it raises
    # IndexError if no row matches.
    f.write(df[df.cord_uid == uid].body_text.values[0].lower())

dumping qzm9wgde


# Mongo

In [19]:
# Re-execute coordle_backend so the names imported below come from its current source.
import coordle_backend
reload(coordle_backend)
from coordle_backend import (CordDoc, Index, RecursiveDescentParser, 
                             AI_Index)

# NOTE(review): the second positional argument is presumably `n_similars`
# (an earlier cell passes `n_similars=1` by keyword) -- TODO confirm.
ram_index = AI_Index(model.wv.most_similar, 1)
# NOTE(review): the build call below is commented out, so as shown `ram_index`
# is created but never populated from `df` in this notebook.
# ram_index.build_from_df(
#     df,
#     'cord_uid',
#     'title',
#     'body_text', 
#     verbose=True, 
#     use_multiprocessing=True,
#     workers=-1
# )

In [21]:
# Import (and re-execute) the coordle_mongobackend module under the alias `cm`.
import coordle_mongobackend as cm
reload(cm)

# NOTE(review): 'coordle' is presumably the MongoDB database name and the
# trailing 1 the number of similar words -- TODO confirm against cm.AI_Index.
mongoindex = cm.AI_Index('coordle', model.wv.most_similar, 1)
# The build step is commented out; presumably the index was populated in an
# earlier run -- TODO confirm.
# mongoindex.build_from_df(df, 'cord_uid', 'title', 'body_text', 
#                          use_multiprocessing=True, workers=-1, verbose=True)

In [11]:
# NOTE(review): the only visible build call for `ram_index` is commented out,
# so this query runs against an index that was never populated here.
docs, scores, errmsgs = ram_index.search('white retarded AND woman')

n = 69  # NOTE(review): assigned but not read anywhere in the visible notebook
if not errmsgs:
    # One row per hit: uid followed by the score with four decimals.
    for doc, score in zip(docs, scores):
        print(f'{doc.uid}   {score:.4f}')
else:
    print(errmsgs)

In [32]:
# Count the entries in `mongoindex.wordcounts`; the empty filter `{}` selects
# everything (assuming `wordcounts` is a pymongo collection -- TODO confirm).
mongoindex.wordcounts.count_documents({})

36565

In [22]:
docs, scores, errmsgs = mongoindex.search('white AND retarded AND woman')

n = 69  # NOTE(review): assigned but not read anywhere in the visible notebook
if not errmsgs:
    # Each hit is printed as-is (not via `.uid`), followed by its score.
    # NOTE(review): the recorded output shows `nan` for every score -- worth
    # checking the scoring in coordle_mongobackend.
    for doc, score in zip(docs, scores):
        print(f'{doc}   {score:.4f}')
else:
    print(errmsgs)

5496
1388
278
503
3891
598
8a348729   nan
cd7adns8   nan
rv2hrsbo   nan
hau0cshe   nan
qvn33l36   nan
6q4yhekq   nan
a6cepu5h   nan
xihpfidg   nan
cqqumvsb   nan
zp9k1k3z   nan
l0kc731z   nan
vipx6t7e   nan
yetd2u2a   nan
a714injz   nan
vtlf65vq   nan
tj3ye1mx   nan
cgcvfftf   nan
p8no6rc9   nan
zotfbuwu   nan
cvj9zn0w   nan
alm3p31f   nan
ru7mvfc0   nan
jmifk1q0   nan
rz4r5sj7   nan
cvqt35ao   nan
hxj4z228   nan
m1cuuehi   nan
rm8d8fyj   nan
26su14qs   nan
71b0nyti   nan
r5mmsnbx   nan
pd3vu5y6   nan
fed9xg86   nan
39vjafky   nan
8t3rptw0   nan
cqlt5mq2   nan
cldrzet3   nan
soahaqup   nan
epchoupz   nan
49oti4zg   nan
r73datur   nan
0cfafydb   nan
lut7vovl   nan
lkyvok5t   nan
vymlfsdn   nan
oofrmpw5   nan
ehq9qnoo   nan
n8mlxe7p   nan
3s4jrkuo   nan
0smnl70i   nan
rhyrhh01   nan
sdz6d1r5   nan
4kkjlyky   nan
eh21tdhp   nan
0am4l5ms   nan
1ycn9xwc   nan
8xmi0sd4   nan
fro63b1z   nan
1oudyt9s   nan
2u6daypm   nan
ihat3yy8   nan
rrverrsj   nan
jux2xc6i   nan
acwkh6ed   nan
mludwtgc   na

In [13]:
# For every docmap key shorter than 100 characters, collect the uids of the
# documents stored under it.
wordmap = {
    token: [entry.uid for entry in token_docs]
    for token, token_docs in ai_index.docmap.items()
    if len(token) < 100
}

In [14]:
# Scratch check of which labels value_counts reports when sorting by count is
# turned off. The top-level `pd.value_counts` helper is deprecated, so the
# equivalent Series method is used; `dtype=object` keeps the index values an
# object array, as the top-level helper produced for a list of strings.
A = pd.Series(['a','a','a','d','c','c','c','c','d'], dtype=object).value_counts(sort=False)
A.index.values

array(['a', 'd', 'c'], dtype=object)

In [15]:
# Scratch check: a two-slot list holding a list and a counter can have both
# slots updated in place.
A = [[], 0]
A[0].extend([1])   # grow the inner list by one element
A[1] = A[1] + 1    # bump the counter
A

[[1], 1]

<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<center><h2> Junkyard </h2></center>

In [16]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here,
# before the scratch cells below -- TODO confirm.
raise ValueError

ValueError: 

Creating sets from dicts is faster than from lists

In [None]:
from time import perf_counter
from matplotlib import pyplot as plt 
import numpy as np

# Micro-benchmark comparing, for growing input sizes j:
#   list variant: build a set from a list of j ints, then union it with B
#   dict variant: union the key view of a j-entry dict with B directly
# Intervals are measured with perf_counter, the high-resolution monotonic
# timer meant for short durations; a coarse wall clock can report 0 for a
# short interval, which would make the ratio below divide by zero.
dict_time = []
list_time = []
ratios = []

xrange = np.arange(1000,1000000,1000)
B = set(range(100000))

for j in xrange:
    a = list(range(j))

    t0 = perf_counter()
    A = set(a)
    A | B
    t1_list = perf_counter() - t0
    list_time.append(t1_list)

    a = {i:None for i in range(j)}

    t0 = perf_counter()
    a.keys() | B
    t1_dict = perf_counter() - t0
    dict_time.append(t1_dict)

    # Keep the run alive even if an interval is measured as exactly zero.
    ratios.append(t1_list / t1_dict if t1_dict else float('nan'))

In [None]:
# Plot both timing series against input size; the first sample of each series
# is skipped via the [1:] slices.
plt.plot(xrange[1:], dict_time[1:], label='dict')
plt.plot(xrange[1:], list_time[1:], label='list')
plt.legend()
plt.show()

In [None]:
# Plot the list-time / dict-time ratio against input size (first sample skipped).
plt.plot(xrange[1:], ratios[1:])
plt.show()

In [None]:
# Scratch cell: run a few query strings through RecursiveDescentParser and
# print the result of `parenthesis_handler` plus the `assert_query` verdict.
# NOTE(review): this cell imports `Index2` / `AI_Index2`, whereas the other
# cells in this notebook import `Index` / `AI_Index` -- confirm these names
# still exist in coordle_backend.
import coordle_backend
reload(coordle_backend)
from coordle_backend import CordDoc, Index2, RecursiveDescentParser, AI_Index2
Index = Index2

queries = [
    'retarded!!!',
#     '(',
#     ')',
    'retarded (white AND (woman NOT man))',
    'retarded (white AND white) man',
#     'retarded OR white OR woman',
#     'retarded white AND woman',
#     'retarded OR white NOT woman',
#     'retarded (white NOT woman)',
#     'retarded (white NOT woman)',
#     'OR retarded AND white woman',
#     'retarded AND AND white NOT woman',
#     'retarded (white NOT woman) AND',
#     ')retarded ((white NOT woman) AND',
#     'retarded ((white NOT woman)',
#     'AND retarded)) ((white NOT woman) NOT',
]

# NOTE(review): `index` is not defined in any visible cell of this notebook
# (only `ai_index`, `ram_index` and `mongoindex` are), so on a fresh kernel
# this line raises NameError -- confirm which index was intended.
rdp = RecursiveDescentParser(index.docmap)
for query in queries:
    # Fresh list per query; `assert_query` receives it as its second argument.
    errmsgs = []
    tokens = rdp.get_logical_querytokens(query)
    pass_ = rdp.assert_query(tokens, errmsgs)
    
#     print(tokens)
    print(rdp.parenthesis_handler(tokens))
#     print(errmsgs)
    print(pass_)
    print()
    