# Coordle Idea
1. Get query, e.g. "What is love?"
2. Tokenize query
3. Combine the word2vec vectors of the tokens into a single vector, e.g. by summing or averaging them.
   Let's call it the query vector.
4. Find the sentence vectors that are closest to the query vector.
5. Return the papers corresponding to said sentence vectors.

# Coordle Frontend
💩

# Coordle Backend

In [1]:
import pandas as pd 
import numpy as np
from importlib import reload
import nltk
import spacy
import en_core_sci_lg # Biomedical word embeddings
from utils import clean_text, load_pickle, save_pickle
from nltk.corpus import stopwords as _stopwords
from tqdm import tqdm
from collections import deque
from copy import deepcopy
from collections.abc import Iterable
from typing import Union
from pprint import pprint

import os
import json
import re 
from string import punctuation as PUNCTUATION
from nltk.corpus import stopwords as _stopwords
from gensim.models import Word2Vec
from os.path import join as join_path
from gensim.models.callbacks import CallbackAny2Vec

In [2]:
# Load the CORD-19 dataset from a CSV file (path relative to the notebook's working directory).
df = pd.read_csv('data/cord-19-data.csv')

In [3]:
# Show which columns the dataset provides.
df.columns

Index(['cord_uid', 'paper_id', 'source', 'is_pmc', 'title', 'body_text', 'doi',
       'pubmed_id', 'license', 'abstract', 'publish_time', 'authors',
       'journal', 'url', 'language'],
      dtype='object')

In [4]:
class EpochSaver(CallbackAny2Vec):
    '''Callback that logs the training loss of each epoch and saves the model after each epoch.

    Args:
        output_dir: Directory that receives both the log file and the model checkpoints.
        prefix: Filename prefix of the checkpoints (``{prefix}_epoch_{epoch}.model``).
        logs_filename: Name of the log file inside ``output_dir``; lines are appended to it.
    '''

    def __init__(self, output_dir: str, prefix: str, logs_filename: str):
        self.output_dir = output_dir
        self.prefix = prefix
        self.logs_filename = logs_filename
        self.epoch = 0
        # Cumulative loss reported at the end of the previous epoch; set on the
        # first call to `on_epoch_end`, initialised here so the attribute always exists.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        '''Append this epoch's loss to the log file, save the model and advance the epoch counter.'''
        # The value is treated as a running total over all epochs so far, so the
        # loss of a single epoch is the difference to the previous total.
        cum_loss = model.get_latest_training_loss()
        if self.epoch == 0:
            loss = cum_loss
        else:
            loss = cum_loss - self.loss_previous_step
        # Remember the cumulative total, not the per-epoch delta: storing the delta
        # made every loss from the third epoch onwards incorrect.
        self.loss_previous_step = cum_loss
        with open(join_path(self.output_dir, self.logs_filename), 'a+') as file:
            file.write(f'Epoch #{self.epoch}, loss: {loss}\n')

        output_path = join_path(self.output_dir, f'{self.prefix}_epoch_{self.epoch}.model')
        model.save(output_path)
        self.epoch += 1

class DocEpochSaver(CallbackAny2Vec):
    '''Checkpoint callback: writes the model to disk every time an epoch finishes.

    Checkpoints are named ``{prefix}_epoch_{epoch}.model`` and placed in
    ``output_dir``. The epoch counter begins at ``start_epoch`` and grows by
    one after each save.
    '''

    def __init__(self, output_dir: str, prefix: str, start_epoch: int = 1):
        self.output_dir, self.prefix, self.epoch = output_dir, prefix, start_epoch

    def on_epoch_end(self, model):
        '''Save ``model`` under the current epoch number, then advance the counter.'''
        checkpoint_name = f'{self.prefix}_epoch_{self.epoch}.model'
        model.save(join_path(self.output_dir, checkpoint_name))
        self.epoch += 1

In [5]:
# Restore the word2vec checkpoint written after epoch 29 and build lookups
# in both directions between vocabulary words and their integer positions.
model = Word2Vec.load(join_path('models-word2vec', 'w2v_model_epoch_29.model'))
int_to_word = np.array(model.wv.index2word)
word_to_int = dict(zip(model.wv.index2word, range(len(model.wv.index2word))))

In [6]:
# Load a second checkpoint (epoch 9) from the 'models-doc2vec' directory for comparison.
# NOTE(review): loaded via Word2Vec.load although the directory name suggests a
# Doc2Vec model — confirm this is the intended class.
model2 = Word2Vec.load(join_path('models-doc2vec', 'model_epoch_9.model'))

In [7]:
# Run the probe word through clean_text, then print its nearest neighbours
# according to each of the two loaded models.
word = clean_text('abilities', return_list=False)
print(word, end='\n\n')
pprint(model.wv.most_similar(word))
print()
pprint(model2.wv.most_similar(word))

ability

[('able', 0.7927937507629395),
 ('abilities', 0.7574480175971985),
 ('capability', 0.7555814981460571),
 ('capacity', 0.713545024394989),
 ('inability', 0.6850271224975586),
 ('unable', 0.6693112850189209),
 ('could', 0.6653134822845459),
 ('effectively', 0.6319371461868286),
 ('efficiently', 0.5879989862442017),
 ('importantly', 0.5701096057891846)]

[('capability', 0.7155430912971497),
 ('capacity', 0.6828085780143738),
 ('inability', 0.650448203086853),
 ('able', 0.5796688795089722),
 ('unable', 0.5606363415718079),
 ('propensity', 0.524204671382904),
 ('tendency', 0.48691150546073914),
 ('proclivity', 0.47566479444503784),
 ('failed', 0.4682592451572418),
 ('wishing', 0.4622066020965576)]


In [33]:
# %%time
# Reload the local backend module before importing names from it, so that the
# names bound below come from the freshly reloaded module.
import coordle_backend
reload(coordle_backend)
from coordle_backend import SentVectorDoc, Index

# First row of the dataset; read as a global by test_SentVectorDoc.
docsample = df.iloc[0]
def test_SentVectorDoc():
    '''Fit a SentVectorDoc on the global ``docsample`` row, print its ``tf_idf_score`` and return it.'''
    unfitted = SentVectorDoc(docsample['cord_uid'], docsample['title'])
    # `fit` returns a pair; only the first element (the document) is used here.
    fitted, _unused = unfitted.fit(docsample['body_text'])
    print(fitted.tf_idf_score)
    return fitted

def test_Index(n_docs: int = 1024):
    '''Build an ``Index`` by adding rows of the global ``df`` one document at a time.

    Args:
        n_docs: Number of leading rows of ``df`` to add. Defaults to 1024, the
            value that used to be hard-coded. Clamped to ``len(df)``.

    Returns:
        The populated ``Index``.
    '''
    coordle = Index()
    # Clamp so a DataFrame with fewer than `n_docs` rows does not raise
    # IndexError from `df.iloc[i]`.
    n_docs = min(n_docs, len(df))
    for i in tqdm(range(n_docs), position=0):
        sample = df.iloc[i]
        coordle.add(sample['cord_uid'], sample['title'], sample['body_text'])
    return coordle

def test_Index2():
    '''Build an ``Index`` from the first 4096 rows of the global ``df`` via ``build_from_df``.

    The uid, title and text are taken from the 'cord_uid', 'title' and
    'body_text' columns; multiprocessing is enabled with ``workers=-1``.
    '''
    coordle = Index()
    subset = df.iloc[:4096]
    build_options = dict(verbose=True, use_multiprocessing=True, workers=-1)
    coordle.build_from_df(subset, 'cord_uid', 'title', 'body_text', **build_options)
    return coordle

# Build the index via the bulk `build_from_df` path.
index = test_Index2()
# sent_doc = test_SentVectorDoc()  # optional: single-document check

Text cleaning initilized on 16 workers


Cleaning texts: 100%|██████████| 4096/4096 [00:15<00:00, 258.14it/s]
Adding to index: 100%|██████████| 4096/4096 [00:22<00:00, 180.81it/s]


In [39]:
# Query the index; `search` returns the matching documents and their scores.
docs, scores = index.search('retarded white woman', verbose=True)
# Number of top results to print.
n = 16
print()
for doc, score in zip(docs[:n], scores[:n]):
    # The title is passed through str() before slicing, then truncated/padded to 80 characters.
    print(f'{doc.uid}   {str(doc.title)[:80]:<80}   {score:.4f}')

Query tokens:  ['retarded', 'white', 'woman']

iwueedmm   Post-traumatic stress disorder among Chinese women survivors of intimate partner   2.0220
alm3p31f   Chapter 3 Immunobiological aspects of vaccines in pregnancy: Maternal perspectiv   1.4323
94anrxyw   Chapter 47 Infections in Pregnancy                                                 1.2428
8p4t1dr6   Emergency Caesarean delivery in a patient with confirmed coronavirus disease 201   0.8291
ibrvg7rd   88 Gender-Specific Issues in Non-HIV Viral Infections                              0.7680
xfr3cyql   Chapter 10 Respiratory syncytial virus                                             0.7319
vq0p5d2x   nan                                                                                0.6711
v0gngwzx   Pregnancy and perinatal outcomes of women with severe acute respiratory syndrome   0.6644
wnf8fozk   5.16 Infections in Pregnancy☆                                                      0.6100
2zfrogg9   Hypoxia-like tissue injury as a c

In [None]:
# Reload the local backend module again, this time also importing AI_Index.
import coordle_backend
reload(coordle_backend)
from coordle_backend import SentVectorDoc, Index, AI_Index

def test_AI_Index(model):
    '''Build an ``AI_Index`` from the first 4096 rows of the global ``df``.

    The index is constructed with ``model.wv.most_similar`` and the constant 3
    (presumably the number of similar words added per query token; confirm
    against ``AI_Index``).
    '''
    coordle = AI_Index(model.wv.most_similar, 3)
    subset = df.iloc[:4096]
    build_options = dict(use_multiprocessing=True, workers=-1, verbose=True)
    coordle.build_from_df(subset, 'cord_uid', 'title', 'body_text', **build_options)
    return coordle

# Build the AI_Index using the word2vec model bound to `model`.
ai_index = test_AI_Index(model)
            

Text cleaning initilized on 16 workers


Cleaning texts: 100%|██████████| 4096/4096 [00:18<00:00, 224.97it/s]
Adding to index:  37%|███▋      | 1524/4096 [00:10<00:18, 137.43it/s]

In [40]:
# Query the AI index; `search` returns the matching documents and their scores.
docs, scores = ai_index.search('retarded white woman', verbose=True)
# Number of top results to print.
n = 16
print()
for doc, score in zip(docs[:n], scores[:n]):
    # Titles can be missing (NaN, i.e. a float) in the dataset, so convert to
    # str before slicing; slicing the raw value would raise TypeError.
    print(f'{doc.uid}   {str(doc.title)[:80]:<80}   {score:.4f}')

Query tokens:  ['retarded', 'white', 'woman', 'girl', '52yearold', '59yearold', 'gray', 'red', 'graywhite', 'retardation', 'diminished', 'reduced']

cmat0grk   Chapter 15 Captive Red Panda Medicine                                              0.7252
iwueedmm   Post-traumatic stress disorder among Chinese women survivors of intimate partner   0.7100
alm3p31f   Chapter 3 Immunobiological aspects of vaccines in pregnancy: Maternal perspectiv   0.5522
94anrxyw   Chapter 47 Infections in Pregnancy                                                 0.4715
f33fzent   Avian Infectious Bronchitis Virus                                                  0.4588
2qg9vuvd   Chapter 3 Blood Vital but Potentially Dangerous                                    0.4164
8p4t1dr6   Emergency Caesarean delivery in a patient with confirmed coronavirus disease 201   0.3904
w5r96iq3   Human coronavirus gene expression in the brains of multiple sclerosis patients     0.3410
w232rzi9   Chapter 132 Utilizing Blood Bank

## Multiprocessing showdown

In [16]:
from multiprocessing import Pool
from time import time

In [17]:
# Benchmark: clean every body text using a pool of worker processes.
t0 = time()
with Pool() as p:
    cleaned_texts = [*tqdm(p.imap(clean_text, df.body_text), position=0, total=len(df))]
print(f'Text cleaning with multiprocessing took {time()-t0:.2f} seconds')

100%|██████████| 33554/33554 [01:19<00:00, 420.93it/s]


Text cleaning with multiprocessing took 81.86 seconds


In [18]:
# Benchmark: clean every body text sequentially in the current process.
t0 = time()
gen = tqdm(map(clean_text, df.body_text), position=0, total=len(df))
cleaned_texts = [*gen]
print(f'Naïve text cleaning took {time()-t0:.2f} seconds')

100%|██████████| 33554/33554 [10:38<00:00, 52.56it/s] 

Naïve text cleaning took 640.67 seconds





In [41]:
# Benchmark: clean only the first 2048 body texts using a pool of worker processes.
t0 = time()
with Pool() as p:
    cleaned_texts = [*tqdm(p.imap(clean_text, df.body_text[:2048]), position=0, total=2048)]
print(f'Text cleaning with multiprocessing took {time()-t0:.2f} seconds')

100%|██████████| 2048/2048 [00:07<00:00, 268.61it/s]


Text cleaning with multiprocessing took 12.61 seconds


In [20]:
# Benchmark: clean only the first 2048 body texts sequentially in the current process.
t0 = time()
gen = tqdm(map(clean_text, df.body_text[:2048]), position=0, total=2048)
cleaned_texts = [*gen]
print(f'Naïve text cleaning took {time()-t0:.2f} seconds')

100%|██████████| 2048/2048 [00:54<00:00, 37.55it/s]

Naïve text cleaning took 54.70 seconds



