In [12]:
import json
import os
from pprint import pprint

from resources.similarity import CovidSimilarityResource
from corpus_index import load_corpus_index
from encoders.bert_encoder import BertEncoder
from encoders.simple_encoder import SimpleEncoder
import pandas as pd

In [2]:
corpus_index_folder = '../../../workspace/kaggle/covid19/data/corpus_index_bert'

In [3]:
with open('../answers/1.json', 'r') as in_fp:
    seed_sentences_json = json.load(in_fp)

In [4]:
print(seed_sentences_json['taskName'])

What is known about transmission, incubation, and environmental stability?


In [5]:
corpus_index_fname = os.path.join(corpus_index_folder, 'bert-encoder-nmslib-768d.bin')
corpus_index = load_corpus_index(corpus_index_fname)

In [6]:
data_dir = "../../../workspace/kaggle/covid19/data"

In [7]:
sentences_df = pd.read_csv(os.path.join(data_dir, "sentences_with_metadata_no_phrases_blingfire.csv"))

In [8]:
def create_articles_metadata_mapping(sentences_df: pd.DataFrame) -> dict:
    sentence_id_to_metadata = {}
    for row_count, row in sentences_df.iterrows():
        sentence_id_to_metadata[row_count] = dict(
            paper_id=row['paper_id'],
            cord_uid=row['cord_uid'],
            source=row['source'],
            url=row['url'],
            publish_time=row['publish_time'],
            authors=row['authors'],
            section=row['section'],
            sentence=row['sentence'],
        )
    return sentence_id_to_metadata

In [9]:
sentence_id_to_metadata = create_articles_metadata_mapping(sentences_df)

In [13]:
sentence_encoder = BertEncoder()

In [14]:
from nlp import text_tokenizer
from nlp.cleaning import clean_tokenized_sentence

def clean_sentence(sentence) -> str:
    doc = text_tokenizer.tokenize_text(sentence)
    tokens = [str(token) for token in doc]
    return clean_tokenized_sentence(tokens)

In [19]:
import murmurhash
import numpy as np
import scipy.spatial

from corpus_index.base_index import Aggregation
from nlp.textrank import calc_textrank

class CovidSimilarity:
    def __init__(self, corpus_index, sentence_encoder, sentence_id_to_metadata, bigram_model=None, trigram_model=None):
        self.corpus_index = corpus_index
        self.sentence_encoder = sentence_encoder
        self.sentence_id_to_metadata = sentence_id_to_metadata
        self.bigram_model = bigram_model
        self.trigram_model = trigram_model

    def similar_k(self, input_sentences, limit=10, method='union', group_by='cosine'):
        """Find similar sentences.

        Args:
            input_sentences (str/list[str]): one or more input sentences.
            sentence_encoder  : encoder
            limit (int): limit result set size to ``limit``.
            corpus_index : type of corpus where to fetch the suggestions from
            db_session  : Database to get neighbors from
            method (str): aggregation method ('union', 'mean', 'pc1', 'pc2').
            group_by (str): distance metric to use to group the result set. Default is 'cosine'.

        Returns:
            list<dict>
        """
        res = []
        nearest = dict()

        if isinstance(sentence_encoder, SimpleEncoder):
            if method == 'textrank':
                _, _, _, phrase_list = calc_textrank(input_sentences, num_phrases=5)
                input_sentences = [' '.join(phrase[0] for phrase in phrase_list)]
                method = Aggregation.UNION

            cleaned_sentences = [clean_sentence(sentence) for sentence in input_sentences]

            if self.bigram_model and self.trigram_model:
                tokenzied_sentences = [sentence.split(' ') for sentence in cleaned_sentences]
                sentences_with_bigrams = self.bigram_model[tokenzied_sentences]
                sentences_with_trigrams = self.trigram_model[sentences_with_bigrams]
                cleaned_sentences = [' '.join(sentence) for sentence in sentences_with_trigrams]
        else:
            cleaned_sentences = input_sentences

        embeddings = self.sentence_encoder.encode(cleaned_sentences)

        for idx, dist in self.corpus_index.knn_query_batch(embeddings, limit=limit, method=method):
            similarity = 1.0 - dist
            if idx not in nearest:
                nearest[idx] = similarity
            else:
                nearest[idx] = max(nearest[idx], similarity)

        print(nearest)
        
        results = []
        for idx, sim in nearest.items():
            if sim >= 0.7:
                results.append(dict(similarity=sim, metadata=self.sentence_id_to_metadata[idx]))

        return {
            "results": results,
            "sentencs": cleaned_sentences
        }

        return {
            'results': sorted(res, key=lambda x: x['dist']),
            'sentences': [
                {
                    'id': sent_id,
                    'text': sent
                } for sent_id, sent in zip(indices, cleaned_sentences)
            ]
        }

In [20]:
covid_similarity = CovidSimilarity(corpus_index, sentence_encoder, sentence_id_to_metadata)

In [21]:
def find_similar_sentences(sentences, method="union", limit=10):
    return covid_similarity.similar_k( sentences, method=method, limit=limit)

In [22]:
find_similar_sentences(["Both SARS-CoV and MERS-CoV are thought to have originated in colonies of bats, eventually transmitted to humans, putatively facilitated by intermediate hosts such as palm civets and dromedary camels, respectively (Cui et al., 2019) ."])

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.68it/s]

{145948: 0.8387337923049927, 110676: 0.8026413321495056, 153567: 0.799142599105835, 67908: 0.7951556444168091, 64417: 0.7951555252075195, 99679: 0.7919644713401794, 47751: 0.7906805872917175, 49075: 0.7756433486938477, 129499: 0.7725346088409424}





{'results': [{'similarity': 0.8387337923049927,
   'metadata': {'paper_id': '8699664d8100af1e701e25daeb8a4fced18a3daf',
    'cord_uid': 'epcel2ez',
    'source': 'Elsevier',
    'url': 'https://doi.org/10.1016/j.micinf.2020.02.002',
    'publish_time': '2020-03-31',
    'authors': 'Jin-Yan Li, Zhi You, Qiong Wang, Zhi-Jian Zhou, Ye Qiu, Rui Luo, Xing-Yi Ge',
    'section': 'body',
    'sentence': 'Indeed, SARS-CoV and MERS-CoV originated from bats, the natural reservoir, but transmitted to humans via intermediate host civets and camels, respectively.'}},
  {'similarity': 0.8026413321495056,
   'metadata': {'paper_id': '45a5c4910f9537156f3b729c8e938c82bd57d358',
    'cord_uid': 'cigv3t0h',
    'source': 'Elsevier',
    'url': 'https://doi.org/10.1016/j.jinf.2020.03.010',
    'publish_time': '2020-03-13',
    'authors': nan,
    'section': 'body',
    'sentence': 'Both SARS-CoV and MERS-CoV are believed to originate in bats, and these infections have been transmitted directly to humans f