In [1]:
import json
import os
from pprint import pprint

from resources.similarity import CovidSimilarityResource
from corpus_index import load_corpus_index
from encoders.simple_encoder import SimpleEncoder
import pandas as pd

In [2]:
corpus_index_folder = '../../../workspace/kaggle/covid19/data/corpus_index_no_remove_principles'

In [3]:
with open('../answers/1.json', 'r') as in_fp:
    seed_sentences_json = json.load(in_fp)

In [4]:
print(seed_sentences_json['taskName'])

What is known about transmission, incubation, and environmental stability?


In [5]:
corpus_index_fname = os.path.join(corpus_index_folder, 'simple-encoder-nmslib-100d.bin')
corpus_index = load_corpus_index(corpus_index_fname)

In [6]:
data_dir = "../../../workspace/kaggle/covid19/data"

In [7]:
sentences_df = pd.read_csv(os.path.join(data_dir, "sentences_with_metadata_no_phrases_blingfire.csv"))

In [8]:
def create_articles_metadata_mapping(sentences_df: pd.DataFrame) -> dict:
    sentence_id_to_metadata = {}
    for row_count, row in sentences_df.iterrows():
        sentence_id_to_metadata[row_count] = dict(
            paper_id=row['paper_id'],
            cord_uid=row['cord_uid'],
            source=row['source'],
            url=row['url'],
            publish_time=row['publish_time'],
            authors=row['authors'],
            section=row['section'],
            sentence=row['sentence'],
        )
    return sentence_id_to_metadata

In [9]:
sentence_id_to_metadata = create_articles_metadata_mapping(sentences_df)

In [10]:
sentence_encoder = SimpleEncoder.load(
    "../../../workspace/kaggle/covid19/data/fasttext_no_subwords_no_phrases_blingfire/word-vectors-100d.txt",
    "../../../workspace/kaggle/covid19/data/fasttext_no_subwords_no_phrases_blingfire/word-counts.txt"
    # os.path.join(corpus_index_folder, "simple-encoder-100d-components.npy")
)

In [11]:
from nlp import text_tokenizer
from nlp.cleaning import clean_tokenized_sentence

def clean_sentence(sentence) -> str:
    doc = text_tokenizer.tokenize_text(sentence)
    tokens = [str(token) for token in doc]
    return clean_tokenized_sentence(tokens)

In [22]:
import murmurhash
import numpy as np
import scipy.spatial

from corpus_index.base_index import Aggregation
from nlp.textrank import calc_textrank

class CovidSimilarity:
    def __init__(self, corpus_index, sentence_encoder, sentence_id_to_metadata, bigram_model=None, trigram_model=None):
        self.corpus_index = corpus_index
        self.sentence_encoder = sentence_encoder
        self.sentence_id_to_metadata = sentence_id_to_metadata
        self.bigram_model = bigram_model
        self.trigram_model = trigram_model

    def similar_k(self, input_sentences, limit=10, method='union', group_by='cosine'):
        """Find similar sentences.

        Args:
            input_sentences (str/list[str]): one or more input sentences.
            sentence_encoder  : encoder
            limit (int): limit result set size to ``limit``.
            corpus_index : type of corpus where to fetch the suggestions from
            db_session  : Database to get neighbors from
            method (str): aggregation method ('union', 'mean', 'pc1', 'pc2').
            group_by (str): distance metric to use to group the result set. Default is 'cosine'.

        Returns:
            list<dict>
        """
        res = []
        nearest = dict()

        if method == 'textrank':
            _, _, _, phrase_list = calc_textrank(input_sentences, num_phrases=10)
            input_sentences = [' '.join(phrase[0] for phrase in phrase_list)]
            method = Aggregation.UNION

        cleaned_sentences = [clean_sentence(sentence) for sentence in input_sentences]

        if self.bigram_model and self.trigram_model:
            tokenzied_sentences = [sentence.split(' ') for sentence in cleaned_sentences]
            sentences_with_bigrams = self.bigram_model[tokenzied_sentences]
            sentences_with_trigrams = self.trigram_model[sentences_with_bigrams]
            cleaned_sentences = [' '.join(sentence) for sentence in sentences_with_trigrams]

        embeddings = self.sentence_encoder.encode(cleaned_sentences)

        for idx, dist in self.corpus_index.knn_query_batch(embeddings, limit=limit, method=method):
            similarity = 1.0 - dist
            if idx not in nearest:
                nearest[idx] = similarity
            else:
                nearest[idx] = max(nearest[idx], similarity)

        print(nearest)
        
        results = []
        for idx, sim in nearest.items():
            if sim >= 0.5:
                results.append(dict(similarity=sim, metadata=self.sentence_id_to_metadata[idx]))

        return {
            "results": results,
            "sentencs": cleaned_sentences
        }

        return {
            'results': sorted(res, key=lambda x: x['dist']),
            'sentences': [
                {
                    'id': sent_id,
                    'text': sent
                } for sent_id, sent in zip(indices, cleaned_sentences)
            ]
        }

In [23]:
covid_similarity = CovidSimilarity(corpus_index, sentence_encoder, sentence_id_to_metadata)

In [24]:
def find_similar_sentences(sentences, method="textrank", limit=10):
    return covid_similarity.similar_k( sentences, method=method, limit=limit)

In [25]:
find_similar_sentences(["Both SARS-CoV and MERS-CoV are thought to have originated in colonies of bats, eventually transmitted to humans, putatively facilitated by intermediate hosts such as palm civets and dromedary camels, respectively (Cui et al., 2019) ."])

{82747: 0.8962383270263672, 217386: 0.6820434331893921, 217362: 0.6366391777992249, 217378: 0.6274011135101318, 26668: 0.0, 53909: 0.0, 57795: 0.0, 58960: 0.0, 59899: 0.0, 63965: 0.0}


{'results': [{'similarity': 0.8962383270263672,
   'metadata': {'paper_id': 'c097a8a9a543d69c34f10e5c3fd78019e560026a',
    'cord_uid': 'mn0l7nar',
    'source': 'PMC',
    'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7067204/',
    'publish_time': '2020-01-28',
    'authors': 'Jasper Fuk-Woo Chan, Kin-Hang Kok, Zheng Zhu, Hin Chu, Kelvin , Kai-Wang To, Shuofeng Yuan, Kwok-Yung Yuen',
    'section': 'abstract',
    'sentence': 'Learning from the roles of civet in SARS and camel in MERS, hunting for the animal source of 2019-nCoV and its more ancestral virus would be important for understanding the origin and evolution of this novel lineage B betacoronavirus.'}},
  {'similarity': 0.6820434331893921,
   'metadata': {'paper_id': 'ffbd51670f3a5dcf4a02696788726a3531da449b',
    'cord_uid': 'si7csqr2',
    'source': 'Elsevier',
    'url': 'https://doi.org/10.1016/j.jmii.2020.03.011',
    'publish_time': '2020-03-14',
    'authors': 'Muh-Yong Yen, Jonathan Schwartz, Shey-Ying Chen, Ch