In [56]:
import json
import os
from pprint import pprint
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from resources.similarity import CovidSimilarityResource
from corpus_index import load_corpus_index
from encoders import get_encoder
from db import get_session

In [57]:
# Data locations, given as paths relative to the notebook's working directory.
# Folder containing the corpus index file that is loaded further down.
corpus_index_folder = '../../../workspace/kaggle/covid19/data/corpus_index'
# Folder containing the covid19.sqlite database file.
db_dir = '../../../workspace/kaggle/covid19/data/db'

In [58]:
# Export resource locations as environment variables for this kernel process.
# NOTE(review): presumably read by the project modules imported above
# (encoders / db) when they build their resources — confirm against those modules.
%env W2V_PATH=../../../workspace/kaggle/covid19/data/fasttext_no_subwords_trigrams/word-vectors-100d.txt
%env WC_PATH=../../../workspace/kaggle/covid19/data/fasttext_no_subwords_trigrams/word-counts.txt
%env PC_PATH=../../../workspace/kaggle/covid19/data/corpus_index/simple-encoder-100d-components.npy
%env DB_CONNECTION=sqlite:///../../../workspace/kaggle/covid19/data/db/covid19.sqlite

env: W2V_PATH=../../../workspace/kaggle/covid19/data/fasttext_no_subwords_trigrams/word-vectors-100d.txt
env: WC_PATH=../../../workspace/kaggle/covid19/data/fasttext_no_subwords_trigrams/word-counts.txt
env: PC_PATH=../../../workspace/kaggle/covid19/data/corpus_index/simple-encoder-100d-components.npy
env: DB_CONNECTION=sqlite:///../../../workspace/kaggle/covid19/data/db/covid19.sqlite


In [59]:
# Load ../answers/1.json into seed_sentences_json.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('../answers/1.json', 'r', encoding='utf-8') as in_fp:
    seed_sentences_json = json.load(in_fp)

In [60]:
# Show the task name stored under the 'taskName' key of the loaded JSON.
print(seed_sentences_json['taskName'])

What is known about transmission, incubation, and environmental stability?


In [61]:
# Obtain the encoder identified by 'simple_encoder' from the project's get_encoder helper.
# NOTE(review): presumably configured through the W2V_PATH / WC_PATH / PC_PATH
# environment variables set earlier — confirm in the encoders module.
sentence_encoder = get_encoder('simple_encoder')

In [62]:
# Build the path of the index file inside corpus_index_folder, then load it
# with the project's load_corpus_index helper.
corpus_index_fname = os.path.join(corpus_index_folder, 'simple-encoder-nmslib-100d.bin')
corpus_index = load_corpus_index(corpus_index_fname)

In [63]:
def get_sqlite_session(conn):
    """Open a new SQLAlchemy ORM session for the database URL ``conn``.

    A fresh engine is created on every call. For URLs that start with
    ``sqlite:///`` the engine receives ``check_same_thread=False`` in its
    connect args; every other URL gets empty connect args.

    Args:
        conn: Database URL string handed to ``create_engine``.

    Returns:
        A session instance produced by a ``sessionmaker`` bound to the new engine.
    """
    is_sqlite_file_url = conn.startswith('sqlite:///')
    driver_options = {'check_same_thread': False} if is_sqlite_file_url else {}
    session_factory = sessionmaker(
        bind=create_engine(conn, connect_args=driver_options)
    )
    return session_factory()

In [64]:
# Open a session on the covid19.sqlite file located in db_dir.
db_session = get_sqlite_session(conn=f"sqlite:///{os.path.join(db_dir, 'covid19.sqlite')}")

In [65]:
# Bundle the corpus index, sentence encoder and DB session into the
# similarity resource that find_similar_sentences queries.
covid_resource = CovidSimilarityResource(corpus_index, sentence_encoder, db_session)

In [66]:
def find_similar_sentences(sentences, method="union", limit=10):
    """Query the notebook-level ``covid_resource`` for sentences similar to ``sentences``.

    Thin convenience wrapper: forwards to ``covid_resource.similar_k`` using the
    sentence encoder, corpus index and DB session held by that same resource.

    Args:
        sentences: Query sentences, passed through unchanged.
        method: Forwarded as the ``method`` keyword (default ``"union"``).
        limit: Forwarded as the ``limit`` keyword (default ``10``).

    Returns:
        Whatever ``covid_resource.similar_k`` returns.
    """
    resource = covid_resource
    search = resource.similar_k
    components = (
        resource.sentence_encoder,
        resource.corpus_index,
        resource.db_session,
    )
    return search(sentences, *components, method=method, limit=limit)

In [68]:
# Try the helper on a single query sentence with the default method ("union") and limit (10).
find_similar_sentences(["Coronavirus came from bats"])

{'results': [{'id': 77703,
   'text': 'General anesthesia and specific sedatives, such as xylazine, romifidine, or detomidine, also disturb motility.',
   'paper_id': '9d118e16908ed81fea07403ca6deef78623dd14d',
   'cord_uid': 'j9kg00qf',
   'publish_time': '2004-12-31',
   'nearest': -1905314997,
   'dist': 0.4055670499801636},
  {'id': 30592,
   'text': 'The market sells many species including seafood, birds, snakes, marmots and bats (Gralinski and Menachery, 2020) .',
   'paper_id': '7ab9f9fcea519ebce527c3ede8091beedbb26ad9',
   'cord_uid': 'xa6kwguo',
   'publish_time': '2020-02-18',
   'nearest': -1905314997,
   'dist': 0.4187812805175781},
  {'id': 94506,
   'text': 'The pH of the growth environment can influence stability and gene expression.',
   'paper_id': 'bdca6cb34a68176f77ae340ef673557a3ad9bc08',
   'cord_uid': 'os6f458c',
   'publish_time': '1992-12-31',
   'nearest': -1905314997,
   'dist': 0.4377667307853699},
  {'id': 79245,
   'text': 'One should include appropriate di