<a href="https://colab.research.google.com/github/MrMimic/Machine-Learning/blob/master/Codiv_19_V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## General info

[trello board](https://trello.com/b/ED3H13vT/covid-19-kaggle-kickoff)

forked [notebook](https://www.kaggle.com/davidmezzetti/cord-19-analysis-with-sentence-embeddings)

List of interesting notebook:

[Explored drugs being develloped](https://www.kaggle.com/maria17/cord-19-explore-drugs-being-developed)

## Kaggle dataset and word vectors downloading

Update your kaggle.json API key with the cell below, then launch the newt two.

It'll download data if needed (*eg*, your kernel has restarted)

In [0]:
# Upload your kaggle API token
from google.colab import files
# In kaggle.com : MyAccount -> Create New API Token, will download kaggle.json that you can upload here.
files.upload()

In [0]:
import os

if not os.path.isfile("kaggle.json") and not os.path.isdir(os.path.expanduser("~/.kaggle")):
  raise Exception("Please import your kaggle key first.")

if os.path.isfile("kaggle.json"):
  !mkdir -p ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 ~/.kaggle/kaggle.json
  !rm kaggle.json

if not os.path.isdir("kaggle_data"):
  # !kaggle datasets list | head
  !pip install -q kaggle
  !pip install -q kaggle-cli

  !kaggle datasets download -d allen-institute-for-ai/CORD-19-research-challenge
  !mkdir kaggle_data
  !unzip -qq CORD-19-research-challenge.zip -d kaggle_data
  !rm CORD-19-research-challenge.zip

if not os.path.isdir("glove_vectors"):
  !pip install -q kaggle
  !pip install -q kaggle-cli

  !kaggle datasets download -d rtatman/glove-global-vectors-for-word-representation
  !mkdir glove_vectors
  !unzip -qq glove-global-vectors-for-word-representation.zip -d glove_vectors
  !rm glove-global-vectors-for-word-representation.zip

In [0]:
# NLTK data for pre-processing
if not os.path.isdir("/root/nltk_data"):
  import nltk
  nltk.download('stopwords')
  nltk.download('punkt')

if "corpora" not in os.listdir("/root/nltk_data"):
  import nltk
  nltk.download('stopwords')
if "tokenizers" not in os.listdir("/root/nltk_data"):
  import nltk
  nltk.download('punkt')

# Python packages
try:
  from retry import retry
except ModuleNotFoundError:
  !pip install retry

try:
  import pathos
except ModuleNotFoundError:
  !pip install pathos

## Link your Google Drive

In [0]:
# Upload files to your google drive (SQLite file eg) and mount it
from google.colab import drive
drive.mount('/content/drive')

## Libs

### Imports

In [0]:
import os
import re
import json
import time
import tqdm
import time
import pickle
import sqlite3

import numpy as np
import pandas as pd
import multiprocessing as mp

from retry import retry
from sklearn.metrics.pairwise import cosine_similarity
from dateutil import parser
from pathlib import Path
from typing import List, Dict, Any, Generator
from collections import OrderedDict, Counter, MutableMapping, Sequence
from pathos.multiprocessing import ProcessingPool as picklable_pool
from textblob import TextBlob

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, RegexpTokenizer

### File processing

In [0]:
def read_file(file_path: str) -> Dict[str, Any]:
    """ Open JSON file and return dict() data """
    with open(file_path, "r") as handler:
        json_data = json.loads(handler.read(), object_pairs_hook=OrderedDict)
    return json_data


def get_body(json_data: Dict[str, Any]) -> str:
    """ Return body from json data """
    return " ".join([json_data["body_text"][index]["text"].strip() for index in range(len(json_data["body_text"]))])

### Language detection

In [0]:
def get_lang(text: str) -> str:
  """ Detects language of text : must contain minimum 3 characters """
  if len(text) >= 3:
    b = TextBlob(text)
    return b.detect_language()
  else:
    raise ValueError('Minimum of 3 characters needed !')

### Database utilities

In [0]:
def instanciate_sql_db(db_path: str = "articles_database.sqlite") -> None:
    """ Create an SQLite database """

    if os.path.isfile(db_path):
        os.remove(db_path)    
    database = sqlite3.connect(db_path)
    # Storing articles
    articles_table = {
        "id": "TEXT PRIMARY KEY",
        "date": "DATETIME",
        "body": "TEXT",
        "abstract": "TEXT",
        "title": "TEXT",
        "sha": "TEXT",
        "folder": "TEXT"
    }
    columns = ["{0} {1}".format(name, col_type) for name, col_type in articles_table.items()]
    command = "CREATE TABLE IF NOT EXISTS articles ({});".format(", ".join(columns))
    database.execute(command)
    # Storing sentences
    sentences_table = {
        "paper": "TEXT",
        "section": "TEXT",
        "sentence": "TEXT",
        "vector": "TEXT"
    }
    columns = ["{0} {1}".format(name, col_type) for name, col_type in sentences_table.items()]
    command = "CREATE TABLE IF NOT EXISTS sentences ({});".format(", ".join(columns))
    database.execute(command)
    database.close()

def get_articles_to_insert(articles_df: pd.DataFrame) -> List[Any]:
    """ List comprehension get stuck, who knows why """
    articles = []
    for index, data in articles_df.iterrows():
        articles.append((index, data))
    return articles
  
@retry(sqlite3.OperationalError, tries=5, delay=2)
def insert_row(list_to_insert: List[Any], table_name: str = "articles", db_path: str = "articles_database.sqlite") -> None:
    """ Insert row of articles into the SQLite database """

    if table_name == "articles":
        command = "INSERT INTO articles(id, title, body, abstract, date, sha, folder) VALUES (?, ?, ?, ?, ?, ?, ?)"
    elif table_name == "sentences":
        command = "INSERT INTO sentences(paper, section, sentence, vector) VALUES (?, ?, ?, ?)"
    else:
        raise Exception(f"Unknown table {table_name}")

    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    cursor.execute(command, list_to_insert)  # This line will be retried if fails
    cursor.close()
    connection.commit()
    connection.close()

def insert_article(args: Any) -> None:
    """ Parse and insert a single article into the SQLite DB. args = [(index, df_line), db_path] """
    index = args[0][0]
    data = args[0][1]
    db_path = args[1]

    # Get body
    if data.has_full_text is True:
        json_file = os.path.join(os.sep, "kaggle", "input", "CORD-19-research-challenge", data.full_text_file, data.full_text_file, f"{data.sha}.json")
        try:
            json_data = read_file(json_file)
            body = get_body(json_data=json_data)
            folder = data.full_text_file
        except FileNotFoundError:
            body = None
            folder = None
    else:
        body = None
        folder = None

    try:
        date = parser.parse(data.publish_time)
    except Exception:  # Better to get no date than a string of whatever
        date = None
        
    raw_data = [
        data.doi,
        data.title,
        body,
        data.abstract,
        date,
        data.sha,
        folder
    ]
    insert_row(list_to_insert=raw_data, db_path=db_path)

def get_all_ids(db_path: str = "articles_database.sqlite") -> List[str]:
    """ Return all articles DOI stored in the article table """
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    cursor.execute("SELECT id FROM articles")
    ids = cursor.fetchall()
    cursor.close()
    connection.close()
    ids_cleaneds = [id_[0] for id_ in ids if len(id_) == 1]

    return ids_cleaneds

### Text pre-processing

In [0]:
def preprocess_text(text: str, stem_words: bool = True, remove_num: bool = True) -> List[str]:
    """ Pre-process extracted texts """

    word = RegexpTokenizer(r"\w+")
    stop_words = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")
    
    def filter_stopwords(sentence: List[str], stopwords: List[str] = stop_words) -> List[str]:
        """ Remove stopwords from a given list of words """
        return [word for word in sentence if word not in stopwords]
    
    def stem_words(sentence: List[str], stem_function: Any = stemmer) -> List[str]:
        """ Get words root for every member of an input list """
        return [stem_function.stem(word) for word in sentence]
    
    def remove_numeric_words(sentence: List[str]) -> List[str]:
        """ Remove number (items) from a list of words """
        letter_pattern = re.compile(r"[a-z]")
        return [word for word in sentence if letter_pattern.match(word)]   

    # Lower
    paragraph = text.lower()
    # Split paragraphs into sentences
    sentences = sent_tokenize(text)
    # Split sentences into words and remove punctuation
    sentences = [word.tokenize(sentence) for sentence in sentences]
    # Remove stopwords
    sentences = [filter_stopwords(sentence) for sentence in sentences]
    if stem_words is True:
        # Stem words
        sentences = [stem_words(sentence) for sentence in sentences]
    if remove_num is True:
        sentences = [remove_numeric_words(sentence) for sentence in sentences]
    # Filter empty sentences and one-letters words
    sentences = [[word for word in sentence if len(word) > 1] for sentence in sentences if sentence != []]

    return sentences

def pre_process_articles(args: List[Any]) -> None:
    """ Apply preprocessing to texts and store result into the SQLite DB """

    article_id: str = args[0]
    embedding_model = args[1]
    db_path: str = args[2]

    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM articles WHERE id = ?", [article_id])
    # Get dict {column: value}
    try:
      article = {[col for col in head if col is not None][0]: value for head, value in zip(cursor.description, cursor.fetchone())}
      cursor.close()
      connection.close()
    except TypeError:  # When the DB doest not return a result
      cursor.close()
      connection.close()
      return None
    
    for section in ["title", "abstract", "body"]:
        if article[section] is not None:
            sentences = preprocess_text(article[section], stem_words=False, remove_num=False)
            for sentence in sentences:
              try:
                # paper, section, sentence, vector
                row_to_insert = [
                    article_id,
                    section,
                    json.dumps(sentence),                                                               # Store list of tokens as loadable str
                    json.dumps([str(x) for x in embedding_model.compute_sentence_vector(sentence)])     # Embeded vector
                ]
                insert_row(list_to_insert=row_to_insert, table_name="sentences", db_path=db_path)
              except TypeError:  # When all words are not in the model
                continue

### Embedding

In [0]:
class Embedding():

    def __init__(self, vectors_path: str = None, embeddings_dimension: int = 50, sentence_embedding_method: str = "mowe"):
        
        if vectors_path is None:
            self.vectors_path = os.path.join(os.sep, "kaggle", "input", "glove-global-vectors-for-word-representation", f"glove.6B.{embeddings_dimension}d.txt")
        else:
            self.vectors_path = vectors_path
        
        self.embeddings_dimension = embeddings_dimension
        self.sentence_embedding_method = sentence_embedding_method

    def build_vectors_dictionary(self) -> Dict[str, List[float]]:
        """ Load pre-trained vectors and build a dict """

        tic = time.time()    

        self.vectors = {}
        with open(self.vectors_path, "r") as handler:
            for line in handler.readlines():
                try:
                    # Prevent to keep useless words (otherwise pre-proc return nothing)
                    word = preprocess_text(line.split()[0])[0][0]
                    vector = [float(dimension) for dimension in line.split()[1:None]]
                    assert len(vector) == self.embeddings_dimension
                    self.vectors[word] = vector
                except IndexError:
                    continue

        toc = time.time()
        print(f"Took {round((toc-tic) / 60, 2)} min to load {len(self.vectors.keys())} GloVe vectors (embedding dim: {self.embeddings_dimension}).")
        
    def compute_sentence_vector(self, sentence: List[str], sentence_embedding_method: str = "mowe") -> List[float]:
        """ Compute a SOWE/MOWE over all tokens composing a sentence. Word skipped if not in model. """
        words_vector = [self.vectors[word] if word in self.vectors.keys() else list(list(np.full([1, EMBEDDING_DIMENSION, ], np.nan))[0]) for word in sentence]
        if self.sentence_embedding_method == "mowe":
            sentence_embedding = np.nanmean(words_vector, axis=0)
        elif self.sentence_embedding_method == "sowe":
            sentence_embedding = np.nansum(words_vector, axis=0)
        else:
            raise Exception(f"No such sentence embedding method: {sentence_embedding_method}")
        return sentence_embedding


### Query matching

In [0]:
def vectorize_query(query: str) -> List[float]:
    """ Vectorize a sentence """
    split_query = preprocess_text(query, stem_words=False, remove_num=False)[0]
    query_vector = embedding_model.compute_sentence_vector(split_query)
    return query_vector

def get_sentences(db_path: str) -> List[Any]:
    """ Retrieve all sentences """
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    command = "SELECT * FROM sentences"
    cursor.execute(command)
    data = cursor.fetchall()
    cursor.close()
    connection.close()

    return data

def compute_cosine_distance(args: Any) -> float:
    """ Compute cosine distance between two embeded sentences """
    sentence_vector = args[1]
    query_vector = args[0]
    distance = 1 - cosine_similarity([query_vector], [sentence_vector])[0][0]

    return (distance, sentence_vector)

def query_db_for_sentence(db_path: str, vector: str):
    """ Get a full sentence from a vector """
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    command = "SELECT * FROM sentences WHERE vector='%s'" % vector
    cursor.execute(command)
    data = cursor.fetchall()
    cursor.close()
    connection.close()

    data = list(set(data))

    if len(data) > 1:
      print(f"ERROR: two sentences with vector {vector} have been found.")
      data = None

    return data

## Parameters

In [0]:
EMBEDDING_DIMENSION = 100
SENTENCE_EMBEDDING_METHOD = "mowe"

DB_FILE_NAME = os.path.join(f"articles_database_v2_02042020_embedding_{EMBEDDING_DIMENSION}.sqlite")

## Insert articles into sqlite DB¶


In [0]:
def create_db_and_load_articles(db_path: str = "articles_database.sqlite", load_file: bool = True) -> None:
    """ Load metadata.csv, try to get body texts and insert """

    if load_file is True:
      assert os.path.isfile(db_path)
      print(f"DB {db_path} will be used instead.")

    else:
      tic = time.time()

      # The metadata.csv file will be used to fetch available files
      metadata_path = os.path.join(os.sep, "content", "kaggle_data", "metadata.csv")
      metadata_df = pd.read_csv(metadata_path)
      # The DOI isn't unique, then let's keep the last version of a duplicated paper
      metadata_df.drop_duplicates(subset=["doi"], keep="last", inplace=True)
      # Load usefull information to be stored: id, title, body, abstract, date, sha, folder
      articles_to_be_inserted = [(article, DB_FILE_NAME) for article in get_articles_to_insert(metadata_df)]
      # Create a new SQLite DB file
      instanciate_sql_db(db_path=DB_FILE_NAME)
      # Parallelize articles insertion
      with mp.Pool(os.cpu_count()) as pool:
          pool.map(insert_article, articles_to_be_inserted)

      toc = time.time()
      print(f"Took {round((toc-tic) / 60, 2)} min to insert {len(articles_to_be_inserted)} articles (SQLite DB: {db_path}).")

In [21]:
# Change load_file to False if you want to create the DB for the first time
create_db_and_load_articles(DB_FILE_NAME, load_file=False)

Took 6.39 min to insert 42440 articles (SQLite DB: articles_database_v2_02042020_embedding_100.sqlite).


## Load word embedding

In [22]:
embedding_model = Embedding(
    vectors_path=os.path.join(os.sep, "content", "glove_vectors", f"glove.6B.{EMBEDDING_DIMENSION}d.txt"),
    embeddings_dimension=EMBEDDING_DIMENSION,
    sentence_embedding_method=SENTENCE_EMBEDDING_METHOD
)
embedding_model.build_vectors_dictionary()

Took 1.52 min to load 330620 GloVe vectors (embedding dim: 100).


## Pre-process and vectorize texts

In [0]:
def pre_process_and_vectorize_texts(embedding_model: Embedding, db_path: str = "articles_database.sqlite", load_file: bool = True) -> None:
    """ Apply pre-processing to all loaded articles """

    if load_file is True:
      assert os.path.isfile(db_path)
      print(f"DB {db_path} will be used instead.")

    else:
      tic = time.time()

      # Get all previously inserted IDS as well as a pointer on embedding method
      ids = [(id_, embedding_model, db_path) for id_ in get_all_ids(db_path=db_path)]
      # For each title, abstract and body, pre-processed found data

      with picklable_pool(os.cpu_count()) as pool:
          pool.map(pre_process_articles, ids)

      toc = time.time()
      print(f"Took {round((toc-tic) / 60, 2)} min to pre-process {len(ids)} articles (SQLite DB: {db_path}).")

In [24]:
# Change load_file to False if you want to create the DB for the first time
pre_process_and_vectorize_texts(embedding_model, DB_FILE_NAME, load_file=False)



Took 59.44 min to pre-process 42440 articles (SQLite DB: articles_database_v2_02042020_embedding_100.sqlite).


## Query the DB

In [25]:
# Get sentences vectors to be matched with queries (stay in RAM, thus computed once).
sentences = get_sentences(db_path=DB_FILE_NAME)
sentences_vectors = [[float(x) for x in json.loads(sentence_vector[3])] for sentence_vector in sentences]
sentences_vectors = [vector for vector in sentences_vectors if np.nansum(vector) != 0]

print(f"Queries will be matched versus {len(sentences_vectors)} vectors.")

Queries will be matched versus 327426 vectors.


In [38]:
tic = time.time()

# Will be replaced by a textbox
query = "chloroquine usage coronavirus treatment"

# Vectorize it and format as arguments to be mapped by mp.Pool
query_vector = list(vectorize_query(query))
mapping_arguments = [(query_vector, sentence_vector) for sentence_vector in sentences_vectors]

# Execute
with mp.Pool(os.cpu_count()) as pool:
  distances_and_vectors = pool.map(compute_cosine_distance, mapping_arguments)

# Get results
distances = [item[0] for item in distances_and_vectors]
vectors = [item[1] for item in distances_and_vectors]

# Find closest
closest_sentence_index = distances.index(min(distances))
closest_vector = vectors[closest_sentence_index]
closest_vector_str = json.dumps([str(x) for x in closest_vector])

# Retrieve closest sentence
closest_sentence = query_db_for_sentence(vector=closest_vector_str, db_path=DB_FILE_NAME)

toc = time.time()

for sentence in closest_sentence:
  print(f"DOI:\t\t\t{sentence[0]}")
  print(f"SECTION:\t\t{sentence[1]}")
  print(f"SENTENCE:\t\t{sentence[2]}")
  print(f"VECTOR:\t\t\t{str(sentence[3])}")
  print(f"TIME:\t\t\t{round(toc - tic, 2)}")



DOI:			10.3760/cma.j.issn.1001-0939.2020.0019
SECTION:		title
SENTENCE:		["Expert", "consensus", "chloroquine", "phosphate", "treatment", "novel", "coronavirus", "pneumonia"]
VECTOR:			["-0.10206371428571429", "0.14450642857142856", "-0.11673999999999998", "0.2449471428571428", "-0.09503714285714285", "-0.06895071428571427", "0.2032712714285714", "-0.49277042857142855", "0.19871957142857144", "0.07751585714285714", "-0.21958442857142857", "0.055541999999999994", "0.09598342857142857", "-0.04432700000000001", "0.1801637142857143", "0.16684000000000002", "-0.3738714285714285", "-0.19058114285714287", "0.30563571428571434", "-0.29871455714285716", "-0.04745085714285713", "0.027170142857142865", "-0.27736802857142856", "0.13388771428571428", "0.031063", "0.2810084285714286", "0.36096885714285715", "-0.059184285714285696", "-0.26887742857142855", "0.0929812857142857", "0.16334857142857143", "0.18748", "-0.007528571428571456", "-0.19825714285714283", "0.09999771428571427", "-0.00140495857142

## Tests

In [70]:
connection = sqlite3.connect(DB_FILE_NAME)
cursor = connection.cursor()
cursor.execute("SELECT * FROM sentences")
res = cursor.fetchall()
cursor.close()
connection.close()

print(len(res))
print(res[42440])

344867
('10.1016/j.bbamcr.2014.08.004', 'abstract', '["The", "infected", "host", "cell", "detects", "trace", "amounts", "viral", "RNA", "last", "years", "revealed", "common", "principles", "biochemical", "mechanisms", "leading", "signal", "amplification", "required", "mounting", "powerful", "antiviral", "response"]', '["0.16382169545454547", "0.023354745454545446", "0.013891272727272698", "0.12843163636363636", "-0.16910123636363633", "-0.020394590909090904", "-0.10999465454545455", "-0.19730999999999999", "0.13107109090909094", "0.20234877590909092", "-0.13380909545454542", "0.0007616318181818354", "0.08235004545454545", "-0.13349909090909093", "0.29918350000000005", "0.12036464545454549", "-0.2348821363636363", "0.05956048181818181", "0.129445", "-0.20799213636363637", "-0.21862849999999998", "-0.29948413636363636", "-0.1809764545454546", "-0.06356311818181816", "-0.12140527272727274", "-0.014608181818181819", "0.23308814545454545", "0.12170824090909091", "-0.13597333636363634", "0.0

In [0]:
sentence_1 = ["cool", "concert", "guitar"]
sentence_2 = ["super", "piano", "song"]
sentence_3 = ["boat", "drugs", "corona"]

distance_1_2 = compute_cosine_distance(
    embeding_model.compute_sentence_vector(sentence_1),
    embeding_model.compute_sentence_vector(sentence_2)
)

distance_1_3 = compute_cosine_distance(
    embeding_model.compute_sentence_vector(sentence_1),
    embeding_model.compute_sentence_vector(sentence_3)
)

print(f"Distance between: '{' '.join(sentence_1)}' and '{' '.join(sentence_2)}': {distance_1_2}")
print(f"Distance between: '{' '.join(sentence_1)}' and '{' '.join(sentence_3)}': {distance_1_3}")