# COVID Q&A System

## Load Data

Load libraries and document corpus to use as context for the Q&A System.

In [1]:
# TO RUN ON GOOGLE COLAB

# Install transformers
#!pip install transformers
# For french
#!pip install --no-cache-dir transformers sentencepiece

# Change dir
#%cd drive/MyDrive/NLP/

In [2]:
# Import required libraries for excecution
import numpy as np
import pandas as pd
import random
import json
import csv
import os
import sys

# NLP
from gensim import corpora, models, similarities
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

# Import utils class
sys.path.insert(0,'../')
from utils import Utils

# Transformer Models
from transformers import pipeline

# Display
from tqdm import tqdm
from IPython.display import clear_output

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CESAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CESAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Path to directory of docs
path_prefix = r'D:\Cesard\Documents\NLP'
lang = 'es'

# Instanciate utils class
utils = Utils(path_prefix, num_workers=10)

# Load WHO and CDC Docs
WHO_CDC_text, _, WHO_CDC_titles = utils.data_loader(lang, 'WHO_CDC', total_data=None, max_size = None, return_titles = True )

# Load News
news_text, _, news_titles = utils.data_loader(lang, 'news', total_data=1000, max_size=None, return_titles = True)

Starting 10 threads to load 220 documents from WHO_CDC in es
Loaded 220 files in 0.33 seconds.
Removed 0 files becasuse they were too large
Starting 10 threads to load 1000 documents from news in es
Loaded 1000 files in 30.74 seconds.
Removed 0 files becasuse they were too large


In [4]:
# Append lists
doc_text = WHO_CDC_text #+ news_text
doc_titles = WHO_CDC_titles #+ news_titles

## Preprocessing

Standard preprocesseing to all documents and queries for the IR task.

In [5]:
# Stop Words
stop_words = stopwords.words('spanish')

# Stemmers
stem = SnowballStemmer('spanish')
#p_stem = PorterStemmer()

# Tokenizers
#tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
lemma = nltk.stem.WordNetLemmatizer()

# Create vocab (dictionary)
doc_dict = []
for doc in doc_text:
    doc_dict.append(utils.preprocessing(text=doc, stop_words = stop_words,
                                                  stemmer = None,
                                                  tokenizer = tk,
                                                  lemmatizer = lemma))
# Get dict
dictionary = corpora.Dictionary(doc_dict)

# Create doc corpus
doc_corpus = []
for doc in doc_text:
    doc_corpus.append(dictionary.doc2bow(utils.preprocessing(text=doc, stop_words = stop_words,
                                                                       stemmer = None,
                                                                       tokenizer = tk,
                                                                       lemmatizer = lemma)))
    
# Create title corpus
title_corpus = []
for title in doc_titles:
    title_corpus.append(dictionary.doc2bow(utils.preprocessing(text=title, stop_words = stop_words,
                                                                           stemmer = None,
                                                                           tokenizer = tk,
                                                                           lemmatizer = lemma)))

# Serializes and saves dictionary and corpus files
#dictionary.save('vocab.dict')
#corpora.MmCorpus.serialize("covid_qa_corpus.mm", doc_corpus)
#corpora.MmCorpus.serialize("covid_qa_title.mm", title_corpus)

In [6]:
# Load vocabulary, doc_corpus, query_corpus and df with tags
#vocabulary = corpora.Dictionary.load('vocab.dict')
#doc_corpus = corpora.MmCorpus("covid_qa_corpus.mm")

## Information Retrieval

### Create Index

In [7]:
# Create tfidf model for document corpus
tfidf = models.TfidfModel(doc_corpus)
tfidf_title = models.TfidfModel(title_corpus)

# Model transformation
print('Title Doc Example in tfidf form: ')
print(tfidf_title[doc_corpus][0]) 

# Similarity Matrix
index = similarities.MatrixSimilarity(tfidf[doc_corpus])
index_title = similarities.MatrixSimilarity(tfidf_title[title_corpus])

# Save index
#index.save('similarity_matrix.index')

Title Doc Example in tfidf form: 
[(0, 0.4533891098472363), (1, 0.022260663916602915), (2, 0.03410059404562253), (3, 0.2473078033004329), (4, 0.14636309914918055), (5, 0.17699739729504257), (6, 0.026523344241058795), (7, 0.47319269372308664), (8, 0.17691435678393277), (9, 0.30692274190265945), (10, 0.08645164073613239), (11, 0.22322660661206623), (12, 0.12112776838018058), (13, 0.04996133546219115), (14, 0.10374196888335883), (15, 0.13808681039256607), (16, 0.049746801728317705), (17, 0.08612599664338606), (18, 0.15885807781038297), (19, 0.19323745128960043), (20, 0.18879794974705807), (21, 0.20426392385489597), (22, 0.11262310547759184), (23, 0.08448621272719399), (24, 0.23407626506077184)]


### Querying

In [8]:
def perform_query(query, top_n = -1):
    """ Perform IR over the corpus with the provided query using gensim tfidf model.
    Args:
        query (str): raw query (from user input)
        top_n (int): max number of docs to retrieve
    Returns:
        context_doc_ids (list): List with the ids of the context docs
    """

    # Preprocess query
    processed_query = utils.preprocessing(query, 
                                          stop_words = stop_words,
                                          stemmer = None,
                                          tokenizer = tk,
                                          lemmatizer = lemma)

    # Similarity between all docs and query
    #sims = list(enumerate(index[tfidf[dictionary.doc2bow(processed_query)]]))

    # Similarity between all doc titles and query
    sims = list(enumerate(index_title[tfidf_title[dictionary.doc2bow(processed_query)]]))
    
    dtype = [('doc_id', int), ('score', float)]
    doc_sims = np.array(sims, dtype=dtype)
    
    # Sort Docs by similarity
    doc_sims_sorted = np.flip(np.sort(doc_sims, order='score'))

    # Retrieve only documents with non zero score
    k = len(np.nonzero(doc_sims['score'])[0])
    relevant_docs = doc_sims_sorted[0:k]
    
    # Print only top docs
    context_doc_ids = relevant_docs['doc_id'][0:top_n]
    
    return context_doc_ids

## Deep Model

Load deep pre-trained model for Q&A to extract the answer from the context.

In [9]:
# Load pre-trained Q&A Model for lang
if lang == 'en':
    covid_qa = pipeline("question-answering", model='deepset/roberta-base-squad2-covid')
elif lang == 'es':
    covid_qa = pipeline("question-answering", model='mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es')
elif lang == 'fr':
    covid_qa = pipeline('question-answering', model='fmikaelian/camembert-base-fquad', tokenizer='fmikaelian/camembert-base-fquad', use_fast = False)
else:
    print('Not supported language')

In [10]:
def perform_qa(query, context_ids, confidence_th = 0.05):
    """ Run QA Deep model with query and context from param.
    Args:
        query (str): raw query (from user input)
        context_id (int): document id with the context.
    Returns:
        result (dir): Answer provided by the model with confidence score.
        k (int): Number of context documents found
    """

    # Get context from retrieved docs
    for k, id in enumerate(context_ids):
        # Get context doc
        context = doc_titles[id] + ' ' + doc_text[id]

        # Perform QA
        result = covid_qa(question=query, context=context)

        # If score is good enough return result
        if result['score'] > confidence_th:
            return result, k
    
    # Default response if not good enough answer nor context was found
    if lang == 'en':
        if not len(context_ids):
            result = {'answer': 'The context of that question is outside of my domain', 'score': 0}
        else:
            result = {'answer': 'Sorry, the answer to that question was not found', 'score': 0}
    elif lang == 'es':
        if not len(context_ids):
            result = {'answer': 'El contexto de esa pregunta esta fuera de mi dominio', 'score': 0}
        else:
            result = {'answer': 'Disculpe, la respuesta a esa pregunta no se encontró', 'score': 0}
    elif lang == 'fr':
        if not len(context_ids):
            result = {'answer': 'Le contexte de cette question est en dehors de mon domaine', 'score': 0}
        else:
            result = {'answer': 'Pardon, la réponse à cette question n\'a pas été trouvée', 'score': 0}
    else:
        print('Not supported language')

    return result, 0


## Q&A Demo

Demo of the two stage system (IR and Q&A) with some example questions or input from user

In [11]:
# Example questions 
q_en = ["What is COVID-19?",
        "How does COVID-19 spreads?",
        "What symptoms does covid-19 causes?", 
        "Which kind of mask should I use to protect me from covid?",
        "How should I protect from covid?",
        "Should I get a vaccine for covid-19?"]

q_es = ["¿Qué es el COVID-19?",
        "¿Cómo se propaga el virus?",
        "¿Cuales son los síntomas del covid-19?", 
        "¿Qué tipo de máscara debería utilizar para protegerme del covid?",
        "¿Cómo protegerse del covid?",
        "¿Debería vacunarme si ya tuve covid-19?",
        "¿La vacuna de Astrazeneca causa trombos?",
        "Qué distancia debo mantener para protegerme del covid?"]

q_fr = ["Qu'est-ce que COVID-19?",
        "Comment le COVID-19 se propage-t-il?",
        "Quels symptômes le covid-19 provoque-t-il?",
        "Quel type de masque dois-je utiliser pour me protéger du covid?",
        "Comment dois-je me protéger de Covid?",
        "Dois-je me faire vacciner contre le covid-19?"]

In [12]:
# Select one random question
q = q_es[0]

# Use input from user
#q = input('COVID Q&A: ')

print(f"Q: {q}")
print('')

# Information Retrieval (get context)
context_ids = perform_query(q)

# Question Answering
result, k = perform_qa(q, context_ids, confidence_th = 0.1)

# Print results
print(f"Context Docs (Ranked): ")
for i, id in enumerate(context_ids):
    print(f"R{i}: {doc_titles[id]}")
    if i >= k:
        print(doc_text[id])
        break
print('')

print(f"A: {result['answer']}")
print(f"Score: {result['score']}")
print(f"Context Doc used: R{k}")

Q: ¿Qué es el COVID-19?

Context Docs (Ranked): 
R0: ¿Qué es el COVID-19​​​​​​​?
El COVID-19 es una enfermedad causada por un virus llamado SARS-CoV-2. La mayoría de las personas con COVID-19 tienen  síntomas  leves, pero algunas personas pueden enfermarse gravemente. Aunque la mayoría de las personas con COVID-19 mejora al cabo de unas semanas de haber estado enfermas, algunas personas experimentan afecciones posteriores al COVID-19. Las  afecciones posteriores al COVID-19  son una amplia variedad de problemas de salud nuevos, recurrentes o en curso que las personas pueden experimentar  más de cuatro semanas  después de haberse infectado por primera vez por el virus que causa el COVID-19. Las personas mayores y las personas que tienen  ciertas afecciones subyacentes  tienen mayor riesgo de enfermarse gravemente a causa del COVID-19.  Las vacunas  contra el COVID-19 son seguras y efectivas.

A: una enfermedad causada por un virus llamado SARS-CoV-2
Score: 0.6277936697006226
Context Doc