# COVID Q&A System

## Load Data

Load libraries and document corpus to use as context for the Q&A System.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# TO RUN ON GOOGLE COLAB

# Install transformers
#!pip install transformers
# For french
!pip install --no-cache-dir transformers sentencepiece



In [2]:
# Change dir
%cd drive/MyDrive/NLP/

/content/drive/MyDrive/NLP


In [3]:
# Import required libraries for excecution
import numpy as np
import pandas as pd
import random
import json
import csv
import os
import sys

# NLP
from gensim import corpora, models, similarities
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

# Import utils class
sys.path.insert(0,'../')
from utils import Utils

# Transformer Models
from transformers import pipeline

# Display
from tqdm import tqdm
from IPython.display import clear_output

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Path to directory of docs
#path_prefix = r'D:\Cesard\Documents\NLP'
path_prefix = './'
lang = 'en'

# Instanciate utils class
utils = Utils(path_prefix, num_workers=10)

# Load WHO and CDC Docs
WHO_CDC_text, _, WHO_CDC_titles = utils.data_loader(lang, 'WHO_CDC', total_data=None, max_size = None, return_titles = True )

# Load News
#news_text, _, news_titles = utils.data_loader(lang, 'news', total_data=1000, max_size=None, return_titles = True)

Starting 10 threads to load 379 documents from WHO_CDC in en
Loaded 379 files in 0.21 seconds.
Removed 0 files becasuse they were too large


In [5]:
# Append lists
doc_text = WHO_CDC_text #+ news_text
doc_titles = WHO_CDC_titles #+ news_titles

## Preprocessing

Standard preprocesseing to all documents and queries for the IR task.

In [6]:
# Stop Words
stop_words = stopwords.words('english')

# Stemmers
stem = SnowballStemmer('english')
#p_stem = PorterStemmer()

# Tokenizers
#tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
lemma = nltk.stem.WordNetLemmatizer()

# Create vocab (dictionary)
doc_dict = []
for doc in doc_text:
    doc_dict.append(utils.preprocessing(text=doc, stop_words = stop_words,
                                                  stemmer = None,
                                                  tokenizer = tk,
                                                  lemmatizer = lemma))
# Get dict
dictionary = corpora.Dictionary(doc_dict)

# Create doc corpus
doc_corpus = []
for doc in doc_text:
    doc_corpus.append(dictionary.doc2bow(utils.preprocessing(text=doc, stop_words = stop_words,
                                                                       stemmer = None,
                                                                       tokenizer = tk,
                                                                       lemmatizer = lemma)))
    
# Create title corpus
title_corpus = []
for title in doc_titles:
    title_corpus.append(dictionary.doc2bow(utils.preprocessing(text=title, stop_words = stop_words,
                                                                           stemmer = None,
                                                                           tokenizer = tk,
                                                                           lemmatizer = lemma)))

# Serializes and saves dictionary and corpus files
#dictionary.save('vocab.dict')
#corpora.MmCorpus.serialize("covid_qa_corpus.mm", doc_corpus)
#corpora.MmCorpus.serialize("covid_qa_title.mm", title_corpus)

In [None]:
# Load vocabulary, doc_corpus, query_corpus and df with tags
#vocabulary = corpora.Dictionary.load('vocab.dict')
#doc_corpus = corpora.MmCorpus("covid_qa_corpus.mm")

## Information Retrieval

### Create Index

In [7]:
# Create tfidf model for document corpus
tfidf = models.TfidfModel(doc_corpus)
tfidf_title = models.TfidfModel(title_corpus)

# Model transformation
print('Title Doc Example in tfidf form: ')
print(tfidf_title[doc_corpus][0]) 

# Similarity Matrix
index = similarities.MatrixSimilarity(tfidf[doc_corpus])
index_title = similarities.MatrixSimilarity(tfidf_title[title_corpus])

# Save index
#index.save('similarity_matrix.index')

Title Doc Example in tfidf form: 
[(0, 0.29810380217412474), (1, 0.29590189482102736), (2, 0.10169570294846703), (3, 0.08291207436260775), (4, 0.14052562327935353), (5, 0.20264852039039022), (6, 0.059965122242362154), (7, 0.10701157626572981), (8, 0.6909548774420143), (9, 0.18561947990613278), (10, 0.23531621953192705), (11, 0.4040441059911029)]


### Querying

In [8]:
def perform_query(query, top_n = -1):
    """ Perform IR over the corpus with the provided query using gensim tfidf model.
    Args:
        query (str): raw query (from user input)
        top_n (int): max number of docs to retrieve
    Returns:
        context_doc_ids (list): List with the ids of the context docs
    """

    # Preprocess query
    processed_query = utils.preprocessing(query, 
                                          stop_words = stop_words,
                                          stemmer = None,
                                          tokenizer = tk,
                                          lemmatizer = lemma)

    # Similarity between all docs and query
    #sims = list(enumerate(index[tfidf[dictionary.doc2bow(processed_query)]]))

    # Similarity between all doc titles and query
    sims = list(enumerate(index_title[tfidf_title[dictionary.doc2bow(processed_query)]]))
    
    dtype = [('doc_id', int), ('score', float)]
    doc_sims = np.array(sims, dtype=dtype)
    
    # Sort Docs by similarity
    doc_sims_sorted = np.flip(np.sort(doc_sims, order='score'))

    # Retrieve only documents with non zero score
    k = len(np.nonzero(doc_sims['score'])[0])
    relevant_docs = doc_sims_sorted[0:k]
    
    # Print only top docs
    context_doc_ids = relevant_docs['doc_id'][0:top_n]
    
    return context_doc_ids

## Deep Model

Load deep pre-trained model for Q&A to extract the answer from the context.

In [9]:
# Load pre-trained Q&A Model for lang
if lang == 'en':
    covid_qa = pipeline("question-answering", model='deepset/roberta-base-squad2-covid')
elif lang == 'es':
    covid_qa = pipeline("question-answering", model='mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es')
elif lang == 'fr':
    covid_qa = pipeline('question-answering', model='fmikaelian/camembert-base-fquad', tokenizer='fmikaelian/camembert-base-fquad', use_fast = False)
else:
    print('Not supported language')

In [10]:
def perform_qa(query, context_ids, confidence_th = 0.05):
    """ Run QA Deep model with query and context from param.
    Args:
        query (str): raw query (from user input)
        context_id (int): document id with the context.
    Returns:
        result (dir): Answer provided by the model with confidence score.
        k (int): Number of context documents found
    """

    # Get context from retrieved docs
    for k, id in enumerate(context_ids):
        # Get context doc
        context = doc_titles[id] + ' ' + doc_text[id]

        # Perform QA
        result = covid_qa(question=query, context=context)

        # If score is good enough return result
        if result['score'] > confidence_th:
            return result, k
    
    # Default response if not good enough answer nor context was found
    if lang == 'en':
        if not len(context_ids):
            result = {'answer': 'The context of that question is outside of my domain', 'score': 0}
        else:
            result = {'answer': 'Sorry, the answer to that question was not found', 'score': 0}
    elif lang == 'es':
        if not len(context_ids):
            result = {'answer': 'El contexto de esa pregunta esta fuera de mi dominio', 'score': 0}
        else:
            result = {'answer': 'Disculpe, la respuesta a esa pregunta no se encontró', 'score': 0}
    elif lang == 'fr':
        if not len(context_ids):
            result = {'answer': 'Le contexte de cette question est en dehors de mon domaine', 'score': 0}
        else:
            result = {'answer': 'Pardon, la réponse à cette question n\'a pas été trouvée', 'score': 0}
    else:
        print('Not supported language')

    return result, 0


## Q&A Demo

Demo of the two stage system (IR and Q&A) with some example questions or input from user

In [11]:
# Use input from user
print('-------- COVID Q&A -------')
q = input('Q: ')
print('')

# Information Retrieval (get context)
context_ids = perform_query(q)

# Question Answering
result, k = perform_qa(q, context_ids, confidence_th = 0.1)

# Print results
print(f"Context Docs (Ranked): ")
for i, id in enumerate(context_ids):
    print(f"R{i}: {doc_titles[id]}")
    if i >= k:
        print(doc_text[id])
        break
print('')

print(f"A: {result['answer']}")
print(f"Score: {result['score']}")
print(f"Context Doc used: R{k}")

-------- COVID Q&A -------
Q: What is Covid-19?

Context Docs (Ranked): 
R0: What is COVID-19?
COVID-19 is a disease caused by a virus called SARS-CoV-2. Most people with COVID-19 have mild  symptoms , but some people can become severely ill. Although most people with COVID-19 get better within weeks of illness, some people experience post-COVID conditions.  Post-COVID conditions  are a wide range of new, returning, or ongoing health problems people can experience  more than four weeks  after first being infected with the virus that causes COVID-19. Older people and those who have  certain underlying medical conditions  are more likely to get severely ill from COVID-19.  Vaccines  against COVID-19 are safe and effective.

A: a disease caused by a virus called SARS-CoV-2
Score: 0.6684508323669434
Context Doc used: R0


## Q&A Simple Test

Evaluate performance on small hand-made set of questions.

In [12]:
# Set pandas columns option
pd.set_option('display.max_colwidth', 200)
pd.set_option("max_rows", 30)

In [13]:
# Functions to compute evaluation metrics

def compute_exact_match(prediction, truth):
    em = int(utils.preprocessing(text=prediction, stop_words = stop_words, stemmer = None, tokenizer = tk, lemmatizer = lemma) == utils.preprocessing(text=truth, stop_words = stop_words, stemmer = None, tokenizer = tk, lemmatizer = lemma))
    return em


def compute_f1(prediction, truth):
    pred_tokens = utils.preprocessing(text=prediction, 
                                      stop_words = stop_words,
                                      stemmer = None,
                                      tokenizer = tk,
                                      lemmatizer = lemma)
    truth_tokens = utils.preprocessing(text=truth, 
                                       stop_words = stop_words,
                                       stemmer = None,
                                       tokenizer = tk,
                                       lemmatizer = lemma)
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if pred_tokens == truth_tokens:
        return 1
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [14]:
# Load hand-made dataset
df_test = pd.read_csv(f'qa_{lang}.csv', sep=';', encoding='latin-1')
df_test.head()

Unnamed: 0,question,real_answer
0,What is COVID-19?,is the disease caused by a new coronavirus called SARS-CoV-2
1,How does COVID-19 spreads?,through close contact from person to person
2,What are covid-19 symptoms?,Fever Dry cough Fatigue
3,What should I do if I have COVID-19 symptoms?,call your health care provider or COVID-19 hotline
4,What kind of mask should be used by the general public?,"Non-medical, fabric masks"


In [15]:
# Metrics
exact_match = []
f1_score = []
predicted_answer = []
docs_retrieved = []

for i, q in enumerate(df_test['question']):
    print(f"Answering Q{i}: {q}")
    # Information Retrieval (get context)
    context_ids = perform_query(q)

    # Question Answering
    result, k = perform_qa(q, context_ids, confidence_th = 0.1)

    # Compute metrics
    docs_retrieved.append(k)
    exact_match.append(compute_exact_match(result['answer'], df_test['real_answer'][i]))
    f1_score.append(compute_f1(result['answer'], df_test['real_answer'][i]))
    predicted_answer.append(result['answer'])

clear_output()

In [18]:
# Add results to dataframe
df_test['predicted_answer'] = predicted_answer
df_test['exact_match'] = exact_match
df_test['f1_score'] = f1_score
df_test['docs_retrieved'] = docs_retrieved
df_test.head(10)

Unnamed: 0,question,real_answer,predicted_answer,exact_match,f1_score,docs_retrieved
0,What is COVID-19?,is the disease caused by a new coronavirus called SARS-CoV-2,a disease caused by a virus called SARS-CoV-2,0,0.380952,0
1,How does COVID-19 spreads?,through close contact from person to person,when a person with COVID-19 coughs or exhales,0,0.4,1
2,What are covid-19 symptoms?,Fever Dry cough Fatigue,Fever Dry cough Fatigue,1,1.0,0
3,What should I do if I have COVID-19 symptoms?,call your health care provider or COVID-19 hotline,seek medical advice,0,0.228571,1
4,What kind of mask should be used by the general public?,"Non-medical, fabric masks",medical mask,0,0.571429,18
5,Can adolescents catch COVID-19?,Yes. All age groups can catch COVID-19.,Yes. All age groups can catch COVID-19.,1,1.0,0
6,What distance should I mantain to prevent covid?,6 feet,3 or more days per week,0,0.133333,0
7,Can my dog spread COVID-19?,the risk of animals spreading COVID-19 to people is considered to be low,a disease caused by a virus called SARS-CoV-2,0,0.255319,18
8,Is there a COVID-19 vaccine?,Yes there are now several vaccines that are in use.,Yes there are now several vaccines that are in use.,1,1.0,1
9,Should I get vaccinated if I have had covid?,you should be vaccinated when it is offered to you,Pregnant and recently pregnant people,0,0.222222,37


In [19]:
print('AVG RESULTS')
print(f"Exact Match: {df_test['exact_match'].mean()}")
print(f"F1 Score: {df_test['f1_score'].mean()}")
print(f"Docs Retrieved: {df_test['docs_retrieved'].mean()}")

AVG RESULTS
Exact Match: 0.3
F1 Score: 0.5191827085444107
Docs Retrieved: 7.6
