# COVID Q&A System

## Load Data

Load libraries and document corpus to use as context for the Q&A System

In [None]:
# Install libraries
!pip install transformers
# For french
#!pip install --no-cache-dir transformers sentencepiece

In [2]:
# Move into the project folder: the local `utils` module and the corpus
# directory are referenced relative to the current working directory.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel presumably fails - confirm.
%cd drive/MyDrive/NLP/

/content/drive/MyDrive/NLP


In [3]:
# Import required libraries for execution
import numpy as np
import pandas as pd
import random
import json
import csv
import os
import sys

# NLP
from gensim import corpora, models, similarities
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

# Import utils class
#sys.path.insert(0,'../')
from utils import Utils

# Transformer Models
from transformers import pipeline

# Display
from tqdm import tqdm
from IPython.display import clear_output

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Location of the document corpus: <path_prefix>/<source>/<lang>
path_prefix = './'
source = 'WHO'
lang = 'es'

path = os.path.join(path_prefix, source, lang)
# os.listdir returns entries in arbitrary order; sorting keeps the document
# ids (positions in doc_list) stable from one run to the next.
files_list = sorted(os.listdir(path))

# Load list of documents
doc_list = []   # "<title> <text>" for every entry, used later as QA context
doc_title = []  # title only, parallel to doc_list
for file in files_list:
    # Read-only access is enough here, and an explicit UTF-8 encoding makes
    # accented characters decode the same way on every platform.
    with open(os.path.join(path, file), 'r', encoding='utf-8') as file_str:
        data_dict = json.load(file_str)

    # Each file holds parallel 'title' and 'text' lists
    for i, txt in enumerate(data_dict['text']):
        doc_list.append(f"{data_dict['title'][i]} {txt}")
        doc_title.append(f"{data_dict['title'][i]}")

print(f"Se cargaron {len(doc_list)} documentos")

Se cargaron 123 documentos


## Preprocessing

In [5]:
# Stop Words
stop_words = stopwords.words('spanish')

# Stemmers (created for experimentation; preprocessing below runs with stemmer=None)
stem = SnowballStemmer('spanish')
#p_stem = PorterStemmer()

# Tokenizers
#tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
# NOTE(review): WordNetLemmatizer is presumably English-oriented while the
# stop words above are Spanish - confirm this is the intended combination.
lemma = nltk.stem.WordNetLemmatizer()

# Instantiate utils class
utils = Utils(path_prefix, num_workers=10)


def _preprocess(text):
    """Tokenize/clean `text` with the notebook-wide preprocessing settings.

    Returns the tokens as a list so they can be iterated more than once.
    """
    return list(utils.preprocessing(text=text, stop_words=stop_words,
                                    stemmer=None,
                                    tokenizer=tk,
                                    lemmatizer=lemma))


# Create vocab (dictionary): preprocess every document exactly once
doc_dict = [_preprocess(doc) for doc in doc_list]

# Get dict
dictionary = corpora.Dictionary(doc_dict)

# Create doc corpus, reusing the tokens computed above instead of running the
# preprocessing a second time.
# Assumes utils.preprocessing is deterministic for a given text - TODO confirm.
doc_corpus = [dictionary.doc2bow(tokens) for tokens in doc_dict]

# Create title corpus (bag-of-words over the same vocabulary)
title_corpus = [dictionary.doc2bow(_preprocess(title)) for title in doc_title]

# Serializes and saves dictionary and corpus files
#dictionary.save('vocab.dict')
#corpora.MmCorpus.serialize("covid_qa_corpus.mm", doc_corpus)
#corpora.MmCorpus.serialize("covid_qa_title.mm", title_corpus)

In [156]:
# Load vocabulary, doc_corpus, query_corpus and df with tags
#vocabulary = corpora.Dictionary.load('vocab.dict')
#doc_corpus = corpora.MmCorpus("covid_qa_corpus.mm")

## Information Retrieval

### Create Index

In [6]:
# Fit one TF-IDF model on the full documents and one on the titles only
tfidf = models.TfidfModel(doc_corpus)
tfidf_title = models.TfidfModel(title_corpus)

# Model transformation
print('Doc Example (tfidf form): ')
print(tfidf[doc_corpus][0])

# Similarity Matrix
# num_features is set to the vocabulary size so that the width of each index
# does not depend on the highest term id that happens to occur in the corpus
# being indexed (the title corpus only covers part of the vocabulary).
index = similarities.MatrixSimilarity(tfidf[doc_corpus],
                                      num_features=len(dictionary))
index_title = similarities.MatrixSimilarity(tfidf_title[title_corpus],
                                            num_features=len(dictionary))

# Save index
#index.save('similarity_matrix.index')

Doc Example (tfidf form): 
[(0, 0.15230563029421557), (1, 0.17793545924244475), (2, 0.09669082367676309), (3, 0.09669082367676309), (4, 0.09669082367676309), (5, 0.15230563029421557), (6, 0.11168331251137473), (7, 0.13731314145960394), (8, 0.15230563029421557), (9, 0.13731314145960394), (10, 0.07541614344952798), (11, 0.08927081859867517), (12, 0.15230563029421557), (13, 0.15230563029421557), (14, 0.06906180368840908), (15, 0.08927081859867517), (16, 0.15230563029421557), (17, 0.17210696712629106), (18, 0.15230563029421557), (19, 0.0536884108776402), (20, 0.09669082367676309), (21, 0.11842483934039395), (22, 0.13731314145960394), (23, 0.10598343279313906), (24, 0.048648500815834335), (25, 0.0830938222332551), (26, 0.08927081859867517), (27, 0.10598343279313906), (28, 0.17854163719735033), (29, 0.0830938222332551), (30, 0.18559002078432957), (31, 0.10104597239775717), (32, 0.17793545924244475), (33, 0.07317448578895563), (34, 0.07541614344952798), (35, 0.1421219894570678), (36, 0.202091

### Querying

In [7]:
def _rank_relevant_docs(scores, top_n=-1):
    """Return the ids of the documents with a non-zero score, best first.

    Args:
        scores: sequence of similarity scores, one per document; the position
            in the sequence is the document id.
        top_n: maximum number of ids to return. ``None`` or any negative
            value means "no limit".

    Returns:
        1-D numpy array of document ids ordered by decreasing score.
    """
    dtype = [('doc_id', int), ('score', float)]
    doc_sims = np.array(list(enumerate(scores)), dtype=dtype)

    # Ascending sort on 'score', then reversed so the best match comes first
    doc_sims_sorted = np.flip(np.sort(doc_sims, order='score'))

    # Retrieve only documents with non zero score
    k = np.count_nonzero(doc_sims['score'])
    relevant_docs = doc_sims_sorted[:k]

    # A plain [0:top_n] slice with the default top_n=-1 would silently drop
    # the last relevant document, so the limit is applied only when it is a
    # real (non-negative) count.
    if top_n is not None and top_n >= 0:
        relevant_docs = relevant_docs[:top_n]

    return relevant_docs['doc_id']


def perform_query(query, top_n = -1):
    """Retrieve the documents whose title is most similar to `query`.

    The query goes through the same preprocessing as the corpus, is turned
    into a bag-of-words with the shared dictionary and is scored against the
    title TF-IDF index.

    Args:
        query: question text.
        top_n: maximum number of document ids to return; ``-1`` (default),
            ``None`` or any negative value returns every relevant document.

    Returns:
        numpy array with the ids (positions in ``doc_list``) of the documents
        with a non-zero similarity, ordered from most to least similar.
    """
    # Preprocess query
    processed_query = utils.preprocessing(query,
                                          stop_words = stop_words,
                                          stemmer = None,
                                          tokenizer = tk,
                                          lemmatizer = lemma)
    query_bow = dictionary.doc2bow(processed_query)

    # Similarity between all docs and query (alternative to the title index)
    #scores = index[tfidf[query_bow]]

    # Similarity between all doc titles and query
    scores = index_title[tfidf_title[query_bow]]

    return _rank_relevant_docs(scores, top_n)

## Deep Model

In [8]:
# Pre-trained Q&A model (pipeline keyword arguments) for each supported language
qa_pipeline_kwargs = {
    'en': {'model': 'deepset/roberta-base-squad2-covid'},
    'es': {'model': 'mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'},
    'fr': {'model': 'fmikaelian/camembert-base-fquad',
           'tokenizer': 'fmikaelian/camembert-base-fquad',
           'use_fast': False},
}

if lang not in qa_pipeline_kwargs:
    # Fail here with a clear message instead of leaving `covid_qa` undefined
    # and hitting a NameError in a later cell.
    raise ValueError(f"Not supported language: {lang!r} "
                     f"(expected one of {sorted(qa_pipeline_kwargs)})")

# Only the model for the configured language is loaded
covid_qa = pipeline("question-answering", **qa_pipeline_kwargs[lang])

In [9]:
def perform_qa(query, context_ids, confidence_th = 0.01):
    """Answer `query` using the retrieved documents as context.

    The documents are tried in the given (ranked) order and the first answer
    whose score exceeds `confidence_th` is returned.

    Args:
        query: question text.
        context_ids: iterable of document ids (positions in ``doc_list``),
            best candidate first.
        confidence_th: minimum score for an answer to be accepted.

    Returns:
        Tuple ``(result, k)``. ``result`` is the dict returned by the QA
        pipeline for the accepted answer and ``k`` is the rank of the context
        document that produced it. When no answer is good enough, ``result``
        is ``{'answer': <apology in the configured language>, 'score': 0}``
        and ``k`` is 0; a score of 0 is what identifies this fallback.
    """
    # Get context from retrieved docs
    for k, doc_id in enumerate(context_ids):
        # Get context doc
        context = doc_list[doc_id]

        # Perform QA
        result = covid_qa(question=query, context=context)

        # If score is good enough return result
        if result['score'] > confidence_th:
            return result, k

    # Default response if not good enough answer was found
    not_found_messages = {
        'en': 'Sorry, the answer to that question was not found',
        'es': 'Disculpe, la respuesta a esa pregunta no se encontró',
        'fr': "Pardon, la réponse à cette question n'a pas été trouvée",
    }
    # Unknown languages fall back to the English message, so the function
    # always returns a well-formed result instead of an unbound or stale one.
    answer = not_found_messages.get(lang, not_found_messages['en'])

    return {'answer': answer, 'score': 0}, 0


## Q&A Demo

In [10]:
# Sample questions for the demo: one list per supported language
# (English, Spanish and French), six questions each.
q_en = [
    "What is COVID-19?",
    "How does COVID-19 spreads?",
    "What symptoms does covid-19 causes?",
    "Which kind of mask should I use to protect me from covid?",
    "How should I protect from covid?",
    "Should I get a vaccine for covid-19?",
]

q_es = [
    "¿Qué es el COVID-19?",
    "¿Cómo se transmite el covid-19?",
    "¿Cuales son los sintomas del covid-19?",
    "¿Qué tipo de máscara debería utilizar para protegerme del covid?",
    "¿Cómo protegerme del covid?",
    "¿Debería vacunarme para si ya tuve covid-19?",
]

q_fr = [
    "Qu'est-ce que COVID-19?",
    "Comment le COVID-19 se propage-t-il?",
    "Quels symptômes le covid-19 provoque-t-il?",
    "Quel type de masque dois-je utiliser pour me protéger du covid?",
    "Comment dois-je me protéger de Covid?",
    "Dois-je me faire vacciner contre le covid-19?",
]

In [12]:
# Pick an example question written in the configured language
example_questions = {'en': q_en, 'es': q_es, 'fr': q_fr}
q = example_questions[lang][1]

# Input question
#q = input('COVID Q&A: ')

print(f"Q: {q}")
print('')

# Information Retrieval (get context)
context_ids = perform_query(q)

# Question Answering
result, k = perform_qa(q, context_ids, confidence_th = 0.1)

# perform_qa's "not found" fallback carries score 0 (and rank 0), while an
# accepted answer always scores above the threshold.
answer_found = result['score'] > 0

# Print results: the ranked context docs up to (and including) the one used
print("Context Docs (Ranked): ")
for i, doc_id in enumerate(context_ids[:k + 1]):
    print(f"R{i}: {doc_list[doc_id]}")
print('')

print(f"A: {result['answer']}")
print(f"Score: {result['score']}")
if answer_found:
    print(f"Context Doc used: R{k}")
else:
    # Do not credit R0 for an answer that was never found
    print("Context Doc used: none")

Q: ¿Cómo se transmite el covid-19?

Context Docs (Ranked): 
R0: ¿Qué es la COVID-19? La COVID-19 es la enfermedad causada por el nuevo coronavirus conocido como SARS-CoV-2. La OMS tuvo noticia por primera vez de la existencia de este nuevo virus el 31 de diciembre de 2019, al ser informada de un grupo de casos de «neumonía vírica» que se habían declarado en Wuhan (República Popular China).
R1: ¿Cómo se propaga la COVID-19 entre las personas? La COVID-19 es una enfermedad causada por el virus SARS-CoV-2, que se propaga entre las personas principalmente cuando una persona infectada está en contacto cercano con otra persona. El virus se puede propagar a través de pequeñas partículas líquidas expulsadas por una persona infectada a través de la boca o la nariz al toser, estornudar, hablar, cantar o resoplar. Esas partículas líquidas tienen diferentes tamaños, desde las más grandes ‘gotículas respiratorias’ hasta las más pequeñas, llamadas ‘aerosoles’.  Otras personas pueden contraer la COVI