In [1]:
import sys
sys.path.insert(0, "../../../BERT-FAQ")

from parser.covidfaq import CovidFAQ_Parser
from shared.utils import dump_to_json
import pandas as pd

In [2]:
# read data as pandas DataFrame

df = pd.read_csv("../../../BERT-FAQ/data/CovidFAQ/aligned_question_question_answer.csv")
df

Unnamed: 0,question1,question2,answer,source,rating
0,How quickly can the global economy recover fro...,Can I get COVID-19 from animals when travellin...,Although the current spread and growth of the ...,JHU Public Health Canda Public Health Services,1.0
1,How quickly can the global economy recover fro...,How can I protect myself and others?,The best way to prevent illness from COVID-19 ...,JHU Public Health CDC,0.0
2,How quickly can the global economy recover fro...,Where did COVID-19 come from?,"It was first found in Wuhan City, Hubei Provin...",JHU Public Health CDC,0.0
3,How quickly can the global economy recover fro...,Can my pet or other animals get sick from COVI...,There is currently no evidence to suggest that...,JHU Public Health Canda Public Health Services,0.0
4,How quickly can the global economy recover fro...,How can I protect my child from COVID-19?,By having them practice the same things you ha...,JHU Public Health,0.0
...,...,...,...,...,...
24235,austrian coronavirus,What is novel (new) coronavirus?,"A novel, or new, coronavirus is one that has n...",JHU Public Health CDC,0.0
24236,bbc coronavirus espanol,What is a coronavirus?,Coronavirus are a type of virus - there are ma...,JHU Public Health WHO,0.0
24237,bbc coronavirus espanol,What is novel (new) coronavirus?,"A novel, or new, coronavirus is one that has n...",JHU Public Health CDC,0.0
24238,bbc coronavirus espanol,Do viruses get weaker over time? Will coronavi...,"They can, but it is not guaranteed; it depends...",JHU Public Health ScienceMag,0.0


In [3]:
# rename colnames question1: query_string, qustion2: question
df = df.rename(columns={'question1': 'query_string', 'question2': 'question'})
df

Unnamed: 0,query_string,question,answer,source,rating
0,How quickly can the global economy recover fro...,Can I get COVID-19 from animals when travellin...,Although the current spread and growth of the ...,JHU Public Health Canda Public Health Services,1.0
1,How quickly can the global economy recover fro...,How can I protect myself and others?,The best way to prevent illness from COVID-19 ...,JHU Public Health CDC,0.0
2,How quickly can the global economy recover fro...,Where did COVID-19 come from?,"It was first found in Wuhan City, Hubei Provin...",JHU Public Health CDC,0.0
3,How quickly can the global economy recover fro...,Can my pet or other animals get sick from COVI...,There is currently no evidence to suggest that...,JHU Public Health Canda Public Health Services,0.0
4,How quickly can the global economy recover fro...,How can I protect my child from COVID-19?,By having them practice the same things you ha...,JHU Public Health,0.0
...,...,...,...,...,...
24235,austrian coronavirus,What is novel (new) coronavirus?,"A novel, or new, coronavirus is one that has n...",JHU Public Health CDC,0.0
24236,bbc coronavirus espanol,What is a coronavirus?,Coronavirus are a type of virus - there are ma...,JHU Public Health WHO,0.0
24237,bbc coronavirus espanol,What is novel (new) coronavirus?,"A novel, or new, coronavirus is one that has n...",JHU Public Health CDC,0.0
24238,bbc coronavirus espanol,Do viruses get weaker over time? Will coronavi...,"They can, but it is not guaranteed; it depends...",JHU Public Health ScienceMag,0.0


In [4]:
df.shape

(24240, 5)

In [5]:
# check all rating scores
df.rating.value_counts().sort_index(ascending=False).to_frame()

Unnamed: 0,rating
100.000000,1525
99.833333,1
99.714286,1
99.500000,1
99.333333,1
...,...
1.333333,2
1.000000,358
0.500000,24
0.142857,1


In [6]:
# keep only related question, query_string, answer for FAQ (rating = 100)

df_by_rating = df[df['rating'] == 100]
df_by_rating

Unnamed: 0,query_string,question,answer,source,rating
5,What does Corona and Covid mean?,What is COVID-19?,COVID-19 is a new coronavirus that we have not...,JHU Public Health,100.0
10,when will the social distancing end? and what ...,What is a coronavirus?,Coronavirus are a type of virus - there are ma...,JHU Public Health WHO,100.0
21,How do I go grocery shopping?,Any advice about how to minimize risk during g...,"Currently, there is no evidence of food or foo...",JHU Public Health FDA,100.0
28,what is covid 19,What is COVID-19?,COVID-19 is a new coronavirus that we have not...,JHU Public Health,100.0
33,What is COVID-19's definition?,What is COVID-19?,COVID-19 is a new coronavirus that we have not...,JHU Public Health,100.0
...,...,...,...,...,...
24178,will coronavirus stop in summer,Will warm weather / summer / heat stop outbrea...,"We do not know. Some viruses, like the common ...",JHU Public Health,100.0
24183,will coronavirus stop in the summer,Will warm weather / summer / heat stop outbrea...,"We do not know. Some viruses, like the common ...",JHU Public Health,100.0
24200,will coronavirus survive in the summer,Will warm weather / summer / heat stop outbrea...,"We do not know. Some viruses, like the common ...",JHU Public Health,100.0
24202,will coronavirus survive on surfaces,"How long does the virus live on surfaces, like...",A recent study shows that the virus can live i...,JHU Public Health JHU,100.0


In [7]:
# drop null values
df_by_rating.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_by_rating.dropna(inplace=True)


In [8]:
df_by_rating.shape

(1403, 5)

In [9]:
df_by_rating.rating.value_counts()

100.0    1403
Name: rating, dtype: int64

In [10]:
# create instance of CovidFAQ_Parser and generate query_answer_pairs
covidfaq_parser = CovidFAQ_Parser()
covidfaq_parser.extract_data(df_by_rating)

In [11]:
unique_questions = df_by_rating.question.unique()
len(unique_questions)

68

In [12]:
unique_queries = df_by_rating.query_string.unique()
len(unique_queries)

1078

In [13]:
# get faq pairs
faq_pairs = covidfaq_parser.faq_pairs

In [14]:
faq_pairs[:10]

[{'label': 1,
  'query_type': 'faq',
  'question': 'What is COVID-19? ',
  'answer': 'COVID-19 is a new coronavirus that we have not seen previously; it is not the same as the flu or common cold. Coronaviruses are types of viruses that cause illnesses. These include the common cold, flu, to more severe diseases like Middle East Respiratory Syndrom (MERs-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV).   [[Would you like to know more about COVID-19 symptoms?]]',
  'id': '1'},
 {'label': 1,
  'query_type': 'faq',
  'question': 'What is a coronavirus?',
  'answer': 'Coronavirus are a type of virus - there are many kinds, and some of them can cause disease. Examples of coronaviruses include the common cold, flu, to more severe diseases like Middle East Respiratory Syndrome (MERS-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV).   COVID19 is a novel coronavirus that has been recently identified.   [[Would you like more information on COVID19?]] [[Would you like to know the sympto

In [15]:
covidfaq_parser.num_faq_pairs

68

In [16]:
# get user_query_pairs
user_query_pairs = covidfaq_parser.user_query_pairs

In [17]:
user_query_pairs[:10]

[{'label': 1,
  'query_type': 'user_query',
  'question': 'What does Corona and Covid mean?',
  'answer': 'COVID-19 is a new coronavirus that we have not seen previously; it is not the same as the flu or common cold. Coronaviruses are types of viruses that cause illnesses. These include the common cold, flu, to more severe diseases like Middle East Respiratory Syndrom (MERs-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV).   [[Would you like to know more about COVID-19 symptoms?]]',
  'id': '69'},
 {'label': 1,
  'query_type': 'user_query',
  'question': 'when will the social distancing end? and what is the economic consequent of the pandemic?',
  'answer': 'Coronavirus are a type of virus - there are many kinds, and some of them can cause disease. Examples of coronaviruses include the common cold, flu, to more severe diseases like Middle East Respiratory Syndrome (MERS-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV).   COVID19 is a novel coronavirus that has been recently i

In [18]:
covidfaq_parser.num_user_query_pairs

1387

In [19]:
# get query_answer_pairs
query_answer_pairs = covidfaq_parser.query_answer_pairs

In [20]:
query_answer_pairs[:10]

[{'label': 1,
  'query_type': 'faq',
  'question': 'What is COVID-19? ',
  'answer': 'COVID-19 is a new coronavirus that we have not seen previously; it is not the same as the flu or common cold. Coronaviruses are types of viruses that cause illnesses. These include the common cold, flu, to more severe diseases like Middle East Respiratory Syndrom (MERs-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV).   [[Would you like to know more about COVID-19 symptoms?]]',
  'id': '1'},
 {'label': 1,
  'query_type': 'faq',
  'question': 'What is a coronavirus?',
  'answer': 'Coronavirus are a type of virus - there are many kinds, and some of them can cause disease. Examples of coronaviruses include the common cold, flu, to more severe diseases like Middle East Respiratory Syndrome (MERS-CoV) and Severe Acute Respiratory Syndrome (SARS-CoV).   COVID19 is a novel coronavirus that has been recently identified.   [[Would you like more information on COVID19?]] [[Would you like to know the sympto

In [21]:
covidfaq_parser.num_query_answer_pairs

1455

In [22]:
# Dump data to json file

dump_to_json(covidfaq_parser.query_answer_pairs, '../../../BERT-FAQ/data/CovidFAQ/query_answer_pairs.json', sort_keys=False)