In [25]:
import numpy as np
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("covid_faq.csv")

In [3]:
df.head()

Unnamed: 0,questions,answers
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza..."
2,How does the virus spread?,The virus that causes COVID-19 is thought to s...
3,Can I get COVID-19 from food (including restau...,Currently there is no evidence that people can...
4,Will warm weather stop the outbreak of COVID-19?,It is not yet known whether weather and temper...


## Cleaning the Data

In [4]:
import nltk    
nltk.download('punkt')    #divides sentence into list of sentences
nltk.download('averaged_perceptron_tagger')   #tagging words with their parts of speech
nltk.download('wordnet')                # a lexical database for the English language
from nltk import word_tokenize
    # Word Tokenization ---> It makes tokens(individual words)
from nltk.stem import wordnet     # To perform Lemmitization
from nltk import pos_tag          # For parts f speech

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# ----------------------------Text Normalization-----------------------------------#

def lemmatiztaion(text):
    text = str(text).lower()  # Converted into lower case
    
    spl_char_text = re.sub(r'[^a-z]',' ',text) # removed numbers and special characters
    
    tokens = word_tokenize(spl_char_text)   # word tokenization
    
    lemma = wordnet.WordNetLemmatizer()    # initalize lemmatizer
    
    tags_list = pos_tag(tokens, tagset= None) # Parts of speech
    
    # Created empty list
    
    lemma_words= []
    
    for token, pos_token in tags_list:
        
        if pos_token.startswith('V'):     # verb
            pos_val = 'v'
        elif pos_token.startswith('J'):   # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):   # adverb
            pos_val = 'r'
        else:    # any parts of speech except verb, adjective, adverb
            pos_val ='n'
            
        lemma_token = lemma.lemmatize(token, pos_val)   # computing
        lemma_words.append(lemma_token) # append values in list
        
            
    return " ".join(lemma_words)

In [6]:
df.head(3)

Unnamed: 0,questions,answers
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza..."
2,How does the virus spread?,The virus that causes COVID-19 is thought to s...


In [7]:
df['lemmatize_text'] = df.questions.apply(lemmatiztaion)

In [8]:
df.head(15)

Unnamed: 0,questions,answers,lemmatize_text
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what be a novel coronavirus
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",why be the disease be call coronavirus disease...
2,How does the virus spread?,The virus that causes COVID-19 is thought to s...,how do the virus spread
3,Can I get COVID-19 from food (including restau...,Currently there is no evidence that people can...,can i get covid from food include restaurant t...
4,Will warm weather stop the outbreak of COVID-19?,It is not yet known whether weather and temper...,will warm weather stop the outbreak of covid
5,What is community spread?,Community spread means people have been infect...,what be community spread
6,Can mosquitoes or ticks spread the virus that ...,"At this time, CDC has no data to suggest that ...",can mosquitoes or tick spread the virus that c...
7,How can I protect myself?,Visit the How to Protect Yourself & Others pag...,how can i protect myself
8,Does CDC recommend the use of masks to prevent...,Wear masks in public settings when around peop...,do cdc recommend the use of mask to prevent covid
9,Is it safe to get care for my other medical co...,It is important to continue taking care of you...,be it safe to get care for my other medical co...


In [9]:
#stop words
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
#example
Question = "What is corona virus?"

In [12]:
Q= []
a = Question.split()
for i in a:
    if i in stop:
        continue
    else:
        Q.append(i)
    b =" ".join(Q)

In [13]:
Question_lemma = lemmatiztaion(b)   # applying function for text normalization

In [14]:
b

'What corona virus?'

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf = TfidfVectorizer()  # initiallizing Tfidf

In [17]:
trans_tfidf = tfidf.fit_transform(df['questions']).toarray()

In [18]:
data_tfidf = pd.DataFrame(trans_tfidf, columns= tfidf.get_feature_names())
data_tfidf.head(3)

Unnamed: 0,14,19,2019,about,access,actions,added,additional,adjusting,adopt,...,why,wild,will,with,work,workers,worn,worry,worse,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.104392,0.332053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.286723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
Question_tfidf = tfidf.transform([Question_lemma]).toarray()


In [20]:
from sklearn.metrics import pairwise_distances

In [21]:
cosine = 1 - pairwise_distances(data_tfidf, Question_tfidf, metric = 'cosine')   # applying cosine similarity

In [22]:
def chatter(text):
    lemma = lemmatiztaion(text) # calling the funtion 
    tf = tfidf.transform([lemma]).toarray()
    cosine = 1 - pairwise_distances(data_tfidf, tf, metric = 'cosine')   # applying cosine similarity
    index_value_new = cosine.argmax()
    return df['answers'].loc[index_value_new]

In [23]:
chatter("Does covid will spread due to animals?")

'CDC does not have any evidence to suggest that imported animals or animal products pose a risk for spreading COVID-19 in the United States. This is a rapidly evolving situation and information will be updated as it becomes available. CDC, the U. S. Department of Agriculture (USDA), and the U.S. Fish and Wildlife Service (FWS) play distinct but complementary roles in regulating the importation of live animals and animal products into the United States. CDC regulates animals and animal products that pose a threat to human health, USDA regulateexternal icon animals and animal products that pose a threat to agriculture; and FWS regulatesexternal icon importation of endangered species and wildlife that can harm the health and welfare of humans, the interests of agriculture, horticulture, or forestry, and the welfare and survival of wildlife resources.'

In [24]:
chatter("Is it safe to get care for my other medical co...")

'It is important to continue taking care of your health and wellness. Continue your medications, and do not change your treatment plan without talking to your healthcare provider. Continue to manage your disease the way your healthcare provider has told you. Have at least a 2-week supply of all prescription and non-prescription medications. Talk to your healthcare provider about whether your vaccinations are up-to-date. Call your healthcare provider if you have any concerns about your medical conditions, or if you get sick. to find out about different ways you can connect with your healthcare provider for chronic disease management or other conditions. Do not delay getting emergency care for your health problems or any health condition that requires immediate attention. If you need emergency help, call 911. Emergency departments have infection prevention plans to protect you from getting COVID-19 if you need care for your medical condition. Continue to practice everyday prevention. Was