In [None]:
# pip install nltk --quiet

In [1]:
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

# Fetch the NLTK resources the notebook relies on (no-op when already present):
#   stopwords - backs the `stopwords` corpus imported above
#   punkt     - tokenizer models used by nltk.word_tokenize
#   wordnet   - dictionary WordNetLemmatizer looks words up in; without it the
#               lemmatization cells raise LookupError on a fresh environment
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HIMANGSHUB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HIMANGSHUB\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## read the dataset

In [3]:
# Load the raw question / answer / intent table and preview the first rows.
chatbot_df = pd.read_csv("medical-dataset.csv")
chatbot_df.head()

Unnamed: 0,Question,Answer,intent
0,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...,mental_health
1,Who does mental illness affect?,It is estimated that mental illness affects 1 ...,mental_health
2,What causes mental illness?,It is estimated that mental illness affects 1 ...,mental_health
3,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...,mental_health
4,Can people with mental illness recover?,"When healing from mental illness, early identi...",mental_health


In [4]:
# Rows per intent label, to see how balanced the classes are.
chatbot_df["intent"].value_counts()

covid            199
heart            130
diabetes         102
mental_health     98
nutrition         92
fitness           80
greeting          20
Name: intent, dtype: int64

In [5]:
# (rows, columns) of the loaded frame.
chatbot_df.shape

(721, 3)

# Separate the input (questions) and output (intent labels) for intent classification

In [6]:
# Questions are cleaned in place further down, so take an explicit writable
# copy: `.values` hands back a view of the frame's own storage (read-only under
# pandas Copy-on-Write), which would either corrupt `chatbot_df` or fail on
# assignment.
input_data = chatbot_df["Question"].to_numpy(copy=True)
# Intent labels are only read, never modified.
output_data = chatbot_df["intent"].to_numpy()

## data cleaning

### Remove unwanted patterns (list numbering, "Q:" labels, punctuation)

In [7]:
# Normalise every question in place.
# Patterns are raw strings (plain '\d' / '\w' are invalid escape sequences and
# trigger a warning on modern Python) and are compiled once, outside the loop.
numbering_pattern = re.compile(r'\d*[.]')      # "1." style numbering, and any other period
label_pattern = re.compile(r'\w*[:]')          # labels such as "Q:"
punctuation_pattern = re.compile(r'[^\w\s]')   # everything that is not a word char or whitespace

for idx in range(len(input_data)):
    sen = numbering_pattern.sub('', input_data[idx])
    sen = label_pattern.sub('', sen)
    sen = punctuation_pattern.sub('', sen)
    # NOTE: whitespace left behind by a removed prefix is kept as-is, so a
    # cleaned question may start with a space.
    input_data[idx] = sen.lower()

In [8]:
# Spot-check one cleaned question.
input_data[373]

' what are the symptoms of covid19'

In [9]:
# Keep one cleaned question as the sample for the tokenisation demo.
sentence = input_data[272]
sentence

' i was recently diagnosed with type 2 diabetes do i need to see an endocrinologist'

In [10]:
# Split the sample sentence into word tokens with NLTK's tokenizer.
# sentence = input_data[373]
word_tok = nltk.word_tokenize(sentence)
word_tok

['i',
 'was',
 'recently',
 'diagnosed',
 'with',
 'type',
 '2',
 'diabetes',
 'do',
 'i',
 'need',
 'to',
 'see',
 'an',
 'endocrinologist']

### function for data cleanup 

In [None]:
# Single Lancaster stemmer instance, used by cleanup().
stemmer = LancasterStemmer()

### Stemming 

In [None]:
def cleanup(sentence):
    """Tokenize `sentence`, stem each token, and return the stems joined by single spaces.

    Uses the module-level `stemmer` instance.
    """
    tokens = nltk.word_tokenize(sentence)
    return ' '.join(map(stemmer.stem, tokens))

In [None]:
# Stemmed version of every cleaned question, in the original order.
stemmed_input_data = [cleanup(question) for question in input_data]

In [None]:
# Preview the first stemmed question.
stemmed_input_data[0]

In [None]:
# train_test_dataset = pd.DataFrame({"Question":stemmed_input_data , "Intent":output_data})

### Lemmatization

In [None]:
# Single WordNet lemmatizer instance, used by lemma().
lemmatizer = WordNetLemmatizer()

In [None]:
def lemma(sentence):
    """Tokenize `sentence`, lemmatize each token, and return the lemmas joined by single spaces.

    Uses the module-level `lemmatizer` instance with its default settings.
    """
    tokens = nltk.word_tokenize(sentence)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [None]:
# Lemmatized version of every cleaned question, in the original order.
lemmetize_input_data = [lemma(question) for question in input_data]
# Show the first lemmatized question next to its cleaned source text.
print(lemmetize_input_data[0]," : " ,input_data[0])


## Create CSV files for training and testing the model

In [None]:
# train_test_dataset.to_csv("Cleaned Dataset/stemmed-data.csv" , index=False)

In [None]:
# lemma_dataset = pd.DataFrame({"Question":lemmetize_input_data , "Intent":output_data})
# lemma_dataset.to_csv("Cleaned Dataset/lemmatized-data.csv" , index=False)

## Clean the response (answer) text shown to the user after classification

In [11]:
# Work on an independent copy of the answers: they are rewritten during
# clean-up, and a Series taken straight from the frame may share storage with
# `chatbot_df`, so the raw data could be silently overwritten.
response_data = chatbot_df["Answer"].copy()
response_data

0      Mental illnesses are health conditions that di...
1      It is estimated that mental illness affects 1 ...
2      It is estimated that mental illness affects 1 ...
3      Symptoms of mental health disorders vary depen...
4      When healing from mental illness, early identi...
                             ...                        
716                                               Great!
717                                            All good!
718                              Hello,Nice to meet you!
719                Mention not.Happy to help ypu always!
720                          Bye, Hope to see you again!
Name: Answer, Length: 721, dtype: object

In [12]:
# How many answers contain an "A:" label at least once?
count = sum(1 for answer in response_data if re.search("A:", answer))
count

106

In [13]:
# Peek at one answer with non-breaking spaces (\xa0) turned into plain spaces.
s = response_data[225].replace('\xa0', ' ')
s

'Some individuals consider herbal or botanical supplements to be “natural.” However, herbal or botanical supplements are not proven to be safer. All dietary supplements, including herbal remedies, are not required to be reviewed for safety and effectiveness before being sold. Read 5 Tips: What Consumers Need To Know About Dietary Supplements from the National Center for Complementary and Integrative Health and talk to your doctor before starting any supplements. Find more information on Nutrition.gov’s Herbal Supplements page.'

In [14]:
# Try the clean-up steps on a single answer before applying them everywhere.
s = response_data[225]
s = s.replace('/', ' ')       # slashes act as word separators
s = s.replace('A:', '')       # drop the "A:" answer label
s = s.replace("â€™", '')     # mis-encoded apostrophe
# Collapse every whitespace run (spaces, \xa0, newlines, tabs) to ONE space.
# The earlier pattern ' [\s]' -> '' removed both characters and glued the
# neighbouring words together, and was an invalid escape in a non-raw string.
s = re.sub(r'\s+', ' ', s).strip()
s

'Some individuals consider herbal or botanical supplements to be “natural.” However, herbal or botanical supplements are not proven to be safer. All dietary supplements, including herbal remedies, are not required to be reviewed for safety and effectiveness before being sold. Read 5 Tips: What Consumers Need To Know About Dietary Supplements from the National Center for Complementary and Integrative Health and talk to your doctor before starting any supplements. Find more information on Nutrition.gov’s Herbal Supplements page.'

In [15]:
# Some answers contain the two-character text "\n" (backslash + n) rather than
# a real newline; show one of them with that sequence replaced by a space.
s = response_data[14].replace("\\n", " ")
s

'It is not uncommon for people to stop taking their medication when they feel their symptoms are under control. Others may choose to stop taking their medication because of its side effects, without realizing that most side effects can be effectively managed. While it may seem reasonable to stop taking the medication, the problem is that most often, the symptoms will return. If you or your child is taking medication, it is very important that you work together with your doctor before making decisions about any changes in your treatment. Another problem with stopping medication, particularly for stopping it abruptly, is that you may develop withdrawal symptoms that can be very unpleasant. If you and your doctor feel a trial off your medicine is a good idea, it is necessary to slowly decrease the dosage of medications so that these symptoms don’t occur. It is important that your doctor and pharmacist work together to make sure your medications are working safely and effectively. You shou

In [16]:
# Inspect one answer by position; this one contains a mis-encoded dash ("â€“").
response_data.iloc[171]

'Generally, cardio training such as swimming or running will burn the most calories per session. Weight training will help you burn more calories over the course of the day â€“ muscle burns more calories at rest than other tissues, including fat, and therefore speeds up your resting metabolism.Researchers at the University of Colorado have found that HIIT exercise burns 25%-30% more calories than a steady-state exercise session such as a run. Therefore, combining cardio, weights and HIIT-specific training alongside a diet that will leave you in a calorie deficit is the most effective route to weight loss.'

In [17]:
# ASCII punctuation characters, used as the starting point for deciding which
# symbols are allowed to stay in the answers.
import string
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# Every ASCII punctuation mark except  # $ / \ ~ ^ .
# This list is later passed to remove_punctuation() as the set of characters
# that function skips over (i.e. does not report).
required_removable = [mark for mark in punc if mark not in '#$/\\~^']

In [19]:
def remove_punctuation(response_data,arr):
    """Collect the distinct characters that are neither ASCII alphanumerics nor in `arr`.

    Despite its name, this function removes nothing: it only scans the text
    and reports which characters fall outside the accepted set.

    Parameters
    ----------
    response_data : iterable of str
        Texts to scan, character by character.
    arr : container of str
        Extra characters to accept (skip) in addition to a-z, A-Z and 0-9.

    Returns
    -------
    set of str
        Every character seen at least once that was not accepted.
    """
    def is_accepted(ch):
        # ASCII letters and digits are always fine; anything else must be in arr.
        return ('a' <= ch <= 'z') or ('A' <= ch <= 'Z') or ('0' <= ch <= '9') or (ch in arr)

    leftovers = set()
    for text in response_data:
        leftovers.update(ch for ch in text if not is_accepted(ch))
    return leftovers

In [20]:

# Distinct characters in the answers that are neither ASCII alphanumerics nor
# in `required_removable`, converted from a set to a list.
special_char = list(remove_punctuation(response_data,required_removable))

In [21]:
# Flatten the collected characters into one string, then drop the plain space
# so that ordinary spaces are not among the characters deleted from answers.
special_char = ''.join(special_char)
special_char_remove = special_char.replace(' ', '')
special_char_remove

'–¡”‚¬…\u200d×¢#$®—‘Â²\tƒ¹âœµóš\nºÃÅÆ/€\x9d\u202f\xa0“™’½\\'

### Final Cleanup of the answer data


In [22]:
# Deletion table for the unwanted characters; built once instead of per row.
_strip_table = str.maketrans('', '', special_char_remove)


def _clean_answer(text):
    """Return one answer with labels, stray symbols and surplus whitespace removed."""
    text = text.replace('/', ' ')      # slashes act as word separators
    text = text.replace('A:', '')      # drop the "A:" answer label
    text = text.replace('\\n', ' ')    # literal backslash-n sequences
    # Turn every whitespace run (spaces, newlines, tabs, non-breaking spaces)
    # into ONE space *before* deleting characters. Previously line breaks and
    # "space + whitespace" pairs were replaced by '' and non-breaking spaces
    # were deleted outright, which glued neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(_strip_table)
    # Deleting a symbol that stood between two spaces leaves a double space.
    return re.sub(r' {2,}', ' ', text).strip()


# Build a new Series rather than assigning element by element into a column
# taken from `chatbot_df`.
response_data = response_data.map(_clean_answer)
response_data

0      Mental illnesses are health conditions that di...
1      It is estimated that mental illness affects 1 ...
2      It is estimated that mental illness affects 1 ...
3      Symptoms of mental health disorders vary depen...
4      When healing from mental illness, early identi...
                             ...                        
716                                               Great!
717                                            All good!
718                              Hello,Nice to meet you!
719                Mention not.Happy to help ypu always!
720                          Bye, Hope to see you again!
Name: Answer, Length: 721, dtype: object

In [None]:
# Single-column frame holding the cleaned answers, for a quick preview.
answers_df = pd.DataFrame({"Answer":response_data})
answers_df.head()

### Save the clean data in csv file

In [None]:
# Column mapping for the final dataset: cleaned questions, cleaned answers,
# and the intent labels.
Datset_final = {
    "Question": input_data,
    "Answer": response_data,
    "Intent": output_data
}

In [None]:
# Assemble the cleaned columns into one DataFrame.
Final_dataset_df = pd.DataFrame(Datset_final)

In [None]:
# Write the cleaned dataset, creating the output folder first: to_csv does not
# create missing directories and fails if "Cleaned Dataset" does not exist.
final_dataset_path = Path("Cleaned Dataset") / "final-dataset.csv"
final_dataset_path.parent.mkdir(parents=True, exist_ok=True)
Final_dataset_df.to_csv(final_dataset_path, index=False)