In [None]:
# pip install nltk --quiet

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder as LE
from sklearn.metrics.pairwise import cosine_similarity
import random
import nltk
import re

In [28]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HIMANGSHUB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HIMANGSHUB\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## read the dataset

In [4]:
chatbot_df = pd.read_csv("medical-dataset.csv")

In [5]:
chatbot_df["intent"].value_counts()

covid            199
heart            130
diabetes         102
mental_health     98
nutrition         92
fitness           80
greeting          20
Name: intent, dtype: int64

In [6]:
chatbot_df.shape

(721, 3)

# Separates out input and output data for intent classification

In [7]:
input_data = chatbot_df["Question"].values
output_data = chatbot_df["intent"].values

## data cleaning

### remove some pattern 

In [8]:
for sentence in range(len(input_data)):
    sen = re.sub('\d*[.]', '', input_data[sentence]) # remove "1." etc
    sen = re.sub('\w*[:]', '', sen) # remove "Q:"
    sen = re.sub(r'[^\w\s]', '', sen) # remove punctuations
    sen = sen.lower()
    input_data[sentence] = sen

In [9]:
input_data[373]

' what are the symptoms of covid19'

In [10]:
sentence = input_data[272]
sentence

' i was recently diagnosed with type 2 diabetes do i need to see an endocrinologist'

In [11]:
# sentence = input_data[373]
word_tok = nltk.word_tokenize(sentence)
word_tok

['i',
 'was',
 'recently',
 'diagnosed',
 'with',
 'type',
 '2',
 'diabetes',
 'do',
 'i',
 'need',
 'to',
 'see',
 'an',
 'endocrinologist']

### function for data cleanup 

In [14]:
stemmer = LancasterStemmer()

### Stemming 

In [15]:
def cleanup(sentence):
    word_tok = nltk.word_tokenize(sentence)
    stemmed_words = [stemmer.stem(w) for w in word_tok]

    return ' '.join(stemmed_words)

In [16]:
stemmed_input_data = []
for question in input_data:
    stemmed_input_data.append(cleanup(question))

In [17]:
stemmed_input_data[0]

'what doe it mean to hav a ment il'

In [18]:
train_test_dataset = pd.DataFrame({"Question":stemmed_input_data , "Intent":output_data})

### Lemmatization

In [31]:
lemmatizer = WordNetLemmatizer()

In [32]:
def lemma(sentence):
    word_tok = nltk.word_tokenize(sentence)
    lemma_words = [lemmatizer.lemmatize(w) for w in word_tok]

    return ' '.join(lemma_words)

In [40]:
lemmetize_input_data = []
for question in input_data:
    lemmetize_input_data.append(lemma(question))
print(lemmetize_input_data[0]," : " ,input_data[0])


what doe it mean to have a mental illness  :  what does it mean to have a mental illness


## Create csv file for train and testing the model

In [19]:
train_test_dataset.to_csv("Cleaned Dataset/stemmed-data.csv" , index=False)

In [41]:
lemma_dataset = pd.DataFrame({"Question":lemmetize_input_data , "Intent":output_data})
lemma_dataset.to_csv("Cleaned Dataset/lemmatized-data.csv" , index=False)

## Cleaning of Response data for the user after classification

In [20]:
response_data = chatbot_df["Answer"].values

In [21]:
count = 0
for sen in response_data :
    if(len(re.findall("A:", sen))!=0):
        count += 1
count

106

In [22]:
s = response_data[225]
s= re.sub('\xa0',' ',s)
s

'Some individuals consider herbal or botanical supplements to be “natural.” However, herbal or botanical supplements are not proven to be safer. All dietary supplements, including herbal remedies, are not required to be reviewed for safety and effectiveness before being sold. Read 5 Tips: What Consumers Need To Know About Dietary Supplements from the National Center for Complementary and Integrative Health and talk to your doctor before starting any supplements. Find more information on Nutrition.gov’s Herbal Supplements page.'

In [23]:
s = response_data[225]
s = re.sub('[/]',' ',s)
s = re.sub('A:', '', s)
s = re.sub("â€™" ,'', s)
s= re.sub('\xa0',' ',s)
regex = re.compile(r'[\n\r\t]')
s = regex.sub('', s)
s= re.sub(' [\s]','',s)
s

'Some individuals consider herbal or botanical supplements to be “natural.” However, herbal or botanical supplements are not proven to be safer. All dietary supplements, including herbal remedies, are not required to be reviewed for safety and effectiveness before being sold. Read 5 Tips: What Consumers Need To Know About Dietary Supplements from the National Center for Complementary and Integrative Health and talk to your doctor before starting any supplements. Find more information on Nutrition.gov’s Herbal Supplements page.'

In [24]:
s = response_data[14]
s = s.replace("\\n", " ") 
s

'It is not uncommon for people to stop taking their medication when they feel their symptoms are under control. Others may choose to stop taking their medication because of its side effects, without realizing that most side effects can be effectively managed. While it may seem reasonable to stop taking the medication, the problem is that most often, the symptoms will return. If you or your child is taking medication, it is very important that you work together with your doctor before making decisions about any changes in your treatment. Another problem with stopping medication, particularly for stopping it abruptly, is that you may develop withdrawal symptoms that can be very unpleasant. If you and your doctor feel a trial off your medicine is a good idea, it is necessary to slowly decrease the dosage of medications so that these symptoms don’t occur. It is important that your doctor and pharmacist work together to make sure your medications are working safely and effectively. You shou

### Final Cleanup of the answer data


In [25]:
for sentence in range(len(response_data)):
    s = re.sub('[/]',' ',response_data[sentence])
    s = re.sub('A:', '', s)
    s = re.sub("â€™" ,'', s)
    s= re.sub('\xa0',' ',s)
    regex = re.compile(r'[\n\r\t]')
    s = regex.sub('', s)
    s= re.sub(' [\s]','',s)
    s = s.replace("\\n", " ") 
    response_data[sentence] = s
response_data[1:6]

array(['It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of gender, age, income, social status, ethnicity, religion, sexual orientation, or background.Although mental illness can affect anyone, certain conditions may be more common in different populations. For instance, eating disorders tend to occur more often in females, while disorders such as attention deficit hyperactivity disorder is more prevalent in children.Additionally, all ages are susceptible, but the young and the old are especially vulnerable. Mental illnesses usually strike individuals in the prime of their lives, with 75 percent of mental health conditions developing by the age of 24. This makes identification and treatment of mental disorders particularly difficult, because the normal personality and behavioral changes of adolescence may mask symptoms of a mental health con

In [26]:
answers_df = pd.DataFrame({"Answer":response_data})
answers_df.head()

Unnamed: 0,Answer
0,Mental illnesses are health conditions that di...
1,It is estimated that mental illness affects 1 ...
2,It is estimated that mental illness affects 1 ...
3,Symptoms of mental health disorders vary depen...
4,"When healing from mental illness, early identi..."


### Save the clean data in csv file

In [27]:
answers_df.to_csv("Cleaned Dataset/answer_data.csv",index=False)