In [None]:
# pip install nltk --quiet

In [1]:
import os
import random
import re

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder as LE

In [3]:
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

# Fetch the NLTK data this notebook relies on.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the model behind nltk.word_tokenize from 'punkt_tab', while older
# releases still look for 'punkt'. Downloading both keeps the notebook runnable
# on either; packages that are already present are simply reported as up to date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HIMANGSHUB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HIMANGSHUB\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Read the dataset

In [4]:
# Load the raw question/answer dataset from the working directory.
# Later cells read its "Question", "intent" and "Answer" columns.
chatbot_df = pd.read_csv("medical-dataset.csv")

In [5]:
# Class balance check: number of rows per intent label.
chatbot_df["intent"].value_counts()

covid            199
heart            130
diabetes         102
mental_health     98
nutrition         92
fitness           80
greeting          20
Name: intent, dtype: int64

In [6]:
# (rows, columns) of the raw dataset.
chatbot_df.shape

(721, 3)

# Separate the input and output data for intent classification

In [18]:
# Inputs are the question texts; outputs are their intent labels.
# NOTE(review): `.values` can hand back the column's underlying array rather
# than a copy, so writing into input_data in place may also alter chatbot_df
# (or fail if pandas returns a read-only array) — confirm for the pandas
# version in use.
input_data = chatbot_df["Question"].values
output_data = chatbot_df["intent"].values

## Data cleaning

### Remove numbering, "Q:"-style prefixes, and punctuation

In [31]:
def _strip_question_noise(text):
    """Return ``text`` with list numbering, colon-terminated prefixes and punctuation removed.

    The three substitutions run in this order:
      1. any run of digits followed by a full stop (e.g. "1."), which also
         removes every other full stop;
      2. any run of word characters followed by a colon (e.g. "Q:");
      3. every remaining character that is neither a word character nor whitespace.

    Whitespace left behind by the removals (such as a leading space) is kept.
    """
    # Raw strings are required here: '\d' and '\w' are not valid Python string
    # escapes and trigger a warning when written in a plain string literal.
    text = re.sub(r'\d*[.]', '', text)
    text = re.sub(r'\w*[:]', '', text)
    return re.sub(r'[^\w\s]', '', text)


# Build a new array instead of assigning into the existing one element by
# element: the array obtained from `.values` may share memory with chatbot_df
# (so in-place writes would silently change the raw "Question" column) or may
# be read-only, in which case item assignment raises.
input_data = np.array(
    [_strip_question_noise(question) for question in input_data],
    dtype=object,
)

In [35]:
# Spot-check one question after the regex clean-up.
input_data[373]

' What are the symptoms of COVID19'

In [29]:
# Pick one question from input_data to use as a tokenization example.
sentence = input_data[272]
sentence

' I was recently diagnosed with type 2 diabetes Do I need to see an endocrinologist'

In [30]:
# Tokenize the sample `sentence` to preview what nltk.word_tokenize produces.
word_tok = nltk.word_tokenize(sentence)
word_tok

['I',
 'was',
 'recently',
 'diagnosed',
 'with',
 'type',
 '2',
 'diabetes',
 'Do',
 'I',
 'need',
 'to',
 'see',
 'an',
 'endocrinologist']

### Function for tokenizing and stemming a question

In [33]:
# Shared Lancaster stemmer instance; cleanup() reads it as a global.
stemmer = LancasterStemmer()
# LabelEncoder instance (imported as LE); not used in the cells visible here.
label_encode = LE()

In [34]:
def cleanup(sentence):
    """Tokenize ``sentence`` and return its stemmed tokens as one string.

    The text is split with ``nltk.word_tokenize``, every token is passed
    through the module-level ``stemmer``, and the stems are joined back
    together with single spaces, in their original order.
    """
    tokens = nltk.word_tokenize(sentence)
    return ' '.join(map(stemmer.stem, tokens))

In [37]:
# Stem every question with cleanup(), keeping the same order as input_data.
stemmed_input_data = list(map(cleanup, input_data))

In [42]:
# Spot-check the first stemmed question.
stemmed_input_data[0]

'what doe it mean to hav a ment il'

In [44]:
# Pair each stemmed question with its intent label; both sequences follow the
# row order of chatbot_df. Note the label column is named "Intent" here,
# whereas the source column is "intent".
train_test_dataset = pd.DataFrame({"Question":stemmed_input_data , "Intent":output_data})

## Create a CSV file for training and testing the model

In [47]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs("Dataset", exist_ok=True)
train_test_dataset.to_csv("Dataset/stemmed-data.csv" , index=False)

## Cleaning of Response data for the user after classification

In [23]:
# Answer texts from the dataset (the responses referred to in the heading above).
response_data = chatbot_df["Answer"]

In [28]:
# Inspect one raw answer; the displayed text still contains non-breaking
# spaces (\xa0) and newline characters.
response_data[227]

'Companies are not required\xa0to prove dietary supplements are safe and effective before they are sold. That means it is possible for supplements to contain too much or too little of an ingredient or be contaminated with an unexpected ingredient. Find more information in the Food and Drug Administration’s\xa0Information for Consumers on Using Dietary Supplements\xa0resource. A variety of resources on the risks and safety of dietary supplements are also available on Nutrition.gov’s\xa0Safety and Health Claims\xa0page. If you experience a serious negative side effect from a dietary supplement, follow the steps on\xa0Reporting Serious Problems to FDA. \xa0\n\nBe sure to consult with your doctor before starting a dietary supplement. Supplements may not be safe for certain health conditions, including pregnancy or chronic diseases. They could also lead to harmful side effects if combined with other supplements or medicines, or if consumed in large amounts. Read more on the FDA’s Supplement

In [50]:
# DataFrame/Series.to_csv does not create missing parent folders, so make sure
# the output directory exists before writing (no-op when it is already there).
os.makedirs("Dataset", exist_ok=True)
# NOTE(review): the "Answer" column is written exactly as loaded; none of the
# cells shown here strip the \xa0 / newline characters seen in the raw answers.
chatbot_df["Answer"].to_csv("Dataset/answer_data.csv",index=False)