In [26]:
import re
import nltk
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string

In [2]:
max_vocabulary = 10000
out_dim = 15

text_vectorization = TextVectorization(max_tokens=max_vocabulary,
                                      split="whitespace",
                                      standardize="lower_and_strip_punctuation",
                                      output_mode="int",
                                      output_sequence_length=out_dim)

In [3]:
original_data = pd.read_csv("./data/medical_dataset.csv")
data = original_data
data.head()

Unnamed: 0,description,medical_specialty,isHigherPriority,doAddToBlockchain
0,A 23-year-old white female presents with compl...,36,1,0
1,Consult for laparoscopic gastric bypass.,25,0,0
2,Consult for laparoscopic gastric bypass.,25,0,0
3,2-D M-Mode. Doppler.,4,1,0
4,2-D Echocardiogram,4,0,1


In [7]:
nlp_x_data = data.drop(columns=["doAddToBlockchain"])
print(nlp_x_data)

nlp_y_data = data["doAddToBlockchain"]
print(nlp_y_data)

                                            description  medical_specialty  \
0     A 23-year-old white female presents with compl...                 36   
1              Consult for laparoscopic gastric bypass.                 25   
2              Consult for laparoscopic gastric bypass.                 25   
3                                  2-D M-Mode. Doppler.                  4   
4                                    2-D Echocardiogram                  4   
...                                                 ...                ...   
3808  Patient suffered from morbid obesity for many ...                 25   
3809  Patient presented to the Bariatric Surgery Ser...                 25   
3810  Evaluation for elective surgical weight loss v...                 25   
3811  Chronic glossitis, xerostomia, probable enviro...                 36   
3812  This is a 14-month-old baby boy Caucasian who ...                 36   

      isHigherPriority  
0                    1  
1            

In [12]:
punctuations = string.punctuation

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in punctuations])
    return punctuationfree

nlp_x_data['clean_msg'] = nlp_x_data['description'].apply(lambda x:remove_punctuation(x))
nlp_x_data.head()

Unnamed: 0,description,medical_specialty,isHigherPriority,clean_msg
0,A 23-year-old white female presents with compl...,36,1,A 23yearold white female presents with complai...
1,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass
3,2-D M-Mode. Doppler.,4,1,2D MMode Doppler
4,2-D Echocardiogram,4,0,2D Echocardiogram


In [13]:
nlp_x_data['msg_lower'] = nlp_x_data['clean_msg'].apply(lambda x: x.lower())
nlp_x_data.head()

Unnamed: 0,description,medical_specialty,isHigherPriority,clean_msg,msg_lower
0,A 23-year-old white female presents with compl...,36,1,A 23yearold white female presents with complai...,a 23yearold white female presents with complai...
1,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass
3,2-D M-Mode. Doppler.,4,1,2D MMode Doppler,2d mmode doppler
4,2-D Echocardiogram,4,0,2D Echocardiogram,2d echocardiogram


In [16]:
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens

nlp_x_data['msg_tokenied'] = nlp_x_data['msg_lower'].apply(lambda x: tokenization(x))
nlp_x_data.head()

Unnamed: 0,description,medical_specialty,isHigherPriority,clean_msg,msg_lower,msg_tokenied
0,A 23-year-old white female presents with compl...,36,1,A 23yearold white female presents with complai...,a 23yearold white female presents with complai...,[a 23yearold white female presents with compla...
1,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass]
2,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass]
3,2-D M-Mode. Doppler.,4,1,2D MMode Doppler,2d mmode doppler,[2d mmode doppler]
4,2-D Echocardiogram,4,0,2D Echocardiogram,2d echocardiogram,[2d echocardiogram]


In [19]:
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

nlp_x_data['no_stopwords'] = nlp_x_data['msg_tokenied'].apply(lambda x:remove_stopwords(x))
nlp_x_data.head()

Unnamed: 0,description,medical_specialty,isHigherPriority,clean_msg,msg_lower,msg_tokenied,no_stopwords
0,A 23-year-old white female presents with compl...,36,1,A 23yearold white female presents with complai...,a 23yearold white female presents with complai...,[a 23yearold white female presents with compla...,[a 23yearold white female presents with compla...
1,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass]
2,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass]
3,2-D M-Mode. Doppler.,4,1,2D MMode Doppler,2d mmode doppler,[2d mmode doppler],[2d mmode doppler]
4,2-D Echocardiogram,4,0,2D Echocardiogram,2d echocardiogram,[2d echocardiogram],[2d echocardiogram]


In [23]:
porter_stemmer = PorterStemmer()

def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text

nlp_x_data['msg_stemmed'] = nlp_x_data['no_stopwords'].apply(lambda x: stemming(x))
nlp_x_data.head()

Unnamed: 0,description,medical_specialty,isHigherPriority,clean_msg,msg_lower,msg_tokenied,no_stopwords,msg_stemmed
0,A 23-year-old white female presents with compl...,36,1,A 23yearold white female presents with complai...,a 23yearold white female presents with complai...,[a 23yearold white female presents with compla...,[a 23yearold white female presents with compla...,[a 23yearold white female presents with compla...
1,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass]
2,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass]
3,2-D M-Mode. Doppler.,4,1,2D MMode Doppler,2d mmode doppler,[2d mmode doppler],[2d mmode doppler],[2d mmode doppl]
4,2-D Echocardiogram,4,0,2D Echocardiogram,2d echocardiogram,[2d echocardiogram],[2d echocardiogram],[2d echocardiogram]


In [29]:
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

nlp_x_data['msg_lemmatized'] = nlp_x_data['no_stopwords'].apply(lambda x:lemmatizer(x))
nlp_x_data.head()

Unnamed: 0,description,medical_specialty,isHigherPriority,clean_msg,msg_lower,msg_tokenied,no_stopwords,msg_stemmed,msg_lemmatized
0,A 23-year-old white female presents with compl...,36,1,A 23yearold white female presents with complai...,a 23yearold white female presents with complai...,[a 23yearold white female presents with compla...,[a 23yearold white female presents with compla...,[a 23yearold white female presents with compla...,[a 23yearold white female presents with compla...
1,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass]
2,Consult for laparoscopic gastric bypass.,25,0,Consult for laparoscopic gastric bypass,consult for laparoscopic gastric bypass,[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass],[consult for laparoscopic gastric bypass]
3,2-D M-Mode. Doppler.,4,1,2D MMode Doppler,2d mmode doppler,[2d mmode doppler],[2d mmode doppler],[2d mmode doppl],[2d mmode doppler]
4,2-D Echocardiogram,4,0,2D Echocardiogram,2d echocardiogram,[2d echocardiogram],[2d echocardiogram],[2d echocardiogram],[2d echocardiogram]
