In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/symptom2disease/Symptom2Disease.csv


In [2]:
#Import Necessary Libraries
import string
from collections import Counter
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torchtext
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [3]:
torch.cuda.is_available()

True

# Reading the Dataset

In [4]:

df = pd.read_csv("/kaggle/input/symptom2disease/Symptom2Disease.csv")
df.drop("Unnamed: 0",inplace=True,axis=1)
df

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...
1195,diabetes,I'm shaking and trembling all over. I've lost ...
1196,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,diabetes,I regularly experience these intense urges and...
1198,diabetes,"I have trouble breathing, especially outside. ..."


# Preprocessing steps:stopwords removal,datacleaning etc

In [5]:
# set of English stopwords we will remove from our text data
stop_words = set(stopwords.words('english'))

In [6]:
def clean_text(sent):
    #remove punctuations
    sent = sent.translate(str.maketrans('','',string.punctuation)).strip()
    
    #remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sent)
    words = [word for word in words if word not in stop_words]
    
    return " ".join(words).lower()

In [7]:
# clean text rows in dataframe
df["text"] = df["text"].apply(clean_text)

In [8]:
# get list of diseases in our dataset
diseases = df["label"].unique()

# helper dictionaries to convert diseases to index and vice versa
idx2dis = {k:v for k,v in enumerate(diseases)}
dis2idx = {v:k for k,v in idx2dis.items()}

In [9]:
# convert disease name to index (label encoding)
df["label"] = df["label"].apply(lambda x: dis2idx[x])

In [10]:
# Split the data into train,test set
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

In [11]:
# pytorch dataset object use index to return item, so need to reset non-continuoues index of divided dataset
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [12]:
# max number of words in symptoms descriptions (cleaned version)
max_words = X_train.apply(lambda x:x.split()).apply(len).max()
max_words

31

In [13]:
# create vocabulart using torchtext vocab class
counter = Counter()
for text in X_train:
    counter.update(text.split())

vocab = torchtext.vocab.vocab(counter,specials=['<unk>', '<pad>'])

In [14]:
# set default index as unknown token
vocab.set_default_index(vocab['<unk>'])

In [15]:
# Create a PyTorch dataset`
class DiseaseDataset(torch.utils.data.Dataset):
    def __init__(self, symptoms,labels):
        self.symptoms = symptoms
        self.labels= torch.tensor(labels.to_numpy())
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.symptoms[idx]
        label = self.labels[idx]

        # Convert the text to a sequence of word indices
        text_indices = [vocab[word] for word in text.split()]
        
        # padding for same length sequence
        if len(text_indices)<max_words:
            text_indices = text_indices + [1]*(max_words - len(text_indices))
        
        return torch.tensor(text_indices), label

In [16]:
# instantiate dataset objects
train_dataset = DiseaseDataset(X_train, y_train)
val_dataset = DiseaseDataset(X_test, y_test)

In [17]:
# choose batch size, will start from smaller values as we got smaller dataset
batch_size = 8

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [18]:
# Define the RNN model
class RNNModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,num_classes,drop_prob,num_layers=1,bidir=False,seq="lstm"):
        super(RNNModel, self).__init__()
        self.seq = seq
        self.bidir_f = 2 if bidir else 0
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        if seq=="lstm":
            self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim,
                                     num_layers=num_layers,
                                     batch_first=True,
                                     bidirectional=bidir)
        else:
            self.rnn = torch.nn.GRU(embedding_dim, hidden_dim,
                                 num_layers=num_layers,
                                 batch_first=True,
                                bidirectional=bidir)
        
        self.dropout = torch.nn.Dropout(drop_prob) #dropout layer
        self.fc = torch.nn.Linear(hidden_dim*self.bidir_f, num_classes) # fully connected layer

    def forward(self, text_indices):
        # Embed the text indices
        embedded_text = self.embedding(text_indices)
#         print("EMB SHAPE: ",embedded_text.shape)

        # Pass the embedded text through the RNN
        rnn_output,hidden_states = self.rnn(embedded_text)
        # Take the last output of the RNN
        last_rnn_output = rnn_output[:, -1, :]
        x = self.dropout(last_rnn_output)
        # Pass the last output of the RNN through the fully connected layer
        x = self.fc(x)

        # Return the final output
        return x

In [19]:
def train(model,num_epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    #choose device for training
    device = "cuda" if torch.cuda.is_available()  else "cpu"
    model = model.cuda()
    model = model.to(device)
    print("IS CUDA: ",next(model.parameters()).is_cuda)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for data in train_loader:
            inputs,labels = data 
            inputs,labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            acc = (labels == outputs.argmax(dim=-1)).float().mean().item()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            correct = 0
            total = 0
            for inputs, labels in val_loader:
                inputs,labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = outputs.argmax(-1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = (labels == outputs.argmax(dim=-1)).float().mean().item()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {val_loss}, Train Accuracy: {acc:.2f}  Val Accuracy: {accuracy:.2f}')

In [20]:
num_classes = len(np.unique(y_train))
vocab_size = len(vocab)
emb_dim = 256
hidden_dim = 128
drop_prob = 0.4

In [21]:
model_lstm = RNNModel(vocab_size,emb_dim,hidden_dim,num_classes,drop_prob,num_layers=3,bidir=True, seq="lstm")

In [22]:
train(model_lstm,35)

IS CUDA:  True
Epoch [1/35], Loss: 92.01725339889526, Train Accuracy: 0.00  Val Accuracy: 0.00
Epoch [2/35], Loss: 78.9551649093628, Train Accuracy: 0.00  Val Accuracy: 0.00
Epoch [3/35], Loss: 74.6918408870697, Train Accuracy: 0.12  Val Accuracy: 0.12
Epoch [4/35], Loss: 61.21231651306152, Train Accuracy: 0.25  Val Accuracy: 0.25
Epoch [5/35], Loss: 50.374439120292664, Train Accuracy: 0.38  Val Accuracy: 0.62
Epoch [6/35], Loss: 32.70776554942131, Train Accuracy: 0.75  Val Accuracy: 0.88
Epoch [7/35], Loss: 29.302746415138245, Train Accuracy: 0.88  Val Accuracy: 0.88
Epoch [8/35], Loss: 28.719481825828552, Train Accuracy: 0.88  Val Accuracy: 0.88
Epoch [9/35], Loss: 26.834640324115753, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [10/35], Loss: 23.35372630134225, Train Accuracy: 0.88  Val Accuracy: 0.88
Epoch [11/35], Loss: 25.242680657655, Train Accuracy: 1.00  Val Accuracy: 0.88
Epoch [12/35], Loss: 23.69400708936155, Train Accuracy: 1.00  Val Accuracy: 0.88
Epoch [13/35], Loss: 3

In [23]:
model_gru = RNNModel(vocab_size,emb_dim,hidden_dim,num_classes,drop_prob,num_layers=1,bidir=True,seq="gru")

In [24]:
train(model_gru,20)

IS CUDA:  True
Epoch [1/20], Loss: 88.82546520233154, Train Accuracy: 0.00  Val Accuracy: 0.25
Epoch [2/20], Loss: 61.91657745838165, Train Accuracy: 0.38  Val Accuracy: 0.62
Epoch [3/20], Loss: 36.18146961927414, Train Accuracy: 0.62  Val Accuracy: 0.88
Epoch [4/20], Loss: 22.785705775022507, Train Accuracy: 0.88  Val Accuracy: 1.00
Epoch [5/20], Loss: 16.199264124035835, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [6/20], Loss: 12.187511496245861, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [7/20], Loss: 11.177432298660278, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [8/20], Loss: 9.949819948524237, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [9/20], Loss: 8.938183657824993, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [10/20], Loss: 8.670395903289318, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [11/20], Loss: 8.942874561995268, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [12/20], Loss: 8.719596456736326, Train Accuracy: 1.00  Val Accuracy: 1.00
Epoch [13/20], Los

In [25]:
def make_pred(model,text):
    text = clean_text(text)
    # Convert the text to a sequence of word indices
    text_indices = [vocab[word] for word in text.split()]
        
    # padding for same length sequence
    if len(text_indices)<max_words:
        text_indices = text_indices + [1]*(max_words - len(text_indices))
    text_indices = torch.tensor(text_indices).cuda()
    pred = model(text_indices.unsqueeze(0))

    print(idx2dis[pred.argmax(1).item()])

In [29]:
symp2 = "I've been itching a lot"

make_pred(model_lstm, symp2)

Varicose Veins


In [31]:
torch.save(model_lstm.state_dict(), "/kaggle/working/model_lstm.h5")

In [35]:
mdl = RNNModel(vocab_size,emb_dim,hidden_dim,num_classes,drop_prob,num_layers=3,bidir=True, seq="lstm")
mdl.load_state_dict(torch.load("/kaggle/working/model_lstm.h5", weights_only=True))

<All keys matched successfully>

In [40]:
make_pred(mdl.cuda(), "my stomach ache")

peptic ulcer disease
