# Symptom Prediction
### Imports

In [None]:
#import standard + support libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

#import nlp libs
import nltk
from nltk.stem import PorterStemmer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Download the NLTK resources this notebook relies on.
# nltk.word_tokenize loads 'punkt' on older NLTK releases and 'punkt_tab' on
# recent ones (3.9+), so both are requested to keep tokenization working on
# either version. 'wordnet' is kept from the original download list.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the intent definitions (each with a tag and its example patterns).
# JSON text is UTF-8 by specification; the encoding is passed explicitly so the
# file is not decoded with a platform-dependent default.
with open('intents_new_new.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

In [None]:
# Collect the tag of every intent and, in parallel, its list of patterns:
# labels[k] and patterns[k] always come from the same intent entry.
intent_entries = intents['intents']

labels = [entry['tag'] for entry in intent_entries]
patterns = [entry['patterns'] for entry in intent_entries]

### Data Processing

In [None]:
# Stem a tokenized pattern / sentence and encode it as a bag-of-words vector
# over the full vocabulary of the intents.
def word_to_vect(tokenized_sentence, all_words):
    """Encode a tokenized sentence as a bag-of-words vector over ``all_words``.

    Every token is lower-cased and stemmed with NLTK's ``PorterStemmer``.
    Tokens are stemmed one by one, without looking at neighbouring words,
    because the patterns only contain a few words each.

    Args:
        tokenized_sentence: tokens (strings) of one pattern / sentence.
        all_words: the vocabulary. Entries are compared against the stems
            as-is, so they are expected to be stemmed and lower-cased already.

    Returns:
        A 1-D ``np.float32`` array with one entry per word in ``all_words``:
        1.0 where the word matches a stem of the sentence, 0.0 elsewhere.
    """
    stems = set()
    if tokenized_sentence:
        # One stemmer for the whole sentence instead of one per token; it is
        # not built at all when there is nothing to stem.
        stemmer = PorterStemmer()
        stems = {stemmer.stem(token.lower()) for token in tokenized_sentence}

    # Set membership keeps the lookup constant-time per vocabulary word.
    return np.asarray([1 if word in stems else 0 for word in all_words],
                      dtype=np.float32)

In [None]:
# word_to_vect needs the full vocabulary: every distinct stem that occurs in
# any pattern of the intents. The same lower-case + Porter-stem normalisation
# is applied here so that the stems it produces can match these entries.

stemmer = PorterStemmer()  # one instance is reused for every token
vocabulary = set()
for pattern in patterns:
    for sentence in pattern:
        # tokenize, then stem each token and add it to the vocabulary
        vocabulary.update(
            stemmer.stem(token.lower()) for token in nltk.word_tokenize(sentence)
        )

# list of all words, sorted so the order does not depend on set iteration
word_list = sorted(vocabulary)

In [None]:
# Turn every pattern sentence into a (label index, bag-of-words vector) pair.
# labels and patterns are parallel lists, so the position of a pattern group
# is also the index of its tag in labels.

data_train = []
for label_idx, sentences in enumerate(patterns):
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)  # split the sentence into tokens
        data_train.append((label_idx, word_to_vect(tokens, word_list)))

In [None]:
# Separate the (label index, vector) pairs into model inputs X and targets Y.
x_train = np.array([vector for _, vector in data_train])
# nn.CrossEntropyLoss takes class-index targets as 64-bit integers. NumPy's
# default integer type is platform dependent (it can be 32-bit, e.g. on
# Windows with NumPy < 2.0), so the dtype is fixed explicitly.
y_train = np.array([label for label, _ in data_train], dtype=np.int64)

### PyTorch

In [None]:
# Dataset wrapper that makes the training vectors and labels usable by a
# PyTorch DataLoader.
class CustomDataset(Dataset):
    """Pairs each entry of ``words`` with the entry of ``tags`` at the same index."""

    def __init__(self, words, tags):
        self.x, self.y = words, tags
        self.length = len(words)

    def __len__(self):
        """Return the number of samples (the length of ``words``)."""
        return self.length

    def __getitem__(self, i):
        """Return the ``(words[i], tags[i])`` pair."""
        sample = self.x[i]
        target = self.y[i]
        return sample, target

In [None]:
# Neural network
# A simple feed-forward classifier: a bag-of-words vector goes in, one raw
# score per tag comes out (no softmax is applied inside the model).

class Net(nn.Module):
    """Three linear layers with a ReLU after the first two."""

    def __init__(self, number_words, hidden_size, number_tags):
        super().__init__()
        # The attribute names become the state_dict keys, so they are kept as-is.
        self.start = nn.Linear(in_features=number_words, out_features=hidden_size)  # input layer
        self.h1 = nn.Linear(in_features=hidden_size, out_features=hidden_size)  # hidden layer
        self.end = nn.Linear(in_features=hidden_size, out_features=number_tags)  # output layer

        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through input layer, hidden layer and output layer."""
        hidden = self.relu(self.start(x))
        hidden = self.relu(self.h1(hidden))
        return self.end(hidden)

In [None]:
#Model preparation
# Batches of 8 samples, reshuffled every epoch. The data set already lives in
# memory, so batches are loaded in the main process (num_workers=0): with
# worker processes a fresh set of workers is started for every epoch, and on
# platforms that spawn instead of fork (e.g. Windows) workers typically cannot
# load a Dataset class that was defined inside a notebook.
data = DataLoader(CustomDataset(x_train, y_train),
                  batch_size=8, shuffle=True, num_workers=0)

#if you wanna use your gpu, type 'cuda' instead of 'cpu'
#(the training loop moves the batches to a device as well and must use the same one)
model = Net(number_words=len(word_list), number_tags=len(labels), hidden_size=8).to(torch.device('cpu'))

#these loss functions and optimizers were chosen due to recommendations of PyTorch
crit = nn.CrossEntropyLoss()  # takes the raw scores of the model and class-index targets
optimize = torch.optim.ASGD(model.parameters(), lr=0.01)

#### Model Training

In [None]:
# Train the classifier with mini-batch gradient steps.
num_epochs = 1000  # single source for the loop bound and the progress message

#if you wanna use your gpu, type 'cuda' instead of 'cpu'
#(the model has to be moved to the same device)
device = torch.device('cpu')

for epoch in range(num_epochs):
    for (words, tags) in data:
        words = words.to(device)
        # CrossEntropyLoss needs int64 class indices; the cast is a no-op when
        # the labels already have that dtype.
        tags = tags.to(device=device, dtype=torch.long)

        #forward step
        output = model(words)
        loss = crit(output, tags)

        #backward step
        optimize.zero_grad()
        loss.backward()
        optimize.step()

    #print loss every 100 epochs of training
    #(the value is the loss of the epoch's last batch, not an epoch average)
    if (epoch + 1) % 100 == 0:
        print(f'epoch {epoch + 1}/{num_epochs}, loss = {loss.item():.4f}')

epoch 100/1000, loss = 0.3618
epoch 200/1000, loss = 1.2081
epoch 300/1000, loss = 0.0839
epoch 400/1000, loss = 0.1676
epoch 500/1000, loss = 0.0166
epoch 600/1000, loss = 0.0133
epoch 700/1000, loss = 0.1331
epoch 800/1000, loss = 0.0015
epoch 900/1000, loss = 0.0500
epoch 1000/1000, loss = 0.0769


In [None]:
# Persist what is needed to rebuild the classifier later: the trained
# weights, the vocabulary and the list of tags.
FILE = "data.pth"

# NOTE: this rebinds `data`, which referred to the training DataLoader before;
# re-create the DataLoader before running the training cell again.
data = {
    "model_state": model.state_dict(),
    "word_list": word_list,
    "tags": labels,
}
torch.save(data, FILE)