In [1]:
import pandas as pd
import os
import numpy as np
import string
import random

import torch
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [3]:
# Считаем тренировочные данные.

df_train = pd.read_csv('/content/drive/MyDrive/Диплом_2024/data/therapy_train_true.csv')

df_train.head()

Unnamed: 0,text
0,Московский государственный медико-стоматологич...
1,Башкирский Государственный Медицинский Универс...
2,Министерство здравоохранения Республики Белару...
3,\nПаспортная часть\n\nФИО: \nВозраст: 29 лет\n...
4,\nИстория болезни.\nФамилия: \n Имя: \nОтчест...


In [4]:
# Считаем тестовые данные.

df_test = pd.read_csv('/content/drive/MyDrive/Диплом_2024/data/therapy_test_true.csv')

df_test.head()

Unnamed: 0,text
0,\n\nМинистерство здравоохранения Российской Фе...


In [5]:
print(df_train.shape)

print(df_test.shape)

(65, 1)
(1, 1)


In [6]:
# Уберем лишние знаки и цифры из текста трейн.

df_train = df_train.replace(r'\n', ' ', regex=True)

df_train = df_train.replace(r'\t', ' ', regex=True)

df_train['text'] = df_train['text'].str.lower()

df_train['text'] = df_train['text'].str\
                                   .replace('[{}]'.format(string.punctuation), '', regex=True)

from string import digits

remove_digits = str.maketrans('', '', digits)

df_train['text'] = df_train['text'].str.translate(remove_digits)


df_train.loc[0]

text    московский государственный медикостоматологиче...
Name: 0, dtype: object

In [7]:
# Разделим наши большие тексты на более короткие.

limit = 64  # Set to limit words per row
df_train['text'] = (
    df_train['text'].str.split().apply(lambda s: [
        ' '.join(s[pos:pos + limit])  # group into 64 size chunks
        for pos in range(0, len(s), limit)
    ])
)
# Explode Into Multiple Rows
df_train = df_train.explode('text')

In [8]:
df_train.head()

Unnamed: 0,text
0,московский государственный медикостоматологиче...
0,бескуднический бульвар дата поступления в клин...
0,появляться приступы удушья провоцируемые измен...
0,нагрузке подъем на ступеньки причину заболеван...
0,последний приступ был вечером когда больной не...


In [9]:
df_train.shape

(2479, 1)

In [10]:
df_train = df_train.reset_index(drop=True)

df_train.head()

Unnamed: 0,text
0,московский государственный медикостоматологиче...
1,бескуднический бульвар дата поступления в клин...
2,появляться приступы удушья провоцируемые измен...
3,нагрузке подъем на ступеньки причину заболеван...
4,последний приступ был вечером когда больной не...


In [11]:
# Уберем лишние знаки и цифры из текста тест.

df_test = df_test.replace(r'\n', ' ', regex=True)

df_test = df_test.replace(r'\t', ' ', regex=True)

df_test['text'] = df_test['text'].str.lower()

df_test['text'] = df_test['text'].str\
                                   .replace('[{}]'.format(string.punctuation), '', regex=True)

from string import digits

remove_digits = str.maketrans('', '', digits)

df_test['text'] = df_test['text'].str.translate(remove_digits)


df_test

Unnamed: 0,text
0,министерство здравоохранения российской феде...


In [12]:
# Разделим наш тестовый текст на более короткие.

limit = 64  # Set to limit words per row
df_test['text'] = (
    df_test['text'].str.split().apply(lambda s: [
        ' '.join(s[pos:pos + limit])  # group into 64 size chunks
        for pos in range(0, len(s), limit)
    ])
)
# Explode Into Multiple Rows
df_test = df_test.explode('text')

In [13]:
df_test.head()

Unnamed: 0,text
0,министерство здравоохранения российской федера...
0,respiratoria gradus ii status praesens subject...
0,сухой кашель одышка при физической нагрузке го...
0,с родителями в город младший ребёнок в семье и...
0,техникум получил профессию машиниста и в течен...


In [14]:
df_test = df_test.reset_index(drop=True)

df_test.head()

Unnamed: 0,text
0,министерство здравоохранения российской федера...
1,respiratoria gradus ii status praesens subject...
2,сухой кашель одышка при физической нагрузке го...
3,с родителями в город младший ребёнок в семье и...
4,техникум получил профессию машиниста и в течен...


In [15]:
# Загрузим наши токенизаторы.

import pickle

with open('/content/drive/MyDrive/Диплом_2024/tokenizers/saved_word_to_int_therapy.pkl', 'rb') as f:
    word_to_int = pickle.load(f)

with open('/content/drive/MyDrive/Диплом_2024/tokenizers/saved_int_to_word_therapy.pkl', 'rb') as f:
    int_to_word = pickle.load(f)

In [16]:
input_sequences = []

for line in df_train['text']:
    token_list = [word_to_int.get(word, 0) for word in line.split()]

    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)



print("Total input sequences: ", len(input_sequences))

Total input sequences:  154147


In [17]:
input_sequences[:6]

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7]]

In [18]:
# Сделаем паддинг.

max_sequence_len = max([len(x) for x in input_sequences])

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

input_sequences[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3],
      dtype=int32)

In [19]:
vocabulary_size = len(word_to_int)

print(vocabulary_size)

20413


In [20]:
# create features and label

xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

ys = to_categorical(labels, num_classes=vocabulary_size)

In [21]:
print(xs[5])
print(labels[5])
print(ys[5])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6]
7
[0. 0. 0. ... 0. 0. 0.]


In [22]:
torch.LongTensor(xs[5])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6])

In [23]:
input_sequences_test = []

for line in df_test['text']:
    token_list = [word_to_int.get(word, 0) for word in line.split()]

    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences_test.append(n_gram_sequence)



print("Total input sequences: ", len(input_sequences_test))

Total input sequences:  2567


In [24]:
input_sequences_test[:6]

[[2125, 2126],
 [2125, 2126, 3271],
 [2125, 2126, 3271, 3272],
 [2125, 2126, 3271, 3272, 3273],
 [2125, 2126, 3271, 3272, 3273, 2],
 [2125, 2126, 3271, 3272, 3273, 2, 2133]]

In [25]:
# Сделаем паддинг для теста.

#max_sequence_len = max([len(x) for x in input_sequences])

input_sequences_test = np.array(pad_sequences(input_sequences_test, maxlen=max_sequence_len, padding='pre'))

input_sequences_test[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0, 2125, 2126, 3271], dtype=int32)

In [26]:
# create features and label for test

xs_test, labels_test = input_sequences_test[:,:-1],input_sequences_test[:,-1]

ys_test = to_categorical(labels_test, num_classes=vocabulary_size)

In [27]:
print(xs_test[5])
print(labels_test[5])
print(ys_test[5])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 2125 2126 3271 3272 3273    2]
2133
[0. 0. 0. ... 0. 0. 0.]


In [28]:
class TextDataset(Dataset):
    def __init__(self, samples_x, samples_y):
        self.samples_x = samples_x
        self.samples_y = samples_y
    def __len__(self):
        return len(self.samples_y)
    def __getitem__(self, idx):
        input_seq = torch.LongTensor(self.samples_x[idx])
        target_seq = torch.LongTensor(self.samples_y[idx])
        return input_seq, target_seq

In [29]:
BATCH_SIZE = 32

train_dataset = TextDataset(xs, ys)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
print(train_dataset[1])

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2]), tensor([0, 0, 0,  ..., 0, 0, 0]))


In [30]:
test_dataset = TextDataset(xs_test, ys_test)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
print(test_dataset[1])

(tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 2125, 2126]), tensor([0, 0, 0,  ..., 0, 0, 0]))


In [31]:
seq_length = train_dataset[1][0].size()[0]
seq_length

63

In [32]:
#lstm model
class lstm(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, seq_length):
        super().__init__()
        #simple lookup table that stores embeddings of a fixed dictionary and size.
        self.embed = nn.Embedding(vocab_size, embed_size)

        #lstm
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=2, bidirectional=False)

        #fully connected layer
        self.linear = nn.Linear(hidden_size*seq_length, vocab_size)

    def forward(self, input_word):
        #input sequence to embeddings
        embedded = self.embed(input_word)

        #passing the embedding to lstm model
        output, hidden = self.lstm(embedded)

        #reshaping
        output=output.view(output.size(0), -1)

        #fully connected layer
        output = self.linear(output)
        return output, hidden

In [33]:
def train(model, epochs, dataloader, criterion):
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        for input_seq, target_seq in dataloader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)

            y_pred, (state_h, state_c) = model(input_seq)

            target_seq = target_seq.float()

            loss=criterion(y_pred, target_seq)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.detach().cpu().numpy()
        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch} loss: {epoch_loss:.3f}")


In [34]:
epochs = 7

learning_rate = 0.001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = lstm(vocab_size=vocabulary_size,embed_size=128, hidden_size=256, seq_length=seq_length).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

print(model)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")

total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.\n")

lstm(
  (embed): Embedding(20413, 128)
  (lstm): LSTM(128, 256, num_layers=2)
  (linear): Linear(in_features=16128, out_features=20413, bias=True)
)
332,775,741 total parameters.
332,775,741 training parameters.



In [35]:
%%time

train(model, epochs, train_dataloader, criterion)

Epoch 0 loss: 7.401
Epoch 1 loss: 2.560
Epoch 2 loss: 0.935
Epoch 3 loss: 0.554
Epoch 4 loss: 0.380
Epoch 5 loss: 0.296
Epoch 6 loss: 0.237
CPU times: user 1h 11min 17s, sys: 9.48 s, total: 1h 11min 26s
Wall time: 1h 11min 43s


In [42]:
checkpoint = {'model': lstm(vocab_size=vocabulary_size, embed_size=128, hidden_size=256, seq_length=seq_length),
              'state_dict': model.state_dict(),
              'optimizer' : optimizer.state_dict()}

torch.save(checkpoint, '/content/drive/My Drive/Диплом_2024/models/lstm_therapy_checkpoint_2.pth')

In [None]:
def load_checkpoint_for_eval(filepath, device):
    checkpoint = torch.load(filepath, map_location=torch.device(device))
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False

    model.eval()
    model = model.to(device)

    return model

In [None]:
model = load_checkpoint_for_eval('/content/drive/My Drive/Диплом_2024/models/lstm_therapy_checkpoint_1.pth', device)
print(model)

lstm(
  (embed): Embedding(20413, 128)
  (lstm): LSTM(128, 256, num_layers=2)
  (linear): Linear(in_features=16128, out_features=20413, bias=True)
)


In [36]:
seed_text = "жалобы на"
next_words = 3
max_sequence_len = 64
model.eval()
device = torch.device('cpu')
model.to(device)

for _ in range(next_words):

    sample = seed_text.split()
    token_list = [word_to_int.get(word, 0) for word in sample[-63:]]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')
    token_input = torch.LongTensor(np.array(token_list))
    token_input = token_input.to(device)
    with torch.no_grad():
            predicted_probs, _ = model(token_input)

    predicted_tokens = np.argmax(predicted_probs)
    predicted_word = int_to_word[predicted_tokens.item()]
    seed_text += ' ' + predicted_word



print("Next predicted words:", seed_text )

Next predicted words: жалобы на момент курации жалоб


In [37]:
seed_text = "хрипы сухие"
next_words = 3
max_sequence_len = 64
model.eval()
device = torch.device('cpu')
model.to(device)

for _ in range(next_words):

    sample = seed_text.split()
    token_list = [word_to_int.get(word, 0) for word in sample[-63:]]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')
    token_input = torch.LongTensor(np.array(token_list))
    token_input = token_input.to(device)
    with torch.no_grad():
            predicted_probs, _ = model(token_input)

    predicted_tokens = np.argmax(predicted_probs)
    predicted_word = int_to_word[predicted_tokens.item()]
    seed_text += ' ' + predicted_word



print("Next predicted words:", seed_text )

Next predicted words: хрипы сухие хрипы синдром дыхательной


In [38]:
seed_text = "одышка"
next_words = 3
max_sequence_len = 64
model.eval()
device = torch.device('cpu')
model.to(device)

for _ in range(next_words):

    sample = seed_text.split()
    token_list = [word_to_int.get(word, 0) for word in sample[-63:]]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')
    token_input = torch.LongTensor(np.array(token_list))
    token_input = token_input.to(device)
    with torch.no_grad():
            predicted_probs, _ = model(token_input)

    predicted_tokens = np.argmax(predicted_probs)
    predicted_word = int_to_word[predicted_tokens.item()]
    seed_text += ' ' + predicted_word



print("Next predicted words:", seed_text )

Next predicted words: одышка смешанная кашель влажный


In [39]:
seed_text = "боли в"
next_words = 3
max_sequence_len = 64
model.eval()
device = torch.device('cpu')
model.to(device)

for _ in range(next_words):

    sample = seed_text.split()
    token_list = [word_to_int.get(word, 0) for word in sample[-63:]]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')
    token_input = torch.LongTensor(np.array(token_list))
    token_input = token_input.to(device)
    with torch.no_grad():
            predicted_probs, _ = model(token_input)

    predicted_tokens = np.argmax(predicted_probs)
    predicted_word = int_to_word[predicted_tokens.item()]
    seed_text += ' ' + predicted_word



print("Next predicted words:", seed_text )

Next predicted words: боли в области сердца колющего


In [40]:
model.eval()

preds = []
#targets = []

for input_seq, target_seq in test_dataloader:
        input_seq, target_seq = input_seq.to(device), target_seq.to(device)

        with torch.no_grad():
            predictions, _ = model(input_seq)

        predictions_max = np.argmax(predictions.cpu().numpy(), axis=1)

        preds.extend(predictions_max)

In [41]:
# Метрики качества на тестовой выборке.

print(classification_report(list(labels_test), preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      0.50      0.67         2
           8       1.00      1.00      1.00         2
          10       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          21       1.00      1.00      1.00         1
          22       1.00      0.50      0.67         2
          23       0.36      0.67      0.47         6
          25       0.60      1.00      0.75         3
          26       0.36      0.80      0.50         5
          30       1.00      1.00      1.00         1
          31       1.00      1.00      1.00         1
          32       0.85      0.85      0.85        13
          33       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
