In [1]:
import yaml
import os

In [2]:
with open(os.path.expanduser('~/develop/ClearML_ML_SD.yml'), 'r') as f:
    keys = yaml.safe_load(f)

In [3]:
os.environ["CLEARML_WEB_HOST"] = "https://app.clear.ml"
os.environ["CLEARML_API_HOST"] = "https://api.clear.ml"
os.environ["CLEARML_FILES_HOST"] = "https://files.clear.ml"
os.environ["CLEARML_API_ACCESS_KEY"] = keys['access_key']
os.environ["CLEARML_API_SECRET_KEY"] = keys['secret_key']

In [4]:
from clearml import Task, Logger

In [5]:
task = Task.init(
    project_name='ML_SD', 
    task_name='cnn', 
    tags=['cnn'])

ClearML Task: created new task id=d183beaa3d574de5b5cba1e883092136
2022-11-08 22:42:38,886 - clearml.Task - INFO - No repository found, storing script code instead
ClearML results page: https://app.clear.ml/projects/922c69dbd48249b183708fef50f18e10/experiments/d183beaa3d574de5b5cba1e883092136/output/log
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


In [135]:
import pandas as pd
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.nn.utils.rnn import pad_sequence

# from torchtext.legacy import datasets
# from torchtext.legacy.data import Field, LabelField
# from torchtext.legacy.data import BucketIterator

from torchtext.datasets import IMDB
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab, Vectors, GloVe

from collections import Counter, OrderedDict
from functools import partial

import random
import copy
import gc
from tqdm.autonotebook import tqdm
from sklearn.preprocessing import OneHotEncoder
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
tqdm.pandas()

In [7]:
torch.cuda.is_available()

True

In [8]:
SEED = 21
PATH = '../data'

In [9]:
df_train = pd.read_csv(os.path.join(PATH, 'train.csv'))
df_val = pd.read_csv(os.path.join(PATH, 'val.csv'))
df_test = pd.read_csv(os.path.join(PATH, 'test.csv'))

In [91]:
def norm_form(list_words, morph):
    return [morph.parse(word)[0].normal_form for word in list_words]

def del_stopwords(list_words, stop_words):
    return [word for word in list_words if word not in stop_words]

def transform_data(df):
    df = df.copy()
    df['level_2'] = df['icd10'].str.split('.').apply(lambda x: x[0])
    df['level_1'] = df['icd10'].apply(lambda x: x[0])
    df['symptoms_tokens'] = df['symptoms'] \
        .str.lower() \
        .str.split('[^a-zа-яё]+')
#         .progress_apply(partial(del_stopwords, stop_words=get_stop_words('russian'))) \
#         .progress_apply(partial(norm_form, morph=MorphAnalyzer()))
    return df

In [92]:
df_train = transform_data(df_train)
df_val = transform_data(df_val)
df_test = transform_data(df_test)

In [13]:


# y_train = df_train['level_2'].values
# y_val = df_val['level_2'].values
# y_test = df_test['level_2'].values

In [93]:
# create vocab
counter = Counter(np.concatenate(df_train['symptoms_tokens'].tolist()))
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab_text = vocab(ordered_dict, min_freq=10, specials=('<unk>', '<PAD>', '<BOS>', '<EOS>'))
vocab_text.set_default_index(vocab_text['<unk>'])

text_stoi = vocab_text.get_stoi()
text_itos = {v: k for k, v in text_stoi.items()}

In [94]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, text_vocab: vocab):
        self.texts = [[text_vocab[token] for token in text] for text in texts]
        default_label = y_train.shape[1]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {'X': self.texts[idx], 'y': self.labels[idx]} 
    

class Collator(object):
    def __init__(self, padding_value: int = 0, device: str = 'cpu', sort_key=None, batch_first=False):
        self.padding_value = padding_value
        self.sort_key = sort_key
        self.batch_first = batch_first
        
    def __call__(self, batch):
        if self.sort_key is not None:
            batch = sorted(batch, key=self.sort_key)
        
        text = []
        label = []
        for item in batch:
            text.append(torch.tensor(item['X']))
            label.append(item['y'])
        
        text = pad_sequence(text, padding_value=self.padding_value)
        if self.batch_first:
            text = text.T
            
        label = torch.tensor(label)
            
        batch = {
            'X': text, 
            'y': label,
        }

        return batch


class BucketSampler:
    def __init__(self, dataset, batch_size: int, drop_last: bool = False, sort_key=None, shuffle: bool = False) -> None:
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.sort_key = sort_key
        self.shuffle = shuffle
        
    def __iter__(self):
        if self.sort_key is not None:
            indices = [(i, self.sort_key(item)) for i, item in enumerate(self.dataset)]
            if self.shuffle:
                random.shuffle(indices)
            pooled_indices = []
            # create pool of indices with similar lengths 
            for i in range(0, len(indices), self.batch_size * 100):
                pooled_indices.extend(sorted(
                    indices[i:i + self.batch_size * 100], 
                    key=lambda x: x[1], 
                ))
            indices = [x[0] for x in pooled_indices]    
        else:
            indices = np.arange(len(self.dataset))
            if self.shuffle:
                random.shuffle(indices)

        # yield indices for current batch
        for i in range(0, len(indices), self.batch_size):
            yield indices[i:i + self.batch_size]

    def __len__(self) -> int:
        if self.drop_last:
            return len(self.dataset) // self.batch_size  
        else:
            return (len(self.dataset) + self.batch_size - 1) // self.batch_size

In [112]:
mask = df_train.groupby('level_2', sort=False).transform('size') > 10

In [113]:
target_enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
target_enc.fit(df_train.loc[mask, ['level_2']])

X_train = df_train.loc[mask, 'symptoms_tokens'].tolist()
X_val = df_val['symptoms_tokens'].tolist()
X_test = df_test['symptoms_tokens'].tolist()

y_train = target_enc.transform(df_train.loc[mask, ['level_2']])
y_val = target_enc.transform(df_val[['level_2']])
y_test = target_enc.transform(df_test[['level_2']])

# create datasets
train_dataset = TextDataset(X_train, y_train, vocab_text)
val_dataset = TextDataset(X_val, y_val, vocab_text)
test_dataset = TextDataset(X_test, y_test, vocab_text)

In [134]:
y_test.shape

(1011, 108)

In [33]:
ttt = iter(val_dataset)

In [34]:
next(ttt)

{'X': [16,
  5,
  354,
  96,
  9,
  8,
  41,
  0,
  56,
  589,
  407,
  842,
  635,
  230,
  12,
  8,
  185,
  345,
  374,
  723,
  0,
  0,
  0,
  0,
  1401,
  12,
  8,
  185,
  210,
  345,
  374,
  28,
  513,
  159,
  557,
  0,
  0,
  0,
  0,
  0,
  1404,
  0,
  120,
  709,
  579,
  88,
  252,
  6],
 'y': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0

In [35]:
# кастомные функции для обучения моделей
import gc

def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()


class Trainer():
    def __init__(self, model, loss_func, opt, device='cpu'):
        self.model = model.to(device)
        self.loss_func = loss_func
        self.opt = opt
        self.device = device
            
    def train_epoch(self, train_iter, epoch):
        loss_value = 0.0
        
        y_fact = []
        y_pred = []
        self.model.train()
        pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
        pbar.set_description(f"Epoch {epoch}")
        for it, batch in pbar: 
            self.opt.zero_grad()
            
            X = batch['X'].to(self.device)
            y = batch['y'].to(self.device)
            outputs = self.model(X)

            loss = self.loss_func(outputs, y)
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 20)
            self.opt.step()

            loss_value += loss.item()

            y_fact.append(batch['y'].numpy().argmax(axis=1))
            y_pred.append(outputs.cpu().detach().numpy())

            pbar.set_description(f"""
                Train Loss: {loss:.4}
            """)

        y_fact = np.hstack(y_fact)
        y_pred = np.vstack(y_pred)
        
        metrics = dict(
            loss = loss_value / len(train_iter),
            hit3 = hit_at_n(y_fact, y_pred, n=3),
            precision = hit_at_n(y_fact, y_pred, n=1),
        )
        
        return metrics


    def eval_epoch(self, val_iter, epoch):
        loss_value = 0.0
        
        y_fact = []
        y_pred = []
        
        self.model.eval()
        pbar = tqdm(enumerate(val_iter), total=len(val_iter), leave=False)
        pbar.set_description(f"Epoch {epoch}")
        with torch.no_grad():
            for it, batch in pbar:
                X = batch['X'].to(self.device)
                y = batch['y'].to(self.device)
                outputs = self.model(X)
                loss = self.loss_func(outputs, y)
                loss_value += loss.item()

                y_fact.append(batch['y'].numpy().argmax(axis=1))
                y_pred.append(outputs.cpu().detach().numpy())

                pbar.set_description(f"""
                    Test Loss: {loss:.4}
                """)

        y_fact = np.hstack(y_fact)
        y_pred = np.vstack(y_pred)
        
        metrics = dict(
            loss = loss_value / len(val_iter),
            hit3 = hit_at_n(y_fact, y_pred, n=3),
            precision = hit_at_n(y_fact, y_pred, n=1),
        )
        
        return metrics


    def train_loop(self, train_iter, valid_iter, max_epochs, patience):

        min_loss = np.inf

        cur_patience = 0

        for epoch in range(1, max_epochs + 1):
            train_metrics = self.train_epoch(train_iter, epoch)
            clear_cache()
            
            val_metrics = self.eval_epoch(valid_iter, epoch)
            clear_cache()
            
            val_loss = val_metrics['loss']
            if val_loss < min_loss:
                min_loss = val_loss
                best_model = self.model.state_dict()
            else:
                cur_patience += 1
                if cur_patience == patience:
                    cur_patience = 0
                    break
            clear_output()
            print('%20s: %2d' % ('epoch', epoch))
            print()
            print('%20s: %7.4f %3.4f' % ('loss', train_metrics['loss'], val_metrics['loss']))
            print()
            print('%20s: %7.4f %3.4f' % ('hit3', train_metrics['hit3'], val_metrics['hit3']))
            print('%20s: %7.4f %3.4f' % ('precision', train_metrics['precision'], val_metrics['precision']))

#             print(*[f'{k}: {v}' for k, v in train_metrics.items()])
#             print(*[f'{k}: {v}' for k, v in val_metrics.items()])

        self.model.load_state_dict(best_model)
        
        return None

In [37]:
next(iter(loaders['val']))

  label = torch.tensor(label)


{'X': tensor([[ 16,   5, 354,  ...,   1,   1,   1],
         [ 23,   0,   0,  ...,   1,   1,   1],
         [  5,   7,   4,  ...,   1,   1,   1],
         ...,
         [189, 277,   7,  ...,   1,   1,   1],
         [  5, 484,   7,  ...,   1,   1,   1],
         [  7,   4, 232,  ...,   1,   1,   1]]),
 'y': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)}

In [38]:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout=0.5,
        n_classes=1,
    ):
        super().__init__()
        # num_filters = 36
        self.embedding = nn.Embedding(vocab_size, emb_dim)
#         self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
#         self.embedding.weight.requires_grad = False
        
#         self.conv_0 = nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[0], padding=1, stride=2)  # YOUR CODE GOES HERE     
#         self.conv_1 = nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[1], padding=1, stride=2)  # YOUR CODE GOES HERE
#         self.conv_2 = nn.Conv1d(emb_dim, out_channels, kernel_size=kernel_sizes[2], padding=1, stride=2)  # YOUR CODE GOES HERE
        
        self.convs1 = nn.ModuleList([nn.Conv2d(1, out_channels, (K, emb_dim)) for K in kernel_sizes])
        
        # self.fc = nn.Linear(len(kernel_sizes) * out_channels, n_classes)
        self.fc = nn.Linear(len(kernel_sizes)*out_channels, n_classes)
        
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, text):
        
        x = self.embedding(text)
        
        # embedded = embedded.permute(0, 2, 1)  # may be reshape here
        
#         conved_0 = F.relu(self.conv_0(embedded))  # may be reshape here
#         conved_1 = F.relu(self.conv_1(embedded))  # may be reshape here
#         conved_2 = F.relu(self.conv_2(embedded))  # may be reshape here
        
#         pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
#         pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
#         pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
#         cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))

        x = x.unsqueeze(1)  
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
        x = torch.cat(x, 1)
        
            
        return self.fc(x)

In [219]:
# class CNN_Text(nn.Module):
    
#     def __init__(self):
#         super(CNN_Text, self).__init__()
#         filter_sizes = [1,2,3,5]
#         num_filters = 36
#         n_classes = len(le.classes_)
#         self.embedding = nn.Embedding(max_features, embed_size)
#         self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
#         self.embedding.weight.requires_grad = False
#         self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
#         self.dropout = nn.Dropout(0.1)
#         self.fc1 = nn.Linear(len(filter_sizes)*num_filters, n_classes)


#     def forward(self, x):
#         x = self.embedding(x)  
#         x = x.unsqueeze(1)  
#         x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
#         x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
#         x = torch.cat(x, 1)
#         x = self.dropout(x)  
#         logit = self.fc1(x) 
#         return logit

In [39]:
def hit_at_n(y_true, y_pred, n=3):
    assert len(y_true) == len(y_pred)
    
    score = np.mean(np.any(
        np.argsort(-y_pred, axis=1)[:, :n] == y_true.reshape(-1,1), 
        axis=1
    ))
    return score

In [40]:
from IPython.display import clear_output

436

In [107]:
# model = CNN(
#     vocab_size=len(vocab_text), 
#     emb_dim=300, # 300, 
#     out_channels=512, # 64,
#     kernel_sizes=[1, 2, 3, 4, 5], # [3, 4, 5], 
#     dropout=0.8, # 0.5
#     n_classes=y_train.shape[1],
# )



# trainer = Trainer(
#     model, 
#     loss_func=nn.CrossEntropyLoss(reduction='sum'), 
#     opt=torch.optim.Adam(model.parameters(), lr=0.001),
#     device='cuda'
# )

In [108]:
# loaders = {
#     name: DataLoader(
#         dataset, 
#         batch_sampler=BucketSampler(dataset, batch_size=512, shuffle=name=='train'),
#         collate_fn=Collator(padding_value=vocab_text['<PAD>'], batch_first=True),
#     )
#     for name, dataset in zip(['train', 'val', 'test'], [train_dataset, val_dataset, test_dataset])
# }



# trainer.train_loop(loaders['train'], loaders['val'], max_epochs=20, patience=10)

In [223]:
del trainer
clear_cache()

In [44]:
from gensim.models import KeyedVectors

In [49]:
vectors = KeyedVectors.load_word2vec_format('~/Downloads/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec')

In [50]:
vectors.similar_by_word('боль')

[('боли', 0.8303021788597107),
 ('тошноту', 0.731131374835968),
 ('тошнота', 0.7193885445594788),
 ('болезненность', 0.7105488181114197),
 ('головокружение', 0.6976202726364136),
 ('сонливость', 0.6919495463371277),
 ('жалость', 0.6907328963279724),
 ('болью', 0.684307336807251),
 ('рвоту', 0.6833842396736145),
 ('раздражительность', 0.6796847581863403)]

In [124]:
word_embeddings = torch.Tensor(np.vstack([
    vectors[word] if word in vectors else np.zeros(300) for ind, word 
    in sorted(text_itos.items(), key=lambda x: x[0], reverse=False)
]))

In [224]:
model = CNN(
    vocab_size=len(vocab_text), 
    emb_dim=300, # 300, 
    out_channels=512, # 64,
    kernel_sizes=[3, 4, 5], # [3, 4, 5], 
    dropout=0.5, # 0.5
    n_classes=y_train.shape[1],
)

# prev_shape = model.embedding.weight.shape
# model.embedding.weight = nn.Parameter(word_embeddings)
# model.embedding.weight.requires_grad = False

trainer = Trainer(
    model, 
    loss_func=nn.CrossEntropyLoss(reduction='sum'), 
    opt=torch.optim.Adam(model.parameters(), lr=0.001),
    device='cuda'
)

In [225]:
# model.embedding.weight.requires_grad = True

In [226]:
def get_sampler(dataset, log_counts=False):
    """
    функция возвращает семплер, для балансировки класса
    """
    target = np.argmax(train_dataset.labels, axis=1)
    class_sample_counts = np.unique(target, return_counts=True)[1]
    if log_counts:
        class_sample_counts = np.log(class_sample_counts)
    weight = 1. / torch.tensor(class_sample_counts, dtype=torch.float)
    samples_weight = weight[target]
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler

In [228]:
loaders = {
    name: DataLoader(
        dataset, 
        # batch_sampler=BucketSampler(dataset, batch_size=512, shuffle=name=='train'),
        collate_fn=Collator(padding_value=vocab_text['<PAD>'], batch_first=True),
        sampler=get_sampler(dataset, log_counts=True) if name=='train' else None,
        batch_size=256, 
    )
    for name, dataset in zip(['train', 'val', 'test'], [train_dataset, val_dataset, test_dataset])
}


trainer.train_loop(loaders['train'], loaders['val'], max_epochs=40, patience=20)

               epoch: 26

                loss:  2.7951 482.6909

                hit3:  1.0000 0.5752
           precision:  0.9996 0.3851


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [209]:
ttt = next(iter(loaders['train']))

In [210]:
ttt['X'].shape

torch.Size([512, 435])

In [211]:
ttt['y'].shape

torch.Size([512, 108])