In [3]:
!pip install torchmetrics -q

In [26]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.65.0


In [32]:
!pip install datasets -q

In [38]:
!pip install nltk -q

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     - -------------------------------------- 0.1/1.5 MB 1.7 MB/s eta 0:00:01
     ----- ---------------------------------- 0.2/1.5 MB 2.1 MB/s eta 0:00:01
     --------- ------------------------------ 0.4/1.5 MB 2.5 MB/s eta 0:00:01
     --------------- ------------------------ 0.6/1.5 MB 3.2 MB/s eta 0:00:01
     ------------------------------ --------- 1.1/1.5 MB 4.9 MB/s eta 0:00:01
     ---------------------------------------  1.5/1.5 MB 5.6 MB/s eta 0:00:01
     ---------------------------------------  1.5/1.5 MB 5.6 MB/s eta 0:00:01
     ---------------------------------------  1.5/1.5 MB 5.6 MB/s eta 0:00:01
     ---------------------------------------  1.5/1.5 MB 5.6 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 3.3 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ---------

In [43]:
!pip install gensim -q

In [141]:
!pip install --upgrade tqdm 



In [179]:
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [147]:
!pip install --upgrade ipywidgets



In [1]:
import nltk
import torch
import sklearn
import datasets
import ipywidgets
import numpy as np
import torch.nn.functional as f
import gensim.downloader as api
import matplotlib.pyplot as plt
from torch import nn
from tqdm import tqdm, trange
from torchmetrics import Accuracy
from ipywidgets import FloatProgress
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def encode(word):
    if word in word2idx.keys():
        return word2idx[word]
    
    return word2idx['unk']

def collate_fn(batch):
    max_len = max(len(row['features']) for row in batch)
    input_embeds = torch.empty((len(batch), max_len), dtype=torch.long) # матрица фичей для передачи в сеть
    labels = torch.empty(len(batch), dtype=torch.long)

    for idx, row in enumerate(batch):
        to_pad = max_len - len(row['features'])
        input_embeds[idx] = torch.cat((row['features'], torch.zeros(to_pad)))
        labels[idx] = row['label'] 

    return {'features': input_embeds, 'labels': labels}



In [None]:
# заморозка градиентов на первых N итерациях (для того, чтобы они не вносили неопределенность в веса)

def freeze_embeddings(model, req_grad=False):
    embeddings = model.embeddings
    for c_p in embeddings.parameters():
        c_p.requires_grad = req_grad

In [16]:
def train_network(model, criterion, optim, metric, num_epochs, loaders, max_grad_norm=2, num_freeze_iter=1000):
    freeze_embeddings(model) # чтобы только на 1 итерации была заморозка
    for e in tqdm(range(num_epochs)):
        model.train()
        num_iter = 0
        pbar = loaders['train']

        for batch in pbar:
            if num_iter > num_freeze_iter:
                freeze_embeddings(model, True)
            optimizer.zero_grad()
            input_embeds = batch['features'].to(device)
            labels = batch['labels'].to(device)
            pred = model(input_embeds)
            loss = criterion(pred, labels)
            
            loss.backward()

            if max_grad_norm:
                torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
            
            optimizer.step()
            num_iter += 1
            input_embeds.to('cpu')
            labels.to('cpu')
            torch.cuda.empty_cache()

        valid_loss = 0
        valid_acc = 0
        num_iter = 0
        model.eval()

        with torch.no_grad():
            for batch in loaders['test']:
                input_embeds = batch['features'].to(device)
                labels = batch['labels'].to(device)
                pred = model(input_embeds)

                valid_loss += criterion(pred, labels)
                valid_acc += metric(pred, labels)
                num_iter += 1
        
        print(f'Valid Loss: {valid_loss / num_iter}, Accuracy: {valid_acc/num_iter}')

In [None]:
# использование предобученных эмбеддингов
# (перед этим передать в model нужную архитектуру, обучить, вызвать этот блок и еще раз обучить)
with torch.no_grad():
    for word, idx in word2idx.items():
        if word in word2vec:
            model.embedding.weight[idx] = torch.from_numpy(word2vec.get_vector(word))

In [4]:
SEED = 0xDEAD

np.random.seed(SEED)
torch.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)

In [5]:
device = torch.device('cuda:0') if torch.cuda.is_available else torch.device('cpu')

In [7]:
dataset = datasets.load_dataset('ag_news')

Found cached dataset ag_news (C:/Users/Messi/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
tokenizer = nltk.WordPunctTokenizer()
max_length = 128

dataset = dataset.map(lambda x: {
    'tokenized': tokenizer.tokenize(x['text'])[:max_length]
})

Loading cached processed dataset at C:\Users\Messi\.cache\huggingface\datasets\ag_news\default\0.0.0\bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548\cache-869cdecf2899d3ea.arrow


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [9]:
word2vec = api.load('glove-twitter-50')

In [174]:
len(loaders['train'])

3750

In [10]:
word2idx = {word: ind for ind, word in enumerate(word2vec.index_to_key)}

In [11]:
dataset = dataset.map(lambda x:{
    'features': [encode(word) for word in x['tokenized']]
})

Loading cached processed dataset at C:\Users\Messi\.cache\huggingface\datasets\ag_news\default\0.0.0\bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548\cache-b0426b68151d937a.arrow


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [12]:
dataset = dataset.remove_columns(['text', 'tokenized'])

In [13]:
dataset.set_format(type='torch')

In [14]:
loaders = {k: DataLoader(ds, shuffle=(k=='train'), batch_size=32, collate_fn = collate_fn)
for k, ds in dataset.items()}

# Сверточная нейросеть

In [15]:
class CNN_Model(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()

        self.embeddings = nn.Embedding(len(word2idx), embed_size) # инициализация эмбеддингов для всех слов из словаря
        self.cnn = nn.Sequential(
            nn.Conv1d(embed_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten()
        )

        self.cls = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = self.embeddings(x)
        x = x.permute(0, 2, 1)
        x = self.cnn(x)
        pred = self.cls(x)

        return pred

## Инициализация

In [22]:
model_cnn = CNN_Model(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_cnn.parameters(), lr=1e-2)
metric_cnn = Accuracy('multiclass', num_classes=4).to(device)

## Обучение

In [23]:
train_network(model_cnn, criterion, optimizer, metric_cnn, 1, loaders)

  torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
  0%|          | 0/1 [01:26<?, ?it/s]


KeyboardInterrupt: 

# Классическая рекуррентная нейросеть

In [236]:
class RNN_block(nn.Module):
    def __init__(self, embed_size, hidden_size):
        super().__init__()
        
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
        self.W = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.U = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.V = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_x = nn.Parameter(torch.rand(1, hidden_size))
        self.b_h = nn.Parameter(torch.rand(1, hidden_size))
        
    def forward(self, x, hidden=None):
        hidden = torch.zeros((x.size(0), self.hidden_size)).to(x.device) # h(t-1) размер батча x размер скрытого состояния
        seq_len = x.size(1) # длина max предложения
        
        if hidden is None:
            for cur_idx in range(seq_len): # обновляем hidden по каждому номеру слова каждого предл-я в батче
                hidden = torch.tanh(x[:, cur_idx] @ self.W + hidden @ self.U + self.b_h)
#         print(hidden.is_cuda, self.V.is_cuda, self.b_x.is_cuda)
        res = torch.tanh(hidden @ self.V + self.b_x)
        return res 
            
            
         

In [237]:
class RNN_Model(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()
        self.embeddings = nn.Embedding(len(word2idx), embed_size)
        self.rnn = RNN_block(embed_size, hidden_size)
        self.cls = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        x = self.embeddings(x)
        hidden = self.rnn(x)
        output = self.cls(hidden)
        return output
        
        

In [249]:
model_rnn = RNN_Model(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_rnn.parameters(), lr=1e-2)
metric_rnn = Accuracy('multiclass', num_classes=4).to(device)

In [363]:
torch.cuda.empty_cache()

In [364]:
!nvidia-smi

Wed Jun  7 16:59:34 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 960        WDDM | 00000000:01:00.0  On |                  N/A |
| 21%   38C    P5               21W / 150W|   1957MiB /  2048MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [250]:
train_network(model_rnn, criterion, optimizer, metric, 1, loaders)

  torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
100%|██████████| 1/1 [00:29<00:00, 29.09s/it]

Valid Loss: 1.3884234428405762, Accuracy: 0.2501313090324402





# GRU (модификация RNN)

In [27]:
class GRU(nn.Module):
    def __init__(self, embed_size, hidden_size):
        super().__init__()
        
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
        self.w_rh = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_rh = nn.Parameter(torch.rand(1, hidden_size))
        self.w_rx = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.b_rx = nn.Parameter(torch.rand(1, hidden_size))
        
        self.w_zh = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_zh = nn.Parameter(torch.rand(1, hidden_size))
        self.w_zx = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.b_zx = nn.Parameter(torch.rand(1, hidden_size))
        
        self.w_nh = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_nh = nn.Parameter(torch.rand(1, hidden_size))
        self.w_nx = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.b_nx = nn.Parameter(torch.rand(1, hidden_size))
        
    def forward(self, x, hidden=None):
        
        if hidden is None:
            hidden = torch.zeros((x.size(0), self.hidden_size)).to(x.device)
        
        seq_len = x.size(1) 
        for cur_idx in range(seq_len):
            r = torch.sigmoid(x[:, cur_idx] @ self.w_rx + self.b_rx + hidden @ self.w_rh + self.b_rh)
            z = torch.sigmoid(x[:, cur_idx] @ self.w_zx + self.b_zx + hidden @ self.w_zh + self.b_zh)
            n = torch.tanh(x[:, cur_idx] @ self.w_nx + self.b_nx + r * (hidden @ self.w_nh + self.b_nh))
            hidden = (1 - z) * n + z * hidden
        
        return hidden
        

In [28]:
class GRU_Model(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()
        self.embeddings = nn.Embedding(len(word2idx), embed_size)
        self.gru = GRU(embed_size, hidden_size)
        self.cls = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        x = self.embeddings(x)
        hidden = self.gru(x)
        output = self.cls(hidden)
        return output

In [34]:
model_gru = GRU_Model(embed_size=word2vec.vector_size, hidden_size=50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_gru.parameters(), lr=1e-2)
metric = Accuracy('multiclass', num_classes=4).to(device)

In [35]:
train_network(model_gru, criterion, optimizer, metric, 1, loaders)

  torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
  0%|          | 0/1 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 228.00 MiB (GPU 0; 2.00 GiB total capacity; 1.35 GiB already allocated; 0 bytes free; 1.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [32]:
torch.cuda.empty_cache()

In [33]:
!nvidia-smi

Wed Jun  7 17:27:33 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 960        WDDM | 00000000:01:00.0  On |                  N/A |
| 21%   40C    P5               24W / 150W|   1430MiB /  2048MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    