In [1]:
!pip install sentence-transformers
!pip install torchmetrics
import nltk
from sentence_transformers import SentenceTransformer

nltk.download('punkt')
rubert_sentence = SentenceTransformer('all-distilroberta-v1')

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 787 kB/s eta 0:00:01
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 2.8 MB/s eta 0:00:01
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.2 MB/s eta 0:00:01
Collecting scikit-learn
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 7.5 MB/s eta 0:00:01
Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 7.8 MB/s eta 0:00:01
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylin

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
import numpy as np

# Read the augmented targets and chats from the local data directory.
augmented_targets = np.load("data/augmented_targets.npy")
augmented_chats = np.load("data/augmented_chats.npy")

# Displayed as the cell result: how many chats were loaded.
augmented_chats.shape

(2850,)

In [4]:
# Encode every chat that has a matching target. The count is derived from the
# loaded arrays instead of a hard-coded number, so embeddings and targets stay
# aligned if the data files change.
n_samples = min(len(augmented_chats), len(augmented_targets))

chat_embeddings = []
for i in range(n_samples):
  chat_embeddings.append(rubert_sentence.encode(augmented_chats[i]))
  # Lightweight progress indicator: one line every 500 chats.
  if i % 500 == 0:
    print(i)

0
500
1000
1500
2000
2500


In [None]:
import numpy as np

# The first TRAIN_SIZE samples form the train/validation pool; the rest is the test split.
TRAIN_SIZE = 2500

all_chat_embeddings = np.asarray(chat_embeddings)
# Keep only targets that have an embedding. TargetDataset sizes itself from the
# targets, so any surplus target would index past the end of the embeddings.
all_targets = np.asarray(augmented_targets[:len(all_chat_embeddings)])

train_targets = all_targets[:TRAIN_SIZE]
train_chat_embeddings = all_chat_embeddings[:TRAIN_SIZE]

test_targets = all_targets[TRAIN_SIZE:]
test_chat_embeddings = all_chat_embeddings[TRAIN_SIZE:]


In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
from torch.nn.functional import one_hot

class TargetDataset(Dataset):
    """Pairs each chat entry with its target, indexed by position.

    Args:
        target_list: indexable collection of targets; its length defines the
            dataset size.
        chat_list: indexable collection of chat features, read at the same
            index as the target.
    """

    def __init__(self, target_list, chat_list):
        self.chat_list = chat_list
        self.target_list = target_list

    def __len__(self):
        # The dataset size is taken from the targets only.
        return len(self.target_list)

    def __getitem__(self, idx):
        """Return ``(chat, target)`` where the target is a ``torch.long`` tensor."""
        label = torch.tensor(self.target_list[idx], dtype=torch.long)
        # The chat entry is handed back exactly as stored (no conversion here).
        return self.chat_list[idx], label
    
# Wrap the train and test arrays in datasets (targets first, then chat embeddings).
training_dataset = TargetDataset(
    target_list=train_targets,
    chat_list=train_chat_embeddings,
)
test_dataset = TargetDataset(
    target_list=test_targets,
    chat_list=test_chat_embeddings,
)

In [None]:
# Hold out a fixed number of samples for validation; the training size is derived
# from the dataset length so the split does not break if the pool size changes.
VAL_SIZE = 200
SPLIT_SEED = 0  # fixed seed so the train/validation partition is reproducible

n_train = len(training_dataset) - VAL_SIZE
train_set, val_set = torch.utils.data.random_split(
    training_dataset,
    [n_train, VAL_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_dataloader = DataLoader(train_set, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
import torch
from torch import nn
from torch import optim
import numpy as np

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class TargetDecoder(nn.Module):
    """Decode a flat feature vector into a 7-channel 3-D volume.

    A linear layer expands the input to 15680 values, which are reshaped to
    ``(batch, 64, 5, 7, 7)`` and passed through three transposed 3-D
    convolutions. With these kernel/padding settings the result has shape
    ``(batch, 7, 9, 11, 11)``.

    Args:
        features_dim: size of the input feature vector.
    """

    def __init__(self, features_dim=768):
        super().__init__()

        # 15680 == 64 * 5 * 7 * 7, the element count of the volume built in forward().
        projection = nn.Linear(features_dim, 15680)
        self.linear = nn.Sequential(projection)

        upsampling_layers = [
            nn.ConvTranspose3d(64, 64, kernel_size=3),
            nn.ReLU(),
            nn.ConvTranspose3d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose3d(32, 7, kernel_size=3),
        ]
        self.cnn = nn.Sequential(*upsampling_layers)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, features_dim)`` to the raw output volume."""
        batch_size = x.shape[0]
        hidden = self.linear(x)
        volume = hidden.reshape(batch_size, 64, 5, 7, 7)
        return self.cnn(volume)

    

# Loss used by the training and evaluation loops.
loss_function = nn.CrossEntropyLoss()

# Freshly initialised decoder, moved to the selected device.
target_decoder = TargetDecoder()
target_decoder = target_decoder.to(device)

In [None]:
from torchmetrics import Accuracy

accuracy = Accuracy()
EPOCHS = 50
optimizer = optim.Adam(target_decoder.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
  # ---- training pass: one optimizer step per batch ----
  target_decoder.train()
  train_loss = []
  train_acc = []
  for target_tensor_input, target_tensor_target in train_dataloader:
    target_tensor_input = target_tensor_input.float().to(device)
    target_tensor_target = target_tensor_target.to(device)
    optimizer.zero_grad()
    predict = target_decoder(target_tensor_input)
    loss = loss_function(predict, target_tensor_target)
    train_loss.append(loss.item())
    # The metric only needs values, so detach from the autograd graph before moving to CPU.
    train_acc.append(accuracy(predict.detach().to("cpu"), target_tensor_target.to("cpu")))
    loss.backward()
    optimizer.step()
  # Per-epoch figures are the mean of the per-batch values.
  train_loss = np.array(train_loss).mean()
  train_acc = np.array(train_acc).mean()

  # ---- validation pass: no parameter updates ----
  target_decoder.eval()
  val_loss = []
  val_acc = []
  # no_grad skips building autograd graphs, which are never used during validation.
  with torch.no_grad():
    for target_tensor_input, target_tensor_target in val_dataloader:
      target_tensor_input = target_tensor_input.float().to(device)
      target_tensor_target = target_tensor_target.to(device)
      predict = target_decoder(target_tensor_input)
      loss = loss_function(predict, target_tensor_target)
      val_loss.append(loss.item())
      val_acc.append(accuracy(predict.to("cpu"), target_tensor_target.to("cpu")))
  val_loss = np.array(val_loss).mean()
  val_acc = np.array(val_acc).mean()
  print(f"epoch: {epoch} | loss: {train_loss} | val_loss: {val_loss}")
  print(f"train_acc: {train_acc} | val_acc: {val_acc} ")

epoch: 0 | loss: 0.1900106864826133 | val_loss: 0.11249557018280029
train_acc: 0.9666821956634521 | val_acc: 0.977433443069458 
epoch: 1 | loss: 0.09815001247140269 | val_loss: 0.08519115000963211
train_acc: 0.9776672720909119 | val_acc: 0.9776722192764282 
epoch: 2 | loss: 0.07233100663870573 | val_loss: 0.06115288868546486
train_acc: 0.9790992736816406 | val_acc: 0.9807299971580505 
epoch: 3 | loss: 0.05369513946223176 | val_loss: 0.05005870401859283
train_acc: 0.9828003644943237 | val_acc: 0.983144998550415 
epoch: 4 | loss: 0.040204232720296 | val_loss: 0.03629449240863323
train_acc: 0.9870162606239319 | val_acc: 0.9888980984687805 
epoch: 5 | loss: 0.033371007289840944 | val_loss: 0.03311623014509678
train_acc: 0.9899161458015442 | val_acc: 0.9902111887931824 
epoch: 6 | loss: 0.03025070740517953 | val_loss: 0.03062201224267483
train_acc: 0.9914166927337646 | val_acc: 0.9915381669998169 
epoch: 7 | loss: 0.02827864657673571 | val_loss: 0.031184499636292457
train_acc: 0.99231463670

In [None]:
# Evaluate the trained decoder on the test loader; report mean per-batch loss and accuracy.
test_loss = []
test_acc = []
target_decoder.eval()
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
  for target_tensor_input, target_tensor_target in test_dataloader:
    target_tensor_input = target_tensor_input.float().to(device)
    target_tensor_target = target_tensor_target.to(device)
    predict = target_decoder(target_tensor_input)
    loss = loss_function(predict, target_tensor_target)
    test_loss.append(loss.item())
    test_acc.append(accuracy(predict.to("cpu"), target_tensor_target.to("cpu")))
test_loss = np.array(test_loss).mean()
test_acc = np.array(test_acc).mean()
print(f"test_loss: {test_loss} | test_acc: {test_acc} ")

test_loss: 0.038611365068289966 | test_acc: 0.9924993515014648 


In [None]:
# Reference result from an earlier run with distil_roberta embeddings
# (presumably test_loss, test_acc): 0.032, 0.9942

## Production

In [None]:
# Production training uses every sample; the loader shuffles and yields batches of 8.
all_training_dataset = TargetDataset(
    target_list=all_targets,
    chat_list=all_chat_embeddings,
)
all_train_dataloader = DataLoader(
    dataset=all_training_dataset,
    shuffle=True,
    batch_size=8,
)

In [None]:
import torch
from torch import nn
from torch import optim
import numpy as np

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class TargetDecoder(nn.Module):
    """Decode a flat feature vector into a 7-channel 3-D volume.

    A linear layer expands the input to 15680 values, which are reshaped to
    ``(batch, 64, 5, 7, 7)`` and passed through three transposed 3-D
    convolutions. With these kernel/padding settings the result has shape
    ``(batch, 7, 9, 11, 11)``.

    Args:
        features_dim: size of the input feature vector.
    """

    def __init__(self, features_dim=768):
        super().__init__()

        # 15680 == 64 * 5 * 7 * 7, the element count of the volume built in forward().
        projection = nn.Linear(features_dim, 15680)
        self.linear = nn.Sequential(projection)

        upsampling_layers = [
            nn.ConvTranspose3d(64, 64, kernel_size=3),
            nn.ReLU(),
            nn.ConvTranspose3d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose3d(32, 7, kernel_size=3),
        ]
        self.cnn = nn.Sequential(*upsampling_layers)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, features_dim)`` to the raw output volume."""
        batch_size = x.shape[0]
        hidden = self.linear(x)
        volume = hidden.reshape(batch_size, 64, 5, 7, 7)
        return self.cnn(volume)

    

# Loss used by the production training loop.
loss_function = nn.CrossEntropyLoss()

# Start production training from a freshly initialised decoder on the selected device.
target_decoder = TargetDecoder()
target_decoder = target_decoder.to(device)

In [None]:
from torchmetrics import Accuracy

accuracy = Accuracy()
EPOCHS = 100
optimizer = optim.Adam(target_decoder.parameters(), lr=1e-3)

# Train on the full dataset; there is no validation pass in this loop.
for epoch in range(EPOCHS):
  target_decoder.train()
  batch_losses, batch_accs = [], []
  for batch_inputs, batch_targets in all_train_dataloader:
    batch_inputs = batch_inputs.float().to(device)
    batch_targets = batch_targets.to(device)

    # Forward pass, then one optimizer step.
    optimizer.zero_grad()
    predict = target_decoder(batch_inputs)
    loss = loss_function(predict, batch_targets)
    loss.backward()
    optimizer.step()

    # Record this batch's loss and accuracy (the metric is computed on CPU copies).
    batch_losses.append(loss.item())
    batch_accs.append(accuracy(predict.to("cpu"), batch_targets.to("cpu")))

  # Per-epoch figures are the mean of the per-batch values.
  train_loss = np.array(batch_losses).mean()
  train_acc = np.array(batch_accs).mean()

  print(f"epoch: {epoch} | loss: {train_loss} | train_acc: {train_acc}")


epoch: 0 | loss: 0.17136185693108932 | train_acc: 0.9688841104507446
epoch: 1 | loss: 0.08767896863954798 | train_acc: 0.9779171943664551
epoch: 2 | loss: 0.06064164436777205 | train_acc: 0.9808591604232788
epoch: 3 | loss: 0.04285138433420214 | train_acc: 0.9860563278198242
epoch: 4 | loss: 0.03454755353381094 | train_acc: 0.9896852970123291
epoch: 5 | loss: 0.03070264969842994 | train_acc: 0.9915357828140259
epoch: 6 | loss: 0.028426078332315884 | train_acc: 0.9926621317863464
epoch: 7 | loss: 0.027797443752195738 | train_acc: 0.9929534792900085
epoch: 8 | loss: 0.02682961901132135 | train_acc: 0.9932955503463745
epoch: 9 | loss: 0.02652437320475421 | train_acc: 0.9934324026107788
epoch: 10 | loss: 0.026255515598849446 | train_acc: 0.9935398101806641
epoch: 11 | loss: 0.025552482787188795 | train_acc: 0.9937337636947632
epoch: 12 | loss: 0.025555175667719376 | train_acc: 0.9937359094619751
epoch: 13 | loss: 0.025715004994312465 | train_acc: 0.9936652183532715
epoch: 14 | loss: 0.0249

In [None]:
# Persist only the learned parameters (state_dict), not the whole module object.
# NOTE(review): the file name says "roberta_large", but the encoder loaded at the top
# of this notebook is 'all-distilroberta-v1' — confirm the intended name.
decoder_weights = target_decoder.state_dict()
torch.save(decoder_weights, 'roberta_large_target_decoder.pth')